From 6c360dca36e8d6e691527b238d9a22f656ff3ade Mon Sep 17 00:00:00 2001 From: YiyangWu Date: Thu, 19 May 2022 18:37:57 +0800 Subject: [PATCH] gfx1031: initial import These logic files are benchmarked using Tensile-rocm-5.0.1 on my desktop with Radeon RX 6700XT. Bias may exist, but they at least works and provide a decent performance when running rocblas-bench --- .../asm_full/navi22_Cijk_Ailk_Bjlk_HB.yaml | 27119 ++++++ .../asm_full/navi22_Cijk_Ailk_Bjlk_HBH.yaml | 40006 ++++++++ .../navi22_Cijk_Ailk_Bjlk_HBH_GB.yaml | 40898 ++++++++ .../asm_full/navi22_Cijk_Ailk_Bjlk_HB_GB.yaml | 27342 ++++++ .../asm_full/navi22_Cijk_Ailk_Bjlk_SB.yaml | 64545 +++++++++++++ .../asm_full/navi22_Cijk_Ailk_Bjlk_SB_GB.yaml | 69105 ++++++++++++++ .../asm_full/navi22_Cijk_Ailk_Bljk_HB.yaml | 29651 ++++++ .../asm_full/navi22_Cijk_Ailk_Bljk_HBH.yaml | 42985 +++++++++ .../navi22_Cijk_Ailk_Bljk_HBH_GB.yaml | 46330 +++++++++ .../asm_full/navi22_Cijk_Ailk_Bljk_HB_GB.yaml | 24968 +++++ .../asm_full/navi22_Cijk_Ailk_Bljk_SB.yaml | 70006 ++++++++++++++ .../asm_full/navi22_Cijk_Ailk_Bljk_SB_GB.yaml | 73797 ++++++++++++++ .../asm_full/navi22_Cijk_Alik_Bjlk_HB.yaml | 21036 ++++ .../asm_full/navi22_Cijk_Alik_Bjlk_HBH.yaml | 10305 ++ .../navi22_Cijk_Alik_Bjlk_HBH_GB.yaml | 12089 +++ .../asm_full/navi22_Cijk_Alik_Bjlk_HB_GB.yaml | 19698 ++++ .../asm_full/navi22_Cijk_Alik_Bjlk_SB.yaml | 38740 ++++++++ .../asm_full/navi22_Cijk_Alik_Bjlk_SB_GB.yaml | 38963 ++++++++ .../asm_full/navi22_Cijk_Alik_Bljk_HB.yaml | 23271 +++++ .../asm_full/navi22_Cijk_Alik_Bljk_HBH.yaml | 60142 ++++++++++++ .../navi22_Cijk_Alik_Bljk_HBH_GB.yaml | 57466 +++++++++++ .../asm_full/navi22_Cijk_Alik_Bljk_HB_GB.yaml | 25055 +++++ .../asm_full/navi22_Cijk_Alik_Bljk_SB.yaml | 79574 ++++++++++++++++ .../asm_full/navi22_Cijk_Alik_Bljk_SB_GB.yaml | 78982 +++++++++++++++ 24 files changed, 1022073 insertions(+) create mode 100644 library/src/blas3/Tensile/Logic/asm_full/navi22_Cijk_Ailk_Bjlk_HB.yaml create mode 100644 library/src/blas3/Tensile/Logic/asm_full/navi22_Cijk_Ailk_Bjlk_HBH.yaml create mode 100644 library/src/blas3/Tensile/Logic/asm_full/navi22_Cijk_Ailk_Bjlk_HBH_GB.yaml create mode 100644 library/src/blas3/Tensile/Logic/asm_full/navi22_Cijk_Ailk_Bjlk_HB_GB.yaml create mode 100644 library/src/blas3/Tensile/Logic/asm_full/navi22_Cijk_Ailk_Bjlk_SB.yaml create mode 100644 library/src/blas3/Tensile/Logic/asm_full/navi22_Cijk_Ailk_Bjlk_SB_GB.yaml create mode 100644 library/src/blas3/Tensile/Logic/asm_full/navi22_Cijk_Ailk_Bljk_HB.yaml create mode 100644 library/src/blas3/Tensile/Logic/asm_full/navi22_Cijk_Ailk_Bljk_HBH.yaml create mode 100644 library/src/blas3/Tensile/Logic/asm_full/navi22_Cijk_Ailk_Bljk_HBH_GB.yaml create mode 100644 library/src/blas3/Tensile/Logic/asm_full/navi22_Cijk_Ailk_Bljk_HB_GB.yaml create mode 100644 library/src/blas3/Tensile/Logic/asm_full/navi22_Cijk_Ailk_Bljk_SB.yaml create mode 100644 library/src/blas3/Tensile/Logic/asm_full/navi22_Cijk_Ailk_Bljk_SB_GB.yaml create mode 100644 library/src/blas3/Tensile/Logic/asm_full/navi22_Cijk_Alik_Bjlk_HB.yaml create mode 100644 library/src/blas3/Tensile/Logic/asm_full/navi22_Cijk_Alik_Bjlk_HBH.yaml create mode 100644 library/src/blas3/Tensile/Logic/asm_full/navi22_Cijk_Alik_Bjlk_HBH_GB.yaml create mode 100644 library/src/blas3/Tensile/Logic/asm_full/navi22_Cijk_Alik_Bjlk_HB_GB.yaml create mode 100644 library/src/blas3/Tensile/Logic/asm_full/navi22_Cijk_Alik_Bjlk_SB.yaml create mode 100644 library/src/blas3/Tensile/Logic/asm_full/navi22_Cijk_Alik_Bjlk_SB_GB.yaml create mode 100644 library/src/blas3/Tensile/Logic/asm_full/navi22_Cijk_Alik_Bljk_HB.yaml create mode 100644 library/src/blas3/Tensile/Logic/asm_full/navi22_Cijk_Alik_Bljk_HBH.yaml create mode 100644 library/src/blas3/Tensile/Logic/asm_full/navi22_Cijk_Alik_Bljk_HBH_GB.yaml create mode 100644 library/src/blas3/Tensile/Logic/asm_full/navi22_Cijk_Alik_Bljk_HB_GB.yaml create mode 100644 library/src/blas3/Tensile/Logic/asm_full/navi22_Cijk_Alik_Bljk_SB.yaml create mode 100644 library/src/blas3/Tensile/Logic/asm_full/navi22_Cijk_Alik_Bljk_SB_GB.yaml diff --git a/library/src/blas3/Tensile/Logic/asm_full/navi22_Cijk_Ailk_Bjlk_HB.yaml b/library/src/blas3/Tensile/Logic/asm_full/navi22_Cijk_Ailk_Bjlk_HB.yaml new file mode 100644 index 000000000..1f6956328 --- /dev/null +++ b/library/src/blas3/Tensile/Logic/asm_full/navi22_Cijk_Ailk_Bjlk_HB.yaml @@ -0,0 +1,27119 @@ +--- +- {MinimumRequiredVersion: 4.28.0} +- navi22 +- gfx1031 +- [Device 73df] +- AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] +- - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 0 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT128x128x8_SN_SU0_SUM0_TT8_16_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 256 + LSCB: 128 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 256 + MacroTile1: 128 + MacroTileA: 256 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 256 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 1 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT256x128x8_SN_SU0_SUM0_TT16_16_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [16, 16] + ThreadTile0: 16 + ThreadTile1: 16 + ThreadTileA: 16 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 64 + LSPA: 8 + LSPB: 16 + LVCA: 16 + LVCB: 8 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 2 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT128x64x16_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 3 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT128x128x16_SN_SU0_SUM0_TT8_16_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 4 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT128x128x16_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 256 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 256 + MacroTileA: 128 + MacroTileB: 256 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 5 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT128x256x16_SN_SU0_SUM0_TT8_16_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 64 + LSPA: 8 + LSPB: 16 + LVCA: 16 + LVCB: 8 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 6 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT128x64x32_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 7 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT128x128x8_SN_SU32_SUM3_TT8_16_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 64 + LSPA: 8 + LSPB: 16 + LVCA: 16 + LVCB: 8 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 8 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT128x64x16_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 9 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT128x128x16_SN_SU32_SUM3_TT8_16_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 10 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT128x128x16_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 256 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 256 + MacroTileA: 128 + MacroTileB: 256 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 11 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT128x256x16_SN_SU32_SUM3_TT8_16_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 12 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT128x128x8_SN_SU0_SUM0_TT8_16_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 256 + LSCB: 128 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 256 + MacroTile1: 128 + MacroTileA: 256 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 256 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 13 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT256x128x8_SN_SU0_SUM0_TT16_16_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [16, 16] + ThreadTile0: 16 + ThreadTile1: 16 + ThreadTileA: 16 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 64 + LSPA: 8 + LSPB: 16 + LVCA: 16 + LVCB: 8 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 14 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT128x64x16_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 15 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT128x128x16_SN_SU0_SUM0_TT8_16_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 16 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT128x128x16_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 256 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 256 + MacroTileA: 128 + MacroTileB: 256 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 17 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT128x256x16_SN_SU0_SUM0_TT8_16_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 18 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT128x128x32_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 19 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT128x128x8_SN_SU32_SUM3_TT8_16_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 64 + LSPA: 8 + LSPB: 16 + LVCA: 16 + LVCB: 8 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 20 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT128x64x16_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 21 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT128x128x16_SN_SU32_SUM3_TT8_16_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 22 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT128x128x16_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 23 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT128x128x8_SN_SU0_SUM0_TT8_16_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 256 + LSCB: 128 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 256 + MacroTile1: 128 + MacroTileA: 256 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 256 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 24 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT256x128x8_SN_SU0_SUM0_TT16_16_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [16, 16] + ThreadTile0: 16 + ThreadTile1: 16 + ThreadTileA: 16 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 64 + LSPA: 8 + LSPB: 16 + LVCA: 16 + LVCB: 8 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 25 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT128x64x16_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 26 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT128x128x16_SN_SU0_SUM0_TT8_16_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 27 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT128x128x16_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 256 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 256 + MacroTileA: 128 + MacroTileB: 256 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 28 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT128x256x16_SN_SU0_SUM0_TT8_16_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 29 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT128x128x32_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 30 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT128x128x8_SN_SU32_SUM3_TT8_16_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 64 + LSPA: 8 + LSPB: 16 + LVCA: 16 + LVCB: 8 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 31 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT128x64x16_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 32 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT128x128x16_SN_SU32_SUM3_TT8_16_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 33 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT128x128x16_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 256 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 256 + MacroTileA: 128 + MacroTileB: 256 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 34 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT128x256x16_SN_SU32_SUM3_TT8_16_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 35 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT64x64x8_SN_SU0_SUM0_TT8_8_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 36 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT64x64x16_SN_SU0_SUM0_TT8_8_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 64 + LSPA: 8 + LSPB: 16 + LVCA: 16 + LVCB: 8 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 37 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT128x64x16_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 38 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT128x128x16_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 39 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT64x64x32_SN_SU0_SUM0_TT8_8_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 40 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT64x64x8_SN_SU32_SUM3_TT8_8_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 41 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT64x64x16_SN_SU32_SUM3_TT8_8_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 64 + LSPA: 8 + LSPB: 16 + LVCA: 16 + LVCB: 8 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 42 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT128x64x16_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 43 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT128x128x16_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 44 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT64x64x8_SN_SU0_SUM0_TT8_8_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 45 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT64x64x16_SN_SU0_SUM0_TT8_8_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 64 + LSPA: 8 + LSPB: 16 + LVCA: 16 + LVCB: 8 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 46 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT128x64x16_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 47 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT128x128x16_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 48 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT64x64x8_SN_SU32_SUM3_TT8_8_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 49 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT64x64x16_SN_SU32_SUM3_TT8_8_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 64 + LSPA: 8 + LSPB: 16 + LVCA: 16 + LVCB: 8 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 50 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT128x64x16_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 51 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT128x128x16_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 52 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT64x64x8_SN_SU0_SUM0_TT8_8_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 53 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT64x64x16_SN_SU0_SUM0_TT8_8_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 64 + LSPA: 8 + LSPB: 16 + LVCA: 16 + LVCB: 8 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 54 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT128x64x16_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 55 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT64x64x32_SN_SU0_SUM0_TT8_8_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 56 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT64x64x8_SN_SU32_SUM3_TT8_8_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 57 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT64x64x16_SN_SU32_SUM3_TT8_8_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 64 + LSPA: 8 + LSPB: 16 + LVCA: 16 + LVCB: 8 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 58 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT128x64x16_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 59 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT128x128x16_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 60 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT64x32x8_SN_SU0_SUM0_TT4_4_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 8 + LSPB: 16 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 61 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT32x16x16_SN_SU0_SUM0_TT2_2_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 62 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT64x32x16_SN_SU0_SUM0_TT4_4_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 63 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT32x32x16_SN_SU0_SUM0_TT2_2_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 64 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT64x64x16_SN_SU0_SUM0_TT4_4_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 65 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT16x16x32_SN_SU0_SUM0_TT2_2_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 8 + LSPB: 16 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 66 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT32x16x32_SN_SU0_SUM0_TT2_2_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 67 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT64x32x32_SN_SU0_SUM0_TT4_4_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 68 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT32x32x32_SN_SU0_SUM0_TT2_2_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 69 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT64x64x32_SN_SU0_SUM0_TT4_4_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 70 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT64x32x8_SN_SU32_SUM3_TT4_4_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 71 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT64x64x8_SN_SU32_SUM3_TT4_4_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 8 + LSPB: 16 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 72 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT32x16x16_SN_SU32_SUM3_TT2_2_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 73 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT64x32x16_SN_SU32_SUM3_TT4_4_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 74 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT32x32x16_SN_SU32_SUM3_TT2_2_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 75 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT64x64x16_SN_SU32_SUM3_TT4_4_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 76 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT32x32x32_SN_SU32_SUM3_TT4_4_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 77 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT64x32x32_SN_SU32_SUM3_TT4_4_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 78 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT64x32x8_SN_SU0_SUM0_TT4_4_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 8 + LSPB: 16 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 79 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT32x16x16_SN_SU0_SUM0_TT2_2_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 80 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT64x32x16_SN_SU0_SUM0_TT4_4_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 81 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT64x64x16_SN_SU0_SUM0_TT4_4_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 8 + LSPB: 16 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 82 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT32x16x32_SN_SU0_SUM0_TT2_2_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 83 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT64x32x32_SN_SU0_SUM0_TT4_4_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 84 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT32x32x32_SN_SU0_SUM0_TT2_2_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 85 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT64x64x32_SN_SU0_SUM0_TT4_4_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 86 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT64x32x8_SN_SU32_SUM3_TT4_4_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 87 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT32x32x8_SN_SU32_SUM3_TT2_2_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 88 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT64x32x16_SN_SU32_SUM3_TT4_4_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 89 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT32x32x16_SN_SU32_SUM3_TT2_2_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 90 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT64x64x16_SN_SU32_SUM3_TT4_4_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 91 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT16x16x32_SN_SU32_SUM3_TT2_2_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 92 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT32x32x32_SN_SU32_SUM3_TT4_4_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 93 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT32x32x32_SN_SU32_SUM3_TT2_2_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 94 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT64x64x32_SN_SU32_SUM3_TT4_4_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 95 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT32x32x8_SN_SU0_SUM0_TT4_4_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 96 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT64x32x8_SN_SU0_SUM0_TT4_4_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 97 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT64x64x8_SN_SU0_SUM0_TT4_4_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 98 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT32x32x16_SN_SU0_SUM0_TT4_4_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 8 + LSPB: 16 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 99 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT32x16x16_SN_SU0_SUM0_TT2_2_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 100 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT64x32x16_SN_SU0_SUM0_TT4_4_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 101 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT32x32x16_SN_SU0_SUM0_TT2_2_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 102 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT64x64x16_SN_SU0_SUM0_TT4_4_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 103 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT32x32x32_SN_SU0_SUM0_TT4_4_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 8 + LSPB: 16 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 104 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT32x16x32_SN_SU0_SUM0_TT2_2_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 105 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT64x32x32_SN_SU0_SUM0_TT4_4_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 106 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT32x32x32_SN_SU0_SUM0_TT2_2_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 107 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT64x32x8_SN_SU32_SUM3_TT4_4_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 108 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT64x64x8_SN_SU32_SUM3_TT4_4_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 109 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT32x32x16_SN_SU32_SUM3_TT4_4_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 8 + LSPB: 16 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 110 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT32x16x16_SN_SU32_SUM3_TT2_2_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 111 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT64x32x16_SN_SU32_SUM3_TT4_4_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 112 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT32x32x16_SN_SU32_SUM3_TT2_2_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 113 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT32x32x32_SN_SU32_SUM3_TT4_4_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 8 + LSPB: 16 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 114 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT32x16x32_SN_SU32_SUM3_TT2_2_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 +- [2, 3, 0, 1] +- - - [2944, 4288, 1, 1280, 2944, 2944, 2944, 4288] + - [26, 24299.0] + - - [2368, 5888, 1, 256, 2368, 2368, 2368, 5888] + - [0, 23774.0] + - - [1024, 5056, 1, 3328, 1024, 1024, 1024, 5056] + - [28, 24697.0] + - - [5888, 1024, 1, 1280, 5888, 5888, 5888, 1024] + - [3, 24201.0] + - - [5888, 1856, 1, 3328, 5888, 5888, 5888, 1856] + - [15, 24234.0] + - - [5056, 704, 1, 256, 5056, 5056, 5056, 704] + - [8, 21290.0] + - - [5888, 2944, 1, 3328, 5888, 5888, 5888, 2944] + - [15, 25538.0] + - - [1856, 4288, 1, 256, 1856, 1856, 1856, 4288] + - [2, 22301.0] + - - [5056, 5056, 1, 3328, 5056, 5056, 5056, 5056] + - [13, 25026.0] + - - [1408, 5888, 1, 1280, 1408, 1408, 1408, 5888] + - [17, 24455.0] + - - [448, 3584, 1, 3328, 448, 448, 448, 3584] + - [11, 19751.0] + - - [5888, 1408, 1, 1280, 5888, 5888, 5888, 1408] + - [26, 24521.0] + - - [3584, 1856, 1, 3328, 3584, 3584, 3584, 1856] + - [15, 24321.0] + - - [5056, 6784, 1, 1280, 5056, 5056, 5056, 6784] + - [15, 25266.0] + - - [5056, 5056, 1, 1280, 5056, 5056, 5056, 5056] + - [17, 24882.0] + - - [448, 5056, 1, 256, 448, 448, 448, 5056] + - [19, 18269.0] + - - [6784, 448, 1, 256, 6784, 6784, 6784, 448] + - [25, 21757.0] + - - [5888, 704, 1, 1280, 5888, 5888, 5888, 704] + - [25, 23039.0] + - - [3584, 1024, 1, 256, 3584, 3584, 3584, 1024] + - [25, 22003.0] + - - [6784, 4288, 1, 3328, 6784, 6784, 6784, 4288] + - [15, 25054.0] + - - [1856, 2368, 1, 3328, 1856, 1856, 1856, 2368] + - [2, 22668.0] + - - [5888, 2944, 1, 1280, 5888, 5888, 5888, 2944] + - [3, 25471.0] + - - [5888, 1024, 1, 256, 5888, 5888, 5888, 1024] + - [0, 23155.0] + - - [1408, 2944, 1, 256, 1408, 1408, 1408, 2944] + - [12, 22782.0] + - - [6784, 5056, 1, 3328, 6784, 6784, 6784, 5056] + - [17, 25383.0] + - - [5056, 5056, 1, 256, 5056, 5056, 5056, 5056] + - [12, 24021.0] + - - [1024, 3584, 1, 1280, 1024, 1024, 1024, 3584] + - [17, 23100.0] + - - [2368, 2944, 1, 1280, 2368, 2368, 2368, 2944] + - [9, 24080.0] + - - [1408, 4288, 1, 1280, 1408, 1408, 1408, 4288] + - [15, 24045.0] + - - [3584, 4288, 1, 1280, 3584, 3584, 3584, 4288] + - [32, 24813.0] + - - [2368, 704, 1, 1280, 2368, 2368, 2368, 704] + - [2, 21027.0] + - - [5056, 4288, 1, 3328, 5056, 5056, 5056, 4288] + - [24, 24936.0] + - - [3584, 2368, 1, 3328, 3584, 3584, 3584, 2368] + - [26, 24259.0] + - - [6784, 448, 1, 1280, 6784, 6784, 6784, 448] + - [2, 22803.0] + - - [4288, 2944, 1, 256, 4288, 4288, 4288, 2944] + - [0, 23186.0] + - - [5056, 2368, 1, 1280, 5056, 5056, 5056, 2368] + - [15, 24297.0] + - - [448, 3584, 1, 1280, 448, 448, 448, 3584] + - [16, 19371.0] + - - [6784, 5888, 1, 256, 6784, 6784, 6784, 5888] + - [32, 25185.0] + - - [1024, 1408, 1, 256, 1024, 1024, 1024, 1408] + - [20, 20348.0] + - - [2368, 2368, 1, 3328, 2368, 2368, 2368, 2368] + - [2, 22820.0] + - - [5056, 704, 1, 3328, 5056, 5056, 5056, 704] + - [2, 23408.0] + - - [1408, 1856, 1, 256, 1408, 1408, 1408, 1856] + - [2, 20162.0] + - - [5888, 1856, 1, 256, 5888, 5888, 5888, 1856] + - [14, 23498.0] + - - [704, 5888, 1, 256, 704, 704, 704, 5888] + - [12, 20783.0] + - - [4288, 6784, 1, 3328, 4288, 4288, 4288, 6784] + - [26, 25054.0] + - - [3584, 704, 1, 3328, 3584, 3584, 3584, 704] + - [2, 22576.0] + - - [1408, 1408, 1, 256, 1408, 1408, 1408, 1408] + - [2, 18401.0] + - - [448, 4288, 1, 256, 448, 448, 448, 4288] + - [27, 16199.0] + - - [704, 2368, 1, 1280, 704, 704, 704, 2368] + - [16, 19602.0] + - - [1856, 2368, 1, 1280, 1856, 1856, 1856, 2368] + - [25, 22236.0] + - - [1408, 1024, 1, 1280, 1408, 1408, 1408, 1024] + - [2, 21489.0] + - - [6784, 704, 1, 256, 6784, 6784, 6784, 704] + - [25, 21301.0] + - - [1408, 3584, 1, 256, 1408, 1408, 1408, 3584] + - [7, 21844.0] + - - [3584, 4288, 1, 3328, 3584, 3584, 3584, 4288] + - [15, 24967.0] + - - [5888, 1856, 1, 1280, 5888, 5888, 5888, 1856] + - [12, 24037.0] + - - [5056, 1024, 1, 3328, 5056, 5056, 5056, 1024] + - [5, 24811.0] + - - [2368, 3584, 1, 1280, 2368, 2368, 2368, 3584] + - [23, 24137.0] + - - [2944, 3584, 1, 3328, 2944, 2944, 2944, 3584] + - [26, 24892.0] + - - [6784, 2944, 1, 256, 6784, 6784, 6784, 2944] + - [30, 24827.0] + - - [1024, 2368, 1, 256, 1024, 1024, 1024, 2368] + - [2, 21539.0] + - - [4288, 2368, 1, 3328, 4288, 4288, 4288, 2368] + - [3, 23875.0] + - - [1856, 2368, 1, 256, 1856, 1856, 1856, 2368] + - [25, 21382.0] + - - [3584, 6784, 1, 3328, 3584, 3584, 3584, 6784] + - [15, 25396.0] + - - [6784, 1856, 1, 3328, 6784, 6784, 6784, 1856] + - [26, 24470.0] + - - [5056, 4288, 1, 1280, 5056, 5056, 5056, 4288] + - [26, 24776.0] + - - [1408, 5056, 1, 1280, 1408, 1408, 1408, 5056] + - [26, 24562.0] + - - [6784, 5888, 1, 3328, 6784, 6784, 6784, 5888] + - [5, 25724.0] + - - [2368, 5056, 1, 1280, 2368, 2368, 2368, 5056] + - [23, 24245.0] + - - [1024, 5056, 1, 1280, 1024, 1024, 1024, 5056] + - [15, 24253.0] + - - [4288, 1024, 1, 256, 4288, 4288, 4288, 1024] + - [4, 22024.0] + - - [2368, 1408, 1, 256, 2368, 2368, 2368, 1408] + - [27, 21180.0] + - - [5888, 448, 1, 1280, 5888, 5888, 5888, 448] + - [14, 22269.0] + - - [704, 5888, 1, 3328, 704, 704, 704, 5888] + - [5, 22674.0] + - - [1024, 6784, 1, 1280, 1024, 1024, 1024, 6784] + - [15, 24192.0] + - - [6784, 2368, 1, 1280, 6784, 6784, 6784, 2368] + - [23, 24450.0] + - - [3584, 2944, 1, 1280, 3584, 3584, 3584, 2944] + - [3, 24776.0] + - - [2368, 1024, 1, 3328, 2368, 2368, 2368, 1024] + - [5, 22732.0] + - - [1408, 5056, 1, 3328, 1408, 1408, 1408, 5056] + - [28, 24952.0] + - - [1856, 1856, 1, 3328, 1856, 1856, 1856, 1856] + - [2, 22718.0] + - - [2368, 2368, 1, 256, 2368, 2368, 2368, 2368] + - [14, 21024.0] + - - [4288, 4288, 1, 1280, 4288, 4288, 4288, 4288] + - [9, 24619.0] + - - [704, 6784, 1, 3328, 704, 704, 704, 6784] + - [12, 22762.0] + - - [5888, 5888, 1, 3328, 5888, 5888, 5888, 5888] + - [24, 25734.0] + - - [5056, 1024, 1, 1280, 5056, 5056, 5056, 1024] + - [5, 24035.0] + - - [448, 5888, 1, 3328, 448, 448, 448, 5888] + - [2, 20103.0] + - - [5056, 5888, 1, 1280, 5056, 5056, 5056, 5888] + - [17, 25253.0] + - - [448, 6784, 1, 256, 448, 448, 448, 6784] + - [12, 18179.0] + - - [3584, 5888, 1, 256, 3584, 3584, 3584, 5888] + - [26, 24729.0] + - - [2944, 3584, 1, 256, 2944, 2944, 2944, 3584] + - [19, 23951.0] + - - [6784, 2944, 1, 3328, 6784, 6784, 6784, 2944] + - [26, 25590.0] + - - [2944, 5056, 1, 3328, 2944, 2944, 2944, 5056] + - [28, 25228.0] + - - [2048, 7133, 1, 2048, 2048, 2048, 2048, 7133] + - [15, 25212.0] + - - [4288, 5888, 1, 1280, 4288, 4288, 4288, 5888] + - [26, 24922.0] + - - [4288, 4288, 1, 256, 4288, 4288, 4288, 4288] + - [30, 23836.0] + - - [4288, 1856, 1, 1280, 4288, 4288, 4288, 1856] + - [15, 23478.0] + - - [1856, 2944, 1, 3328, 1856, 1856, 1856, 2944] + - [28, 23351.0] + - - [256, 6784, 1, 3328, 256, 256, 256, 6784] + - [2, 22345.0] + - - [5056, 1024, 1, 256, 5056, 5056, 5056, 1024] + - [10, 21728.0] + - - [5056, 1856, 1, 3328, 5056, 5056, 5056, 1856] + - [3, 24220.0] + - - [1856, 1408, 1, 256, 1856, 1856, 1856, 1408] + - [31, 20396.0] + - - [5056, 256, 1, 3328, 5056, 5056, 5056, 256] + - [4, 23345.0] + - - [5056, 3584, 1, 256, 5056, 5056, 5056, 3584] + - [12, 24441.0] + - - [1856, 1024, 1, 1280, 1856, 1856, 1856, 1024] + - [16, 22937.0] + - - [1856, 1856, 1, 1280, 1856, 1856, 1856, 1856] + - [14, 22040.0] + - - [6784, 6784, 1, 1280, 6784, 6784, 6784, 6784] + - [15, 25563.0] + - - [1856, 1024, 1, 3328, 1856, 1856, 1856, 1024] + - [16, 23206.0] + - - [6784, 1024, 1, 256, 6784, 6784, 6784, 1024] + - [21, 22661.0] + - - [5056, 5888, 1, 3328, 5056, 5056, 5056, 5888] + - [13, 25376.0] + - - [1856, 1024, 1, 256, 1856, 1856, 1856, 1024] + - [4, 19384.0] + - - [5056, 1408, 1, 3328, 5056, 5056, 5056, 1408] + - [15, 24893.0] + - - [4288, 1024, 1, 3328, 4288, 4288, 4288, 1024] + - [5, 23986.0] + - - [2944, 1408, 1, 3328, 2944, 2944, 2944, 1408] + - [3, 24296.0] + - - [2944, 4288, 1, 3328, 2944, 2944, 2944, 4288] + - [5, 24593.0] + - - [5056, 2944, 1, 256, 5056, 5056, 5056, 2944] + - [30, 24175.0] + - - [2368, 1856, 1, 256, 2368, 2368, 2368, 1856] + - [2, 21431.0] + - - [1408, 3584, 1, 3328, 1408, 1408, 1408, 3584] + - [5, 24396.0] + - - [2368, 6784, 1, 256, 2368, 2368, 2368, 6784] + - [23, 23758.0] + - - [5056, 1408, 1, 1280, 5056, 5056, 5056, 1408] + - [7, 24584.0] + - - [1408, 5888, 1, 3328, 1408, 1408, 1408, 5888] + - [28, 24814.0] + - - [1856, 5056, 1, 256, 1856, 1856, 1856, 5056] + - [30, 22918.0] + - - [6784, 6784, 1, 256, 6784, 6784, 6784, 6784] + - [23, 25210.0] + - - [2368, 4288, 1, 1280, 2368, 2368, 2368, 4288] + - [0, 23644.0] + - - [3584, 1856, 1, 1280, 3584, 3584, 3584, 1856] + - [12, 24007.0] + - - [5888, 5056, 1, 256, 5888, 5888, 5888, 5056] + - [0, 24562.0] + - - [3584, 448, 1, 256, 3584, 3584, 3584, 448] + - [25, 18942.0] + - - [3072, 7435, 1, 1024, 3072, 3072, 3072, 7435] + - [21, 24971.0] + - - [256, 6784, 1, 256, 256, 256, 256, 6784] + - [14, 20338.0] + - - [1856, 3584, 1, 3328, 1856, 1856, 1856, 3584] + - [21, 24329.0] + - - [5056, 256, 1, 1280, 5056, 5056, 5056, 256] + - [4, 22720.0] + - - [3584, 3584, 1, 256, 3584, 3584, 3584, 3584] + - [23, 24168.0] + - - [6784, 4288, 1, 1280, 6784, 6784, 6784, 4288] + - [26, 24958.0] + - - [704, 5056, 1, 256, 704, 704, 704, 5056] + - [2, 19749.0] + - - [2944, 2368, 1, 1280, 2944, 2944, 2944, 2368] + - [12, 24111.0] + - - [6784, 3584, 1, 256, 6784, 6784, 6784, 3584] + - [30, 24806.0] + - - [704, 6784, 1, 256, 704, 704, 704, 6784] + - [18, 20425.0] + - - [1024, 3584, 1, 3328, 1024, 1024, 1024, 3584] + - [5, 23499.0] + - - [2944, 2944, 1, 3328, 2944, 2944, 2944, 2944] + - [3, 24916.0] + - - [5056, 6784, 1, 256, 5056, 5056, 5056, 6784] + - [0, 24544.0] + - - [1408, 4288, 1, 3328, 1408, 1408, 1408, 4288] + - [19, 24318.0] + - - [6784, 256, 1, 1280, 6784, 6784, 6784, 256] + - [14, 22075.0] + - - [2368, 704, 1, 3328, 2368, 2368, 2368, 704] + - [2, 21166.0] + - - [3584, 6784, 1, 256, 3584, 3584, 3584, 6784] + - [21, 24767.0] + - - [5056, 1856, 1, 256, 5056, 5056, 5056, 1856] + - [8, 23126.0] + - - [704, 4288, 1, 256, 704, 704, 704, 4288] + - [2, 19605.0] + - - [1408, 6784, 1, 1280, 1408, 1408, 1408, 6784] + - [26, 24571.0] + - - [3584, 3584, 1, 1280, 3584, 3584, 3584, 3584] + - [28, 24976.0] + - - [5056, 2368, 1, 3328, 5056, 5056, 5056, 2368] + - [26, 24489.0] + - - [2944, 4288, 1, 256, 2944, 2944, 2944, 4288] + - [14, 23572.0] + - - [1408, 3584, 1, 1280, 1408, 1408, 1408, 3584] + - [28, 24109.0] + - - [1024, 1408, 1, 1280, 1024, 1024, 1024, 1408] + - [2, 21524.0] + - - [2368, 6784, 1, 3328, 2368, 2368, 2368, 6784] + - [3, 24586.0] + - - [5056, 704, 1, 1280, 5056, 5056, 5056, 704] + - [2, 23179.0] + - - [1856, 4288, 1, 3328, 1856, 1856, 1856, 4288] + - [17, 23753.0] + - - [1408, 5888, 1, 256, 1408, 1408, 1408, 5888] + - [12, 23729.0] + - - [4096, 7133, 1, 4096, 4096, 4096, 4096, 7133] + - [13, 25469.0] + - - [3584, 704, 1, 1280, 3584, 3584, 3584, 704] + - [2, 22292.0] + - - [3584, 448, 1, 3328, 3584, 3584, 3584, 448] + - [25, 22053.0] + - - [704, 2368, 1, 3328, 704, 704, 704, 2368] + - [4, 20444.0] + - - [448, 5056, 1, 3328, 448, 448, 448, 5056] + - [5, 21126.0] + - - [4288, 448, 1, 256, 4288, 4288, 4288, 448] + - [14, 17728.0] + - - [448, 5888, 1, 256, 448, 448, 448, 5888] + - [14, 17359.0] + - - [5888, 2368, 1, 256, 5888, 5888, 5888, 2368] + - [31, 23641.0] + - - [6784, 704, 1, 3328, 6784, 6784, 6784, 704] + - [14, 23424.0] + - - [1408, 2944, 1, 3328, 1408, 1408, 1408, 2944] + - [3, 24299.0] + - - [2368, 704, 1, 256, 2368, 2368, 2368, 704] + - [27, 15972.0] + - - [3584, 2368, 1, 256, 3584, 3584, 3584, 2368] + - [2, 22660.0] + - - [5888, 5056, 1, 1280, 5888, 5888, 5888, 5056] + - [13, 25259.0] + - - [3584, 3584, 1, 3328, 3584, 3584, 3584, 3584] + - [17, 25092.0] + - - [5888, 6784, 1, 256, 5888, 5888, 5888, 6784] + - [7, 25206.0] + - - [4288, 2944, 1, 3328, 4288, 4288, 4288, 2944] + - [15, 24559.0] + - - [4288, 704, 1, 1280, 4288, 4288, 4288, 704] + - [2, 22721.0] + - - [256, 5056, 1, 1280, 256, 256, 256, 5056] + - [16, 21400.0] + - - [6784, 5888, 1, 1280, 6784, 6784, 6784, 5888] + - [17, 25647.0] + - - [5888, 4288, 1, 1280, 5888, 5888, 5888, 4288] + - [15, 24904.0] + - - [1408, 1856, 1, 1280, 1408, 1408, 1408, 1856] + - [14, 23310.0] + - - [5888, 448, 1, 3328, 5888, 5888, 5888, 448] + - [2, 22537.0] + - - [704, 5888, 1, 1280, 704, 704, 704, 5888] + - [5, 22194.0] + - - [1024, 6784, 1, 3328, 1024, 1024, 1024, 6784] + - [26, 24414.0] + - - [704, 2944, 1, 1280, 704, 704, 704, 2944] + - [16, 21495.0] + - - [5056, 2944, 1, 3328, 5056, 5056, 5056, 2944] + - [15, 25186.0] + - - [1408, 1408, 1, 3328, 1408, 1408, 1408, 1408] + - [2, 21910.0] + - - [448, 4288, 1, 1280, 448, 448, 448, 4288] + - [16, 19359.0] + - - [3584, 704, 1, 256, 3584, 3584, 3584, 704] + - [2, 19657.0] + - - [3584, 1408, 1, 3328, 3584, 3584, 3584, 1408] + - [9, 23973.0] + - - [2368, 1024, 1, 1280, 2368, 2368, 2368, 1024] + - [4, 21705.0] + - - [1856, 6784, 1, 256, 1856, 1856, 1856, 6784] + - [7, 23497.0] + - - [4288, 448, 1, 3328, 4288, 4288, 4288, 448] + - [2, 22583.0] + - - [4288, 3584, 1, 1280, 4288, 4288, 4288, 3584] + - [28, 24867.0] + - - [1760, 7133, 1, 1760, 1760, 1760, 1760, 7133] + - [17, 24391.0] + - - [5888, 1024, 1, 3328, 5888, 5888, 5888, 1024] + - [9, 24439.0] + - - [704, 6784, 1, 1280, 704, 704, 704, 6784] + - [19, 22087.0] + - - [1024, 2944, 1, 3328, 1024, 1024, 1024, 2944] + - [25, 22910.0] + - - [704, 5056, 1, 1280, 704, 704, 704, 5056] + - [28, 21787.0] + - - [1024, 5888, 1, 1280, 1024, 1024, 1024, 5888] + - [3, 24083.0] + - - [2944, 1856, 1, 256, 2944, 2944, 2944, 1856] + - [25, 21781.0] + - - [3584, 5056, 1, 256, 3584, 3584, 3584, 5056] + - [12, 24219.0] + - - [5888, 5056, 1, 3328, 5888, 5888, 5888, 5056] + - [13, 25393.0] + - - [3584, 6784, 1, 1280, 3584, 3584, 3584, 6784] + - [3, 25297.0] + - - [1856, 5888, 1, 256, 1856, 1856, 1856, 5888] + - [7, 23240.0] + - - [4288, 4288, 1, 3328, 4288, 4288, 4288, 4288] + - [5, 24772.0] + - - [4288, 1408, 1, 1280, 4288, 4288, 4288, 1408] + - [0, 24040.0] + - - [4288, 2368, 1, 256, 4288, 4288, 4288, 2368] + - [20, 22943.0] + - - [2944, 5056, 1, 1280, 2944, 2944, 2944, 5056] + - [15, 25099.0] + - - [6784, 2368, 1, 3328, 6784, 6784, 6784, 2368] + - [3, 24584.0] + - - [4288, 1856, 1, 3328, 4288, 4288, 4288, 1856] + - [15, 23663.0] + - - [1856, 2944, 1, 1280, 1856, 1856, 1856, 2944] + - [23, 22992.0] + - - [2944, 5888, 1, 1280, 2944, 2944, 2944, 5888] + - [26, 25472.0] + - - [3584, 1024, 1, 1280, 3584, 3584, 3584, 1024] + - [28, 23111.0] + - - [1024, 4288, 1, 256, 1024, 1024, 1024, 4288] + - [14, 21419.0] + - - [5888, 3584, 1, 3328, 5888, 5888, 5888, 3584] + - [15, 25402.0] + - - [5056, 3584, 1, 3328, 5056, 5056, 5056, 3584] + - [28, 25290.0] + - - [2368, 1408, 1, 1280, 2368, 2368, 2368, 1408] + - [2, 22802.0] + - - [5056, 2944, 1, 1280, 5056, 5056, 5056, 2944] + - [15, 25055.0] + - - [1024, 6784, 1, 256, 1024, 1024, 1024, 6784] + - [7, 23314.0] + - - [3584, 2944, 1, 256, 3584, 3584, 3584, 2944] + - [19, 23959.0] + - - [3584, 1408, 1, 1280, 3584, 3584, 3584, 1408] + - [3, 23955.0] + - - [5056, 6784, 1, 3328, 5056, 5056, 5056, 6784] + - [13, 25398.0] + - - [3584, 4288, 1, 256, 3584, 3584, 3584, 4288] + - [30, 23632.0] + - - [1856, 6784, 1, 3328, 1856, 1856, 1856, 6784] + - [15, 24476.0] + - - [5056, 1408, 1, 256, 5056, 5056, 5056, 1408] + - [4, 23098.0] + - - [5888, 5888, 1, 256, 5888, 5888, 5888, 5888] + - [7, 25247.0] + - - [4288, 1024, 1, 1280, 4288, 4288, 4288, 1024] + - [5, 23717.0] + - - [448, 6784, 1, 3328, 448, 448, 448, 6784] + - [3, 20736.0] + - - [2944, 1408, 1, 1280, 2944, 2944, 2944, 1408] + - [3, 23808.0] + - - [2944, 1856, 1, 3328, 2944, 2944, 2944, 1856] + - [2, 23649.0] + - - [3584, 5888, 1, 1280, 3584, 3584, 3584, 5888] + - [15, 25338.0] + - - [6784, 1856, 1, 1280, 6784, 6784, 6784, 1856] + - [15, 24355.0] + - - [5888, 256, 1, 3328, 5888, 5888, 5888, 256] + - [4, 22171.0] + - - [1856, 5888, 1, 3328, 1856, 1856, 1856, 5888] + - [15, 24279.0] + - - [3584, 1408, 1, 256, 3584, 3584, 3584, 1408] + - [25, 22897.0] + - - [704, 3584, 1, 3328, 704, 704, 704, 3584] + - [2, 21359.0] + - - [5056, 448, 1, 1280, 5056, 5056, 5056, 448] + - [25, 21935.0] + - - [4288, 704, 1, 256, 4288, 4288, 4288, 704] + - [25, 19866.0] + - - [2944, 1024, 1, 256, 2944, 2944, 2944, 1024] + - [2, 20224.0] + - - [2368, 4288, 1, 3328, 2368, 2368, 2368, 4288] + - [15, 23938.0] + - - [6784, 5056, 1, 256, 6784, 6784, 6784, 5056] + - [0, 24668.0] + - - [3584, 5056, 1, 3328, 3584, 3584, 3584, 5056] + - [13, 25259.0] + - - [4288, 5888, 1, 256, 4288, 4288, 4288, 5888] + - [19, 24284.0] + - - [2944, 6784, 1, 256, 2944, 2944, 2944, 6784] + - [0, 24904.0] + - - [2368, 2368, 1, 1280, 2368, 2368, 2368, 2368] + - [2, 22768.0] + - - [1856, 3584, 1, 1280, 1856, 1856, 1856, 3584] + - [23, 24149.0] + - - [5056, 3584, 1, 1280, 5056, 5056, 5056, 3584] + - [28, 25184.0] + - - [256, 5888, 1, 256, 256, 256, 256, 5888] + - [16, 19829.0] + - - [1856, 1408, 1, 3328, 1856, 1856, 1856, 1408] + - [2, 21976.0] + - - [1024, 4288, 1, 3328, 1024, 1024, 1024, 4288] + - [21, 23656.0] + - - [2944, 2368, 1, 3328, 2944, 2944, 2944, 2368] + - [15, 24396.0] + - - [1024, 1856, 1, 1280, 1024, 1024, 1024, 1856] + - [4, 22430.0] + - - [6784, 1856, 1, 256, 6784, 6784, 6784, 1856] + - [25, 23697.0] + - - [1024, 5888, 1, 256, 1024, 1024, 1024, 5888] + - [7, 23169.0] + - - [1408, 2368, 1, 256, 1408, 1408, 1408, 2368] + - [14, 21908.0] + - - [2944, 704, 1, 3328, 2944, 2944, 2944, 704] + - [2, 22788.0] + - - [2944, 2944, 1, 1280, 2944, 2944, 2944, 2944] + - [26, 24772.0] + - - [6784, 256, 1, 3328, 6784, 6784, 6784, 256] + - [2, 22371.0] + - - [1408, 5056, 1, 256, 1408, 1408, 1408, 5056] + - [23, 22583.0] + - - [5056, 256, 1, 256, 5056, 5056, 5056, 256] + - [4, 17045.0] + - - [1408, 4288, 1, 256, 1408, 1408, 1408, 4288] + - [19, 22361.0] + - - [5888, 2368, 1, 1280, 5888, 5888, 5888, 2368] + - [12, 24542.0] + - - [2368, 5888, 1, 1280, 2368, 2368, 2368, 5888] + - [15, 24620.0] + - - [5888, 256, 1, 1280, 5888, 5888, 5888, 256] + - [16, 21811.0] + - - [2368, 1856, 1, 3328, 2368, 2368, 2368, 1856] + - [14, 22773.0] + - - [2944, 704, 1, 256, 2944, 2944, 2944, 704] + - [25, 19154.0] + - - [2368, 6784, 1, 1280, 2368, 2368, 2368, 6784] + - [3, 24462.0] + - - [1856, 4288, 1, 1280, 1856, 1856, 1856, 4288] + - [23, 23442.0] + - - [704, 3584, 1, 256, 704, 704, 704, 3584] + - [8, 19621.0] + - - [704, 2944, 1, 3328, 704, 704, 704, 2944] + - [4, 21830.0] + - - [1856, 5056, 1, 3328, 1856, 1856, 1856, 5056] + - [11, 24261.0] + - - [3584, 5056, 1, 1280, 3584, 3584, 3584, 5056] + - [15, 25137.0] + - - [2944, 1024, 1, 3328, 2944, 2944, 2944, 1024] + - [2, 23019.0] + - - [1408, 6784, 1, 256, 1408, 1408, 1408, 6784] + - [7, 23838.0] + - - [6784, 1408, 1, 3328, 6784, 6784, 6784, 1408] + - [3, 24709.0] + - - [1024, 2368, 1, 1280, 1024, 1024, 1024, 2368] + - [2, 22903.0] + - - [6784, 2944, 1, 1280, 6784, 6784, 6784, 2944] + - [3, 25491.0] + - - [3584, 448, 1, 1280, 3584, 3584, 3584, 448] + - [6, 21675.0] + - - [2944, 6784, 1, 3328, 2944, 2944, 2944, 6784] + - [3, 25590.0] + - - [448, 5056, 1, 1280, 448, 448, 448, 5056] + - [4, 20346.0] + - - [5888, 704, 1, 256, 5888, 5888, 5888, 704] + - [25, 21122.0] + - - [256, 5888, 1, 3328, 256, 256, 256, 5888] + - [27, 21837.0] + - - [6784, 4288, 1, 256, 6784, 6784, 6784, 4288] + - [12, 24386.0] + - - [5888, 256, 1, 256, 5888, 5888, 5888, 256] + - [16, 19891.0] + - - [6784, 1024, 1, 1280, 6784, 6784, 6784, 1024] + - [5, 24326.0] + - - [2944, 704, 1, 1280, 2944, 2944, 2944, 704] + - [14, 22501.0] + - - [6784, 3584, 1, 1280, 6784, 6784, 6784, 3584] + - [3, 25327.0] + - - [1408, 2944, 1, 1280, 1408, 1408, 1408, 2944] + - [15, 24013.0] + - - [1408, 2368, 1, 3328, 1408, 1408, 1408, 2368] + - [2, 23064.0] + - - [1024, 3584, 1, 256, 1024, 1024, 1024, 3584] + - [2, 20897.0] + - - [2368, 2944, 1, 256, 2368, 2368, 2368, 2944] + - [4, 22154.0] + - - [2944, 5888, 1, 256, 2944, 2944, 2944, 5888] + - [19, 24749.0] + - - [3584, 1856, 1, 256, 3584, 3584, 3584, 1856] + - [31, 23125.0] + - - [704, 4288, 1, 3328, 704, 704, 704, 4288] + - [2, 20918.0] + - - [4288, 2944, 1, 1280, 4288, 4288, 4288, 2944] + - [1, 24446.0] + - - [4288, 5056, 1, 3328, 4288, 4288, 4288, 5056] + - [17, 24942.0] + - - [256, 5056, 1, 3328, 256, 256, 256, 5056] + - [4, 22564.0] + - - [5056, 2368, 1, 256, 5056, 5056, 5056, 2368] + - [12, 23174.0] + - - [4288, 704, 1, 3328, 4288, 4288, 4288, 704] + - [2, 22905.0] + - - [448, 3584, 1, 256, 448, 448, 448, 3584] + - [16, 15665.0] + - - [1024, 1408, 1, 3328, 1024, 1024, 1024, 1408] + - [14, 22481.0] + - - [2560, 7133, 1, 2560, 2560, 2560, 2560, 7133] + - [28, 25388.0] + - - [5888, 3584, 1, 256, 5888, 5888, 5888, 3584] + - [30, 24643.0] + - - [1408, 1856, 1, 3328, 1408, 1408, 1408, 1856] + - [2, 23485.0] + - - [6784, 1408, 1, 1280, 6784, 6784, 6784, 1408] + - [26, 24607.0] + - - [704, 2944, 1, 256, 704, 704, 704, 2944] + - [16, 19507.0] + - - [2944, 5888, 1, 3328, 2944, 2944, 2944, 5888] + - [15, 25551.0] + - - [1408, 6784, 1, 3328, 1408, 1408, 1408, 6784] + - [3, 24755.0] + - - [1408, 1408, 1, 1280, 1408, 1408, 1408, 1408] + - [2, 21644.0] + - - [448, 4288, 1, 3328, 448, 448, 448, 4288] + - [22, 19992.0] + - - [704, 2368, 1, 256, 704, 704, 704, 2368] + - [16, 16351.0] + - - [5888, 2368, 1, 3328, 5888, 5888, 5888, 2368] + - [15, 24697.0] + - - [4288, 5056, 1, 256, 4288, 4288, 4288, 5056] + - [19, 23902.0] + - - [4288, 448, 1, 1280, 4288, 4288, 4288, 448] + - [2, 22144.0] + - - [5888, 704, 1, 3328, 5888, 5888, 5888, 704] + - [8, 23325.0] + - - [4288, 3584, 1, 3328, 4288, 4288, 4288, 3584] + - [34, 25009.0] + - - [6784, 6784, 1, 3328, 6784, 6784, 6784, 6784] + - [15, 25631.0] + - - [704, 5056, 1, 3328, 704, 704, 704, 5056] + - [5, 22428.0] + - - [2368, 2944, 1, 3328, 2368, 2368, 2368, 2944] + - [26, 24394.0] + - - [2368, 3584, 1, 256, 2368, 2368, 2368, 3584] + - [0, 23242.0] + - - [3584, 2368, 1, 1280, 3584, 3584, 3584, 2368] + - [26, 24121.0] + - - [1856, 1856, 1, 256, 1856, 1856, 1856, 1856] + - [2, 20957.0] + - - [4288, 1408, 1, 3328, 4288, 4288, 4288, 1408] + - [26, 24398.0] + - - [4288, 5056, 1, 1280, 4288, 4288, 4288, 5056] + - [28, 24784.0] + - - [5888, 6784, 1, 1280, 5888, 5888, 5888, 6784] + - [1, 25725.0] + - - [5888, 1408, 1, 3328, 5888, 5888, 5888, 1408] + - [3, 24682.0] + - - [256, 5056, 1, 256, 256, 256, 256, 5056] + - [14, 19770.0] + - - [1408, 1024, 1, 256, 1408, 1408, 1408, 1024] + - [25, 17952.0] + - - [2368, 5056, 1, 256, 2368, 2368, 2368, 5056] + - [26, 22500.0] + - - [1024, 5056, 1, 256, 1024, 1024, 1024, 5056] + - [14, 21828.0] + - - [2368, 1408, 1, 3328, 2368, 2368, 2368, 1408] + - [14, 22793.0] + - - [5888, 448, 1, 256, 5888, 5888, 5888, 448] + - [2, 19438.0] + - - [6784, 5056, 1, 1280, 6784, 6784, 6784, 5056] + - [17, 25255.0] + - - [4288, 6784, 1, 1280, 4288, 4288, 4288, 6784] + - [3, 24913.0] + - - [6784, 1408, 1, 256, 6784, 6784, 6784, 1408] + - [30, 23612.0] + - - [5888, 4288, 1, 256, 5888, 5888, 5888, 4288] + - [30, 24315.0] + - - [5056, 5888, 1, 256, 5056, 5056, 5056, 5888] + - [12, 24758.0] + - - [2368, 1024, 1, 256, 2368, 2368, 2368, 1024] + - [2, 20651.0] + - - [1856, 6784, 1, 1280, 1856, 1856, 1856, 6784] + - [3, 24369.0] + - - [4288, 3584, 1, 256, 4288, 4288, 4288, 3584] + - [23, 24086.0] + - - [5056, 1856, 1, 1280, 5056, 5056, 5056, 1856] + - [12, 23956.0] + - - [1408, 1024, 1, 3328, 1408, 1408, 1408, 1024] + - [2, 22491.0] + - - [5888, 3584, 1, 1280, 5888, 5888, 5888, 3584] + - [15, 25354.0] + - - [1024, 2944, 1, 256, 1024, 1024, 1024, 2944] + - [14, 21237.0] + - - [448, 6784, 1, 1280, 448, 448, 448, 6784] + - [23, 20499.0] + - - [3584, 1024, 1, 3328, 3584, 3584, 3584, 1024] + - [5, 23515.0] + - - [2944, 1856, 1, 1280, 2944, 2944, 2944, 1856] + - [2, 23451.0] + - - [2368, 3584, 1, 3328, 2368, 2368, 2368, 3584] + - [15, 24335.0] + - - [3584, 5888, 1, 3328, 3584, 3584, 3584, 5888] + - [3, 25402.0] + - - [2944, 3584, 1, 1280, 2944, 2944, 2944, 3584] + - [23, 24654.0] + - - [1856, 5888, 1, 1280, 1856, 1856, 1856, 5888] + - [3, 24129.0] + - - [5056, 448, 1, 3328, 5056, 5056, 5056, 448] + - [2, 23098.0] + - - [4288, 1408, 1, 256, 4288, 4288, 4288, 1408] + - [29, 22074.0] + - - [4288, 2368, 1, 1280, 4288, 4288, 4288, 2368] + - [14, 23769.0] + - - [2944, 5056, 1, 256, 2944, 2944, 2944, 5056] + - [23, 24243.0] + - - [6784, 2368, 1, 256, 6784, 6784, 6784, 2368] + - [25, 23794.0] + - - [4288, 1856, 1, 256, 4288, 4288, 4288, 1856] + - [2, 22754.0] + - - [1856, 2944, 1, 256, 1856, 1856, 1856, 2944] + - [7, 22196.0] + - - [1856, 1408, 1, 1280, 1856, 1856, 1856, 1408] + - [2, 21857.0] + - - [1024, 4288, 1, 1280, 1024, 1024, 1024, 4288] + - [25, 23209.0] + - - [2368, 5056, 1, 3328, 2368, 2368, 2368, 5056] + - [17, 24527.0] + - - [1024, 1856, 1, 3328, 1024, 1024, 1024, 1856] + - [4, 23066.0] + - - [704, 3584, 1, 1280, 704, 704, 704, 3584] + - [14, 20713.0] + - - [4288, 6784, 1, 256, 4288, 4288, 4288, 6784] + - [12, 24372.0] + - - [3584, 2944, 1, 3328, 3584, 3584, 3584, 2944] + - [21, 24867.0] + - - [5888, 2944, 1, 256, 5888, 5888, 5888, 2944] + - [23, 24799.0] + - - [5056, 4288, 1, 256, 5056, 5056, 5056, 4288] + - [12, 24127.0] + - - [6784, 1024, 1, 3328, 6784, 6784, 6784, 1024] + - [17, 24545.0] + - - [5888, 5888, 1, 1280, 5888, 5888, 5888, 5888] + - [13, 25659.0] + - - [448, 5888, 1, 1280, 448, 448, 448, 5888] + - [2, 19667.0] + - - [2944, 1408, 1, 256, 2944, 2944, 2944, 1408] + - [26, 22256.0] + - - [1024, 2944, 1, 1280, 1024, 1024, 1024, 2944] + - [2, 22313.0] + - - [2368, 5888, 1, 3328, 2368, 2368, 2368, 5888] + - [5, 24761.0] + - - [2368, 1856, 1, 1280, 2368, 2368, 2368, 1856] + - [25, 22802.0] + - - [5888, 4288, 1, 3328, 5888, 5888, 5888, 4288] + - [3, 25028.0] + - - [6784, 704, 1, 1280, 6784, 6784, 6784, 704] + - [2, 23069.0] + - - [5056, 448, 1, 256, 5056, 5056, 5056, 448] + - [2, 19446.0] + - - [1856, 5056, 1, 1280, 1856, 1856, 1856, 5056] + - [23, 24043.0] + - - [2944, 1024, 1, 1280, 2944, 2944, 2944, 1024] + - [14, 22787.0] + - - [2368, 4288, 1, 256, 2368, 2368, 2368, 4288] + - [20, 22822.0] + - - [1024, 2368, 1, 3328, 1024, 1024, 1024, 2368] + - [2, 23264.0] + - - [4288, 5888, 1, 3328, 4288, 4288, 4288, 5888] + - [15, 25012.0] + - - [2944, 6784, 1, 1280, 2944, 2944, 2944, 6784] + - [21, 25440.0] + - - [256, 6784, 1, 1280, 256, 256, 256, 6784] + - [2, 22071.0] + - - [1856, 3584, 1, 256, 1856, 1856, 1856, 3584] + - [33, 21350.0] + - - [256, 5888, 1, 1280, 256, 256, 256, 5888] + - [10, 21188.0] + - - [7680, 5481, 1, 2560, 7680, 7680, 7680, 5481] + - [15, 25555.0] + - - [2944, 2368, 1, 256, 2944, 2944, 2944, 2368] + - [2, 22648.0] + - - [1024, 1856, 1, 256, 1024, 1024, 1024, 1856] + - [14, 20721.0] + - - [6784, 3584, 1, 3328, 6784, 6784, 6784, 3584] + - [26, 25399.0] + - - [1024, 5888, 1, 3328, 1024, 1024, 1024, 5888] + - [3, 24398.0] + - - [1408, 2368, 1, 1280, 1408, 1408, 1408, 2368] + - [14, 22913.0] + - - [2944, 2944, 1, 256, 2944, 2944, 2944, 2944] + - [19, 23858.0] + - - [6784, 256, 1, 256, 6784, 6784, 6784, 256] + - [14, 20376.0] + - - [5888, 1408, 1, 256, 5888, 5888, 5888, 1408] + - [23, 23671.0] + - - [5888, 6784, 1, 3328, 5888, 5888, 5888, 6784] + - [24, 25777.0] + - - [704, 4288, 1, 1280, 704, 704, 704, 4288] + - [14, 20501.0] + - - [6784, 448, 1, 3328, 6784, 6784, 6784, 448] + - [2, 23172.0] + - - [3136, 256, 64, 64, 3136, 3136, 3136, 256] + - [29, 15680.0] + - - [784, 512, 64, 128, 784, 784, 784, 512] + - [7, 21014.0] + - - [784, 128, 64, 512, 784, 784, 784, 128] + - [7, 20833.0] + - - [196, 256, 128, 1024, 196, 196, 196, 256] + - [7, 18545.0] + - - [196, 256, 64, 1024, 196, 196, 196, 256] + - [27, 18005.0] + - - [196, 1024, 128, 256, 196, 196, 196, 1024] + - [23, 18126.0] + - - [784, 128, 256, 512, 784, 784, 784, 128] + - [0, 21978.0] + - - [3136, 256, 256, 64, 3136, 3136, 3136, 256] + - [0, 14123.0] + - - [784, 128, 128, 512, 784, 784, 784, 128] + - [23, 21610.0] + - - [784, 512, 128, 128, 784, 784, 784, 512] + - [23, 21311.0] + - - [784, 512, 256, 128, 784, 784, 784, 512] + - [23, 21610.0] + - - [196, 1024, 64, 256, 196, 196, 196, 1024] + - [16, 17733.0] + - - [196, 1024, 256, 256, 196, 196, 196, 1024] + - [0, 18463.0] + - - [196, 256, 256, 1024, 196, 196, 196, 256] + - [0, 18961.0] + - - [3136, 256, 128, 64, 3136, 3136, 3136, 256] + - [0, 19025.0] + - - [1024, 1024, 1, 3328, 1024, 1024, 1024, 1024] + - [37, 20343.0] + - - [64, 6784, 1, 256, 64, 64, 64, 6784] + - [52, 12029.0] + - - [128, 6784, 1, 3328, 128, 128, 128, 6784] + - [38, 20536.0] + - - [256, 4288, 1, 3328, 256, 256, 256, 4288] + - [36, 22058.0] + - - [704, 1856, 1, 3328, 704, 704, 704, 1856] + - [44, 21429.0] + - - [448, 1024, 1, 1280, 448, 448, 448, 1024] + - [52, 16275.0] + - - [2368, 128, 1, 256, 2368, 2368, 2368, 128] + - [40, 7983.0] + - - [256, 1856, 1, 1280, 256, 256, 256, 1856] + - [46, 16663.0] + - - [448, 704, 1, 1280, 448, 448, 448, 704] + - [57, 14408.0] + - - [128, 3584, 1, 1280, 128, 128, 128, 3584] + - [46, 16177.0] + - - [4288, 256, 1, 256, 4288, 4288, 4288, 256] + - [35, 15647.0] + - - [5888, 64, 1, 3328, 5888, 5888, 5888, 64] + - [46, 15399.0] + - - [2944, 256, 1, 3328, 2944, 2944, 2944, 256] + - [46, 20066.0] + - - [256, 4288, 1, 1280, 256, 256, 256, 4288] + - [46, 21028.0] + - - [1408, 448, 1, 1280, 1408, 1408, 1408, 448] + - [37, 19233.0] + - - [6784, 128, 1, 1280, 6784, 6784, 6784, 128] + - [38, 18864.0] + - - [2368, 128, 1, 3328, 2368, 2368, 2368, 128] + - [53, 15961.0] + - - [2944, 128, 1, 256, 2944, 2944, 2944, 128] + - [35, 8851.0] + - - [448, 1408, 1, 256, 448, 448, 448, 1408] + - [40, 12403.0] + - - [64, 5056, 1, 3328, 64, 64, 64, 5056] + - [39, 16900.0] + - - [2368, 256, 1, 1280, 2368, 2368, 2368, 256] + - [53, 17106.0] + - - [256, 3584, 1, 3328, 256, 256, 256, 3584] + - [38, 21149.0] + - - [5056, 64, 1, 1280, 5056, 5056, 5056, 64] + - [53, 14636.0] + - - [1024, 704, 1, 256, 1024, 1024, 1024, 704] + - [52, 12888.0] + - - [4288, 128, 1, 1280, 4288, 4288, 4288, 128] + - [36, 15795.0] + - - [5888, 64, 1, 256, 5888, 5888, 5888, 64] + - [40, 8507.0] + - - [1856, 256, 1, 1280, 1856, 1856, 1856, 256] + - [37, 16866.0] + - - [64, 5888, 1, 3328, 64, 64, 64, 5888] + - [53, 14937.0] + - - [256, 1408, 1, 3328, 256, 256, 256, 1408] + - [36, 14298.0] + - - [6784, 128, 1, 3328, 6784, 6784, 6784, 128] + - [43, 20019.0] + - - [704, 704, 1, 3328, 704, 704, 704, 704] + - [38, 15730.0] + - - [3584, 256, 1, 3328, 3584, 3584, 3584, 256] + - [47, 21102.0] + - - [128, 3584, 1, 3328, 128, 128, 128, 3584] + - [46, 18515.0] + - - [128, 2944, 1, 1280, 128, 128, 128, 2944] + - [41, 14245.0] + - - [448, 1856, 1, 1280, 448, 448, 448, 1856] + - [52, 18401.0] + - - [3584, 128, 1, 256, 3584, 3584, 3584, 128] + - [52, 12441.0] + - - [448, 1408, 1, 3328, 448, 448, 448, 1408] + - [36, 19656.0] + - - [256, 3584, 1, 256, 256, 256, 256, 3584] + - [40, 16494.0] + - - [256, 2944, 1, 3328, 256, 256, 256, 2944] + - [37, 19932.0] + - - [1408, 704, 1, 256, 1408, 1408, 1408, 704] + - [58, 15939.0] + - - [448, 2944, 1, 3328, 448, 448, 448, 2944] + - [45, 20732.0] + - - [64, 5888, 1, 256, 64, 64, 64, 5888] + - [35, 8245.0] + - - [448, 2368, 1, 1280, 448, 448, 448, 2368] + - [53, 19460.0] + - - [128, 4288, 1, 3328, 128, 128, 128, 4288] + - [46, 17707.0] + - - [256, 2368, 1, 256, 256, 256, 256, 2368] + - [40, 11956.0] + - - [1024, 448, 1, 3328, 1024, 1024, 1024, 448] + - [45, 18034.0] + - - [1856, 704, 1, 1280, 1856, 1856, 1856, 704] + - [36, 21112.0] + - - [1024, 1024, 1, 1280, 1024, 1024, 1024, 1024] + - [37, 19362.0] + - - [256, 2944, 1, 256, 256, 256, 256, 2944] + - [35, 13125.0] + - - [128, 6784, 1, 1280, 128, 128, 128, 6784] + - [51, 19445.0] + - - [1408, 704, 1, 3328, 1408, 1408, 1408, 704] + - [53, 19599.0] + - - [128, 5888, 1, 1280, 128, 128, 128, 5888] + - [54, 18250.0] + - - [704, 1408, 1, 3328, 704, 704, 704, 1408] + - [53, 19603.0] + - - [6784, 128, 1, 256, 6784, 6784, 6784, 128] + - [48, 13655.0] + - - [704, 448, 1, 256, 704, 704, 704, 448] + - [40, 8139.0] + - - [256, 1856, 1, 3328, 256, 256, 256, 1856] + - [37, 18629.0] + - - [128, 4288, 1, 256, 128, 128, 128, 4288] + - [58, 12197.0] + - - [64, 6784, 1, 3328, 64, 64, 64, 6784] + - [49, 16959.0] + - - [2944, 256, 1, 1280, 2944, 2944, 2944, 256] + - [50, 18886.0] + - - [1856, 704, 1, 256, 1856, 1856, 1856, 704] + - [35, 17101.0] + - - [1408, 448, 1, 3328, 1408, 1408, 1408, 448] + - [54, 19748.0] + - - [2368, 256, 1, 256, 2368, 2368, 2368, 256] + - [35, 12086.0] + - - [704, 1856, 1, 256, 704, 704, 704, 1856] + - [40, 19158.0] + - - [5888, 64, 1, 1280, 5888, 5888, 5888, 64] + - [37, 13398.0] + - - [256, 2368, 1, 1280, 256, 256, 256, 2368] + - [37, 17445.0] + - - [2944, 448, 1, 256, 2944, 2944, 2944, 448] + - [40, 16551.0] + - - [2368, 128, 1, 1280, 2368, 2368, 2368, 128] + - [36, 13777.0] + - - [64, 5056, 1, 1280, 64, 64, 64, 5056] + - [45, 14761.0] + - - [704, 448, 1, 3328, 704, 704, 704, 448] + - [36, 16592.0] + - - [5056, 64, 1, 3328, 5056, 5056, 5056, 64] + - [55, 16959.0] + - - [2368, 448, 1, 1280, 2368, 2368, 2368, 448] + - [35, 19527.0] + - - [128, 3584, 1, 256, 128, 128, 128, 3584] + - [40, 10411.0] + - - [1856, 448, 1, 3328, 1856, 1856, 1856, 448] + - [45, 18771.0] + - - [128, 5056, 1, 256, 128, 128, 128, 5056] + - [44, 12438.0] + - - [4288, 256, 1, 1280, 4288, 4288, 4288, 256] + - [37, 19718.0] + - - [704, 704, 1, 256, 704, 704, 704, 704] + - [35, 10022.0] + - - [4288, 128, 1, 3328, 4288, 4288, 4288, 128] + - [47, 17420.0] + - - [256, 1408, 1, 1280, 256, 256, 256, 1408] + - [52, 12960.0] + - - [6784, 64, 1, 3328, 6784, 6784, 6784, 64] + - [54, 17055.0] + - - [128, 2944, 1, 3328, 128, 128, 128, 2944] + - [37, 14951.0] + - - [2944, 448, 1, 3328, 2944, 2944, 2944, 448] + - [45, 20759.0] + - - [2368, 448, 1, 3328, 2368, 2368, 2368, 448] + - [54, 20193.0] + - - [5056, 64, 1, 256, 5056, 5056, 5056, 64] + - [35, 7756.0] + - - [128, 5056, 1, 3328, 128, 128, 128, 5056] + - [47, 20346.0] + - - [6784, 64, 1, 256, 6784, 6784, 6784, 64] + - [52, 9682.0] + - - [128, 2368, 1, 256, 128, 128, 128, 2368] + - [56, 7653.0] + - - [3584, 256, 1, 256, 3584, 3584, 3584, 256] + - [44, 16266.0] + - - [128, 2944, 1, 256, 128, 128, 128, 2944] + - [48, 10578.0] + - - [3584, 128, 1, 3328, 3584, 3584, 3584, 128] + - [45, 18355.0] + - - [1024, 448, 1, 1280, 1024, 1024, 1024, 448] + - [45, 17160.0] + - - [5888, 128, 1, 3328, 5888, 5888, 5888, 128] + - [46, 19568.0] + - - [1408, 704, 1, 1280, 1408, 1408, 1408, 704] + - [53, 19282.0] + - - [448, 1408, 1, 1280, 448, 448, 448, 1408] + - [52, 17974.0] + - - [704, 1408, 1, 1280, 704, 704, 704, 1408] + - [37, 18626.0] + - - [448, 2944, 1, 256, 448, 448, 448, 2944] + - [35, 16665.0] + - - [448, 2368, 1, 256, 448, 448, 448, 2368] + - [52, 15021.0] + - - [64, 5056, 1, 256, 64, 64, 64, 5056] + - [48, 7642.0] + - - [5056, 128, 1, 3328, 5056, 5056, 5056, 128] + - [59, 20654.0] + - - [448, 704, 1, 256, 448, 448, 448, 704] + - [56, 8446.0] + - - [1856, 256, 1, 3328, 1856, 1856, 1856, 256] + - [49, 18829.0] + - - [2944, 128, 1, 3328, 2944, 2944, 2944, 128] + - [45, 14923.0] + - - [64, 6784, 1, 1280, 64, 64, 64, 6784] + - [53, 15004.0] + - - [704, 1024, 1, 1280, 704, 704, 704, 1024] + - [37, 17280.0] + - - [256, 4288, 1, 256, 256, 256, 256, 4288] + - [42, 15323.0] + - - [256, 2368, 1, 3328, 256, 256, 256, 2368] + - [47, 19591.0] + - - [1408, 256, 1, 1280, 1408, 1408, 1408, 256] + - [35, 12924.0] + - - [704, 448, 1, 1280, 704, 704, 704, 448] + - [36, 14245.0] + - - [1024, 704, 1, 1280, 1024, 1024, 1024, 704] + - [42, 18193.0] + - - [256, 1856, 1, 256, 256, 256, 256, 1856] + - [46, 12412.0] + - - [704, 1856, 1, 1280, 704, 704, 704, 1856] + - [35, 21316.0] + - - [1408, 256, 1, 3328, 1408, 1408, 1408, 256] + - [36, 14287.0] + - - [5888, 128, 1, 256, 5888, 5888, 5888, 128] + - [40, 13019.0] + - - [2368, 448, 1, 256, 2368, 2368, 2368, 448] + - [45, 16910.0] + - - [4288, 256, 1, 3328, 4288, 4288, 4288, 256] + - [46, 20838.0] + - - [2944, 256, 1, 256, 2944, 2944, 2944, 256] + - [35, 13343.0] + - - [1408, 448, 1, 256, 1408, 1408, 1408, 448] + - [44, 12460.0] + - - [6784, 64, 1, 1280, 6784, 6784, 6784, 64] + - [42, 15168.0] + - - [448, 1024, 1, 3328, 448, 448, 448, 1024] + - [45, 17852.0] + - - [2944, 448, 1, 1280, 2944, 2944, 2944, 448] + - [52, 19922.0] + - - [5056, 128, 1, 256, 5056, 5056, 5056, 128] + - [35, 12532.0] + - - [448, 1024, 1, 256, 448, 448, 448, 1024] + - [56, 10411.0] + - - [128, 5056, 1, 1280, 128, 128, 128, 5056] + - [37, 18482.0] + - - [1408, 256, 1, 256, 1408, 1408, 1408, 256] + - [35, 8328.0] + - - [128, 5888, 1, 3328, 128, 128, 128, 5888] + - [46, 19546.0] + - - [3584, 128, 1, 1280, 3584, 3584, 3584, 128] + - [36, 16062.0] + - - [4288, 128, 1, 256, 4288, 4288, 4288, 128] + - [35, 10808.0] + - - [2368, 256, 1, 3328, 2368, 2368, 2368, 256] + - [47, 19119.0] + - - [5888, 128, 1, 1280, 5888, 5888, 5888, 128] + - [37, 18120.0] + - - [256, 3584, 1, 1280, 256, 256, 256, 3584] + - [47, 20691.0] + - - [128, 5888, 1, 256, 128, 128, 128, 5888] + - [35, 12966.0] + - - [1024, 1024, 1, 256, 1024, 1024, 1024, 1024] + - [35, 14864.0] + - - [1024, 1024, 1, 1024, 1024, 1024, 1024, 1024] + - [44, 18944.0] + - - [64, 5888, 1, 1280, 64, 64, 64, 5888] + - [36, 13339.0] + - - [704, 1024, 1, 256, 704, 704, 704, 1024] + - [40, 12780.0] + - - [704, 704, 1, 1280, 704, 704, 704, 704] + - [35, 14530.0] + - - [128, 2368, 1, 1280, 128, 128, 128, 2368] + - [45, 14968.0] + - - [3584, 256, 1, 1280, 3584, 3584, 3584, 256] + - [43, 20117.0] + - - [5056, 128, 1, 1280, 5056, 5056, 5056, 128] + - [37, 18982.0] + - - [448, 1856, 1, 3328, 448, 448, 448, 1856] + - [38, 18751.0] + - - [1024, 448, 1, 256, 1024, 1024, 1024, 448] + - [44, 10248.0] + - - [2944, 128, 1, 1280, 2944, 2944, 2944, 128] + - [35, 13406.0] + - - [256, 2944, 1, 1280, 256, 256, 256, 2944] + - [46, 19103.0] + - - [704, 1024, 1, 3328, 704, 704, 704, 1024] + - [54, 18682.0] + - - [1856, 448, 1, 1280, 1856, 1856, 1856, 448] + - [49, 17609.0] + - - [128, 6784, 1, 256, 128, 128, 128, 6784] + - [44, 13440.0] + - - [704, 1408, 1, 256, 704, 704, 704, 1408] + - [44, 14434.0] + - - [256, 1408, 1, 256, 256, 256, 256, 1408] + - [35, 8052.0] + - - [448, 2944, 1, 1280, 448, 448, 448, 2944] + - [53, 20399.0] + - - [1856, 256, 1, 256, 1856, 1856, 1856, 256] + - [35, 10764.0] + - - [128, 2368, 1, 3328, 128, 128, 128, 2368] + - [54, 15981.0] + - - [448, 2368, 1, 3328, 448, 448, 448, 2368] + - [36, 19781.0] + - - [1856, 448, 1, 256, 1856, 1856, 1856, 448] + - [44, 13593.0] + - - [1024, 704, 1, 3328, 1024, 1024, 1024, 704] + - [46, 18823.0] + - - [128, 4288, 1, 1280, 128, 128, 128, 4288] + - [54, 15859.0] + - - [448, 704, 1, 3328, 448, 448, 448, 704] + - [36, 16483.0] + - - [448, 1856, 1, 256, 448, 448, 448, 1856] + - [52, 13524.0] + - - [1856, 704, 1, 3328, 1856, 1856, 1856, 704] + - [45, 21358.0] + - - [3136, 64, 128, 64, 3136, 3136, 3136, 64] + - [48, 17177.0] + - - [3136, 64, 64, 256, 3136, 3136, 3136, 64] + - [52, 19660.0] + - - [3136, 64, 256, 256, 3136, 3136, 3136, 64] + - [55, 16714.0] + - - [3136, 64, 128, 256, 3136, 3136, 3136, 64] + - [35, 17533.0] + - - [3136, 64, 64, 64, 3136, 3136, 3136, 64] + - [56, 20532.0] + - - [3136, 64, 256, 64, 3136, 3136, 3136, 64] + - [35, 14260.0] + - - [2368, 64, 1, 3328, 2368, 2368, 2368, 64] + - [64, 11354.0] + - - [1408, 64, 1, 1280, 1408, 1408, 1408, 64] + - [62, 6084.0] + - - [2944, 64, 1, 256, 2944, 2944, 2944, 64] + - [62, 5715.0] + - - [1024, 256, 1, 3328, 1024, 1024, 1024, 256] + - [80, 10996.0] + - - [1856, 64, 1, 1280, 1856, 1856, 1856, 64] + - [109, 8881.0] + - - [704, 128, 1, 1280, 704, 704, 704, 128] + - [73, 6135.0] + - - [4288, 64, 1, 3328, 4288, 4288, 4288, 64] + - [67, 11697.0] + - - [1856, 128, 1, 256, 1856, 1856, 1856, 128] + - [96, 7105.0] + - - [2944, 64, 1, 1280, 2944, 2944, 2944, 64] + - [77, 9836.0] + - - [64, 3584, 1, 3328, 64, 64, 64, 3584] + - [81, 11503.0] + - - [1024, 256, 1, 256, 1024, 1024, 1024, 256] + - [86, 6876.0] + - - [448, 448, 1, 256, 448, 448, 448, 448] + - [107, 6220.0] + - - [128, 1024, 1, 3328, 128, 128, 128, 1024] + - [81, 9341.0] + - - [64, 1856, 1, 1280, 64, 64, 64, 1856] + - [113, 8263.0] + - - [1024, 128, 1, 1280, 1024, 1024, 1024, 128] + - [90, 8089.0] + - - [448, 256, 1, 3328, 448, 448, 448, 256] + - [105, 9210.0] + - - [128, 704, 1, 1280, 128, 128, 128, 704] + - [62, 6039.0] + - - [1856, 128, 1, 3328, 1856, 1856, 1856, 128] + - [64, 11896.0] + - - [256, 448, 1, 256, 256, 256, 256, 448] + - [64, 3716.0] + - - [256, 1024, 1, 256, 256, 256, 256, 1024] + - [96, 6890.0] + - - [448, 448, 1, 3328, 448, 448, 448, 448] + - [62, 11722.0] + - - [1408, 128, 1, 1280, 1408, 1408, 1408, 128] + - [103, 9525.0] + - - [128, 1856, 1, 1280, 128, 128, 128, 1856] + - [97, 10806.0] + - - [64, 1408, 1, 3328, 64, 64, 64, 1408] + - [83, 7297.0] + - - [256, 448, 1, 3328, 256, 256, 256, 448] + - [67, 9202.0] + - - [64, 2368, 1, 1280, 64, 64, 64, 2368] + - [102, 9246.0] + - - [2368, 64, 1, 256, 2368, 2368, 2368, 64] + - [81, 5013.0] + - - [4288, 64, 1, 1280, 4288, 4288, 4288, 64] + - [76, 11924.0] + - - [128, 1024, 1, 1280, 128, 128, 128, 1024] + - [75, 8105.0] + - - [1856, 64, 1, 256, 1856, 1856, 1856, 64] + - [89, 4842.0] + - - [704, 128, 1, 256, 704, 704, 704, 128] + - [72, 3126.0] + - - [448, 256, 1, 1280, 448, 448, 448, 256] + - [62, 7694.0] + - - [1856, 128, 1, 1280, 1856, 1856, 1856, 128] + - [70, 11484.0] + - - [64, 3584, 1, 256, 64, 64, 64, 3584] + - [67, 7626.0] + - - [64, 1856, 1, 256, 64, 64, 64, 1856] + - [64, 3949.0] + - - [256, 1024, 1, 1280, 256, 256, 256, 1024] + - [98, 10230.0] + - - [3584, 64, 1, 1280, 3584, 3584, 3584, 64] + - [90, 10324.0] + - - [1408, 128, 1, 3328, 1408, 1408, 1408, 128] + - [62, 10764.0] + - - [64, 4288, 1, 3328, 64, 64, 64, 4288] + - [62, 11374.0] + - - [256, 704, 1, 256, 256, 256, 256, 704] + - [95, 6687.0] + - - [128, 1024, 1, 256, 128, 128, 128, 1024] + - [81, 4313.0] + - - [64, 2944, 1, 256, 64, 64, 64, 2944] + - [62, 7907.0] + - - [64, 1408, 1, 1280, 64, 64, 64, 1408] + - [85, 6175.0] + - - [704, 128, 1, 3328, 704, 704, 704, 128] + - [80, 7666.0] + - - [1408, 128, 1, 256, 1408, 1408, 1408, 128] + - [86, 5428.0] + - - [64, 2944, 1, 1280, 64, 64, 64, 2944] + - [62, 9733.0] + - - [704, 256, 1, 1280, 704, 704, 704, 256] + - [62, 9362.0] + - - [256, 448, 1, 1280, 256, 256, 256, 448] + - [88, 7606.0] + - - [64, 2368, 1, 3328, 64, 64, 64, 2368] + - [81, 11299.0] + - - [256, 704, 1, 3328, 256, 256, 256, 704] + - [111, 10612.0] + - - [64, 2944, 1, 3328, 64, 64, 64, 2944] + - [111, 11106.0] + - - [128, 1408, 1, 256, 128, 128, 128, 1408] + - [108, 6969.0] + - - [128, 1408, 1, 3328, 128, 128, 128, 1408] + - [62, 10556.0] + - - [1408, 64, 1, 256, 1408, 1408, 1408, 64] + - [114, 4402.0] + - - [64, 2368, 1, 256, 64, 64, 64, 2368] + - [75, 6978.0] + - - [1024, 128, 1, 3328, 1024, 1024, 1024, 128] + - [103, 9968.0] + - - [2368, 64, 1, 1280, 2368, 2368, 2368, 64] + - [102, 9299.0] + - - [4288, 64, 1, 256, 4288, 4288, 4288, 64] + - [60, 7125.0] + - - [64, 4288, 1, 1280, 64, 64, 64, 4288] + - [107, 10448.0] + - - [1408, 64, 1, 3328, 1408, 1408, 1408, 64] + - [92, 7658.0] + - - [448, 448, 1, 1280, 448, 448, 448, 448] + - [100, 10384.0] + - - [1024, 256, 1, 1280, 1024, 1024, 1024, 256] + - [78, 10211.0] + - - [3584, 64, 1, 3328, 3584, 3584, 3584, 64] + - [81, 11850.0] + - - [256, 1024, 1, 3328, 256, 256, 256, 1024] + - [62, 11015.0] + - - [1856, 64, 1, 3328, 1856, 1856, 1856, 64] + - [83, 9512.0] + - - [448, 256, 1, 256, 448, 448, 448, 256] + - [70, 3915.0] + - - [128, 704, 1, 256, 128, 128, 128, 704] + - [61, 3178.0] + - - [1024, 128, 1, 256, 1024, 1024, 1024, 128] + - [81, 4392.0] + - - [64, 3584, 1, 1280, 64, 64, 64, 3584] + - [90, 10426.0] + - - [3584, 64, 1, 256, 3584, 3584, 3584, 64] + - [97, 6658.0] + - - [64, 1856, 1, 3328, 64, 64, 64, 1856] + - [67, 9516.0] + - - [2944, 64, 1, 3328, 2944, 2944, 2944, 64] + - [100, 11005.0] + - - [128, 704, 1, 3328, 128, 128, 128, 704] + - [67, 7325.0] + - - [128, 1856, 1, 256, 128, 128, 128, 1856] + - [71, 6818.0] + - - [64, 4288, 1, 256, 64, 64, 64, 4288] + - [60, 7380.0] + - - [704, 256, 1, 3328, 704, 704, 704, 256] + - [80, 10616.0] + - - [256, 704, 1, 1280, 256, 256, 256, 704] + - [62, 9423.0] + - - [64, 1408, 1, 256, 64, 64, 64, 1408] + - [99, 3151.0] + - - [128, 1408, 1, 1280, 128, 128, 128, 1408] + - [100, 9400.0] + - - [128, 1856, 1, 3328, 128, 128, 128, 1856] + - [88, 12056.0] + - - [704, 256, 1, 256, 704, 704, 704, 256] + - [86, 5532.0] + - - [448, 64, 1, 1280, 448, 448, 448, 64] + - [84, 3469.0] + - - [64, 1024, 1, 1280, 64, 64, 64, 1024] + - [79, 5699.0] + - - [64, 704, 1, 1280, 64, 64, 64, 704] + - [101, 4396.0] + - - [64, 64, 1, 1280, 64, 64, 64, 64] + - [68, 428.0] + - - [128, 448, 1, 256, 128, 128, 128, 448] + - [112, 2353.0] + - - [256, 256, 1, 3328, 256, 256, 256, 256] + - [69, 6147.0] + - - [64, 448, 1, 1280, 64, 64, 64, 448] + - [68, 2849.0] + - - [64, 64, 1, 3328, 64, 64, 64, 64] + - [106, 541.0] + - - [256, 64, 1, 1280, 256, 256, 256, 64] + - [84, 1664.0] + - - [128, 448, 1, 1280, 128, 128, 128, 448] + - [79, 4779.0] + - - [704, 64, 1, 1280, 704, 704, 704, 64] + - [66, 4056.0] + - - [512, 32, 1, 512, 512, 512, 512, 32] + - [110, 1441.0] + - - [448, 64, 1, 3328, 448, 448, 448, 64] + - [66, 3537.0] + - - [64, 128, 1, 3328, 64, 64, 64, 128] + - [84, 1081.0] + - - [128, 128, 1, 3328, 128, 128, 128, 128] + - [68, 2352.0] + - - [256, 128, 1, 256, 256, 256, 256, 128] + - [63, 1427.0] + - - [64, 448, 1, 3328, 64, 64, 64, 448] + - [66, 3542.0] + - - [256, 64, 1, 256, 256, 256, 256, 64] + - [74, 713.0] + - - [256, 128, 1, 1280, 256, 256, 256, 128] + - [106, 3202.0] + - - [128, 64, 1, 1280, 128, 128, 128, 64] + - [84, 831.0] + - - [64, 1024, 1, 256, 64, 64, 64, 1024] + - [79, 2542.0] + - - [64, 704, 1, 256, 64, 64, 64, 704] + - [87, 2695.0] + - - [704, 64, 1, 3328, 704, 704, 704, 64] + - [66, 4972.0] + - - [512, 16, 1, 512, 512, 512, 512, 16] + - [104, 522.0] + - - [448, 128, 1, 256, 448, 448, 448, 128] + - [101, 2345.0] + - - [256, 256, 1, 256, 256, 256, 256, 256] + - [61, 2581.0] + - - [448, 128, 1, 3328, 448, 448, 448, 128] + - [106, 5620.0] + - - [128, 256, 1, 1280, 128, 128, 128, 256] + - [68, 3251.0] + - - [64, 256, 1, 1280, 64, 64, 64, 256] + - [84, 1641.0] + - - [1024, 32, 1, 512, 1024, 1024, 1024, 32] + - [84, 2135.0] + - - [64, 448, 1, 256, 64, 64, 64, 448] + - [74, 1232.0] + - - [64, 64, 1, 256, 64, 64, 64, 64] + - [91, 248.0] + - - [128, 256, 1, 3328, 128, 128, 128, 256] + - [65, 4039.0] + - - [64, 128, 1, 1280, 64, 64, 64, 128] + - [93, 823.0] + - - [128, 128, 1, 1280, 128, 128, 128, 128] + - [68, 1638.0] + - - [128, 256, 1, 256, 128, 128, 128, 256] + - [74, 1362.0] + - - [64, 128, 1, 256, 64, 64, 64, 128] + - [84, 532.0] + - - [704, 64, 1, 256, 704, 704, 704, 64] + - [99, 2898.0] + - - [128, 64, 1, 3328, 128, 128, 128, 64] + - [84, 1072.0] + - - [448, 64, 1, 256, 448, 448, 448, 64] + - [74, 1215.0] + - - [1024, 16, 1, 512, 1024, 1024, 1024, 16] + - [82, 1062.0] + - - [1024, 64, 1, 256, 1024, 1024, 1024, 64] + - [61, 2573.0] + - - [128, 64, 1, 256, 128, 128, 128, 64] + - [101, 352.0] + - - [1024, 64, 1, 1280, 1024, 1024, 1024, 64] + - [90, 4860.0] + - - [64, 1024, 1, 3328, 64, 64, 64, 1024] + - [94, 6546.0] + - - [448, 128, 1, 1280, 448, 448, 448, 128] + - [66, 4766.0] + - - [1024, 64, 1, 3328, 1024, 1024, 1024, 64] + - [94, 6120.0] + - - [64, 256, 1, 3328, 64, 64, 64, 256] + - [84, 2127.0] + - - [256, 256, 1, 1280, 256, 256, 256, 256] + - [64, 4888.0] + - - [256, 128, 1, 3328, 256, 256, 256, 128] + - [68, 4018.0] + - - [64, 256, 1, 256, 64, 64, 64, 256] + - [72, 699.0] + - - [64, 704, 1, 3328, 64, 64, 64, 704] + - [104, 4946.0] + - - [128, 448, 1, 3328, 128, 128, 128, 448] + - [66, 5626.0] + - - [256, 64, 1, 3328, 256, 256, 256, 64] + - [68, 2130.0] + - - [128, 128, 1, 256, 128, 128, 128, 128] + - [89, 1128.0] +- null +- null +- DeviceEfficiency +... diff --git a/library/src/blas3/Tensile/Logic/asm_full/navi22_Cijk_Ailk_Bjlk_HBH.yaml b/library/src/blas3/Tensile/Logic/asm_full/navi22_Cijk_Ailk_Bjlk_HBH.yaml new file mode 100644 index 000000000..d139a479a --- /dev/null +++ b/library/src/blas3/Tensile/Logic/asm_full/navi22_Cijk_Ailk_Bjlk_HBH.yaml @@ -0,0 +1,40006 @@ +--- +- {MinimumRequiredVersion: 4.28.0} +- navi22 +- gfx1031 +- [Device 73df] +- AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] +- - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 0 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_MT128x128x8_SN_SU0_SUM0_TT8_16_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 64 + LSPA: 8 + LSPB: 16 + LVCA: 16 + LVCB: 8 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 1 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_MT128x64x16_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 2 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_MT128x128x16_SN_SU0_SUM0_TT8_16_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 3 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_MT128x128x16_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 256 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 256 + MacroTileA: 128 + MacroTileB: 256 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 4 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_MT128x256x16_SN_SU0_SUM0_TT8_16_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 5 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_MT128x128x32_SN_SU0_SUM0_TT8_16_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 6 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_MT128x128x32_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 7 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_MT128x128x8_SN_SU32_SUM3_TT8_16_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 64 + LSPA: 8 + LSPB: 16 + LVCA: 16 + LVCB: 8 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 8 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_MT128x64x16_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 9 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_MT128x128x16_SN_SU32_SUM3_TT8_16_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 10 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_MT128x128x16_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 11 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_MT128x128x32_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 12 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_MT128x128x8_SN_SU0_SUM0_TT8_16_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 64 + LSPA: 8 + LSPB: 16 + LVCA: 16 + LVCB: 8 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 13 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_MT128x64x16_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 14 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_MT128x128x16_SN_SU0_SUM0_TT8_16_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 15 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_MT128x128x16_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 16 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_MT128x128x32_SN_SU0_SUM0_TT8_16_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 17 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_MT128x128x32_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 64 + LSPA: 8 + LSPB: 16 + LVCA: 16 + LVCB: 8 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 18 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_MT128x64x16_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 19 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_MT128x128x16_SN_SU32_SUM3_TT8_16_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 20 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_MT128x128x16_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 21 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_MT128x128x32_SN_SU32_SUM3_TT8_16_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 22 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_MT128x128x32_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 64 + LSPA: 8 + LSPB: 16 + LVCA: 16 + LVCB: 8 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 23 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_MT128x64x16_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 24 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_MT128x128x16_SN_SU0_SUM0_TT8_16_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 25 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_MT128x128x16_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 26 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_MT128x128x32_SN_SU0_SUM0_TT8_16_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 27 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_MT128x128x32_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 64 + LSPA: 8 + LSPB: 16 + LVCA: 16 + LVCB: 8 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 28 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_MT128x64x16_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 29 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_MT128x128x16_SN_SU32_SUM3_TT8_16_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 30 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_MT128x128x16_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 31 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_MT128x128x32_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 32 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_MT128x128x16_SN_SU0_SUM0_TT8_16_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 16 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 33 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_MT128x128x32_SN_SU0_SUM0_TT8_16_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 16 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 34 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_MT128x128x32_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 16 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 35 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_MT128x128x16_SN_SU32_SUM3_TT8_16_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 16 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 36 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_MT128x128x16_SN_SU0_SUM0_TT8_16_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 16 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 37 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_MT64x64x8_SN_SU0_SUM0_TT8_8_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 38 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_MT64x64x16_SN_SU0_SUM0_TT8_8_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 64 + LSPA: 8 + LSPB: 16 + LVCA: 16 + LVCB: 8 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 39 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_MT128x64x16_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 40 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_MT128x128x16_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 64 + LSPA: 8 + LSPB: 16 + LVCA: 16 + LVCB: 8 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 41 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_MT128x64x32_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 42 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_MT128x128x32_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 43 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_MT64x64x8_SN_SU32_SUM3_TT8_8_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 44 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_MT64x64x16_SN_SU32_SUM3_TT8_8_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 64 + LSPA: 8 + LSPB: 16 + LVCA: 16 + LVCB: 8 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 45 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_MT128x64x16_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 46 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_MT64x64x32_SN_SU32_SUM3_TT8_8_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 47 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_MT128x128x32_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 48 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_MT64x64x8_SN_SU0_SUM0_TT8_8_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 49 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_MT64x64x16_SN_SU0_SUM0_TT8_8_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 64 + LSPA: 8 + LSPB: 16 + LVCA: 16 + LVCB: 8 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 50 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_MT128x64x16_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 51 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_MT128x128x16_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 64 + LSPA: 8 + LSPB: 16 + LVCA: 16 + LVCB: 8 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 52 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_MT128x64x32_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 53 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_MT128x128x32_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 54 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_MT64x64x8_SN_SU32_SUM3_TT8_8_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 55 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_MT64x64x16_SN_SU32_SUM3_TT8_8_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 64 + LSPA: 8 + LSPB: 16 + LVCA: 16 + LVCB: 8 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 56 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_MT128x64x16_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 57 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_MT128x128x32_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 58 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_MT64x64x8_SN_SU0_SUM0_TT8_8_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 59 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_MT64x64x16_SN_SU0_SUM0_TT8_8_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 64 + LSPA: 8 + LSPB: 16 + LVCA: 16 + LVCB: 8 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 60 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_MT128x64x16_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 61 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_MT128x128x16_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 64 + LSPA: 8 + LSPB: 16 + LVCA: 16 + LVCB: 8 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 62 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_MT128x64x32_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 63 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_MT128x128x32_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 64 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_MT64x64x8_SN_SU32_SUM3_TT8_8_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 65 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_MT64x64x16_SN_SU32_SUM3_TT8_8_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 64 + LSPA: 8 + LSPB: 16 + LVCA: 16 + LVCB: 8 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 66 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_MT128x64x16_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 67 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_MT64x64x32_SN_SU32_SUM3_TT8_8_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 64 + LSPA: 8 + LSPB: 16 + LVCA: 16 + LVCB: 8 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 68 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_MT128x64x32_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 69 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_MT128x128x32_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 64 + LSPA: 8 + LSPB: 16 + LVCA: 16 + LVCB: 8 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 70 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_MT128x64x16_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 16 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 64 + LSPA: 8 + LSPB: 16 + LVCA: 16 + LVCB: 8 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 71 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_MT128x64x32_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 16 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 72 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_MT128x128x32_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 16 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 64 + LSPA: 8 + LSPB: 16 + LVCA: 16 + LVCB: 8 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 73 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_MT128x64x16_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 16 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 74 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_MT64x64x16_SN_SU0_SUM0_TT8_8_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 16 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 64 + LSPA: 8 + LSPB: 16 + LVCA: 16 + LVCB: 8 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 75 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_MT128x64x16_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 16 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 64 + LSPA: 8 + LSPB: 16 + LVCA: 16 + LVCB: 8 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 76 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_MT128x64x16_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 16 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 819 + LdsNumElementsAlignedA: 128 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 256 + LdsOffsetB: 128 + LdsOffsetB_Blk: 384 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 77 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_MT16x16x8_SN_SU0_SUM0_TT2_2_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 78 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_MT32x32x8_SN_SU0_SUM0_TT4_4_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 896 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 79 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_MT32x16x8_SN_SU0_SUM0_TT2_2_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 80 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_MT64x32x8_SN_SU0_SUM0_TT4_4_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 81 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_MT32x32x8_SN_SU0_SUM0_TT2_2_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 82 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_MT32x32x16_SN_SU0_SUM0_TT4_4_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 8 + LSPB: 16 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 83 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_MT32x16x16_SN_SU0_SUM0_TT2_2_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 84 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_MT64x32x16_SN_SU0_SUM0_TT4_4_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 85 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_MT32x32x16_SN_SU0_SUM0_TT2_2_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 86 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_MT64x64x16_SN_SU0_SUM0_TT4_4_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 8 + LSPB: 16 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 87 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_MT32x16x32_SN_SU0_SUM0_TT2_2_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 88 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_MT64x32x32_SN_SU0_SUM0_TT4_4_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 89 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_MT32x32x32_SN_SU0_SUM0_TT2_2_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 90 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_MT64x64x32_SN_SU0_SUM0_TT4_4_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 896 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 91 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_MT32x16x8_SN_SU32_SUM3_TT2_2_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 92 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_MT64x32x8_SN_SU32_SUM3_TT4_4_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 93 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_MT32x32x8_SN_SU32_SUM3_TT2_2_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 94 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_MT16x16x16_SN_SU32_SUM3_TT2_2_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 95 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_MT32x32x16_SN_SU32_SUM3_TT4_4_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 8 + LSPB: 16 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 96 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_MT32x16x16_SN_SU32_SUM3_TT2_2_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 97 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_MT64x32x16_SN_SU32_SUM3_TT4_4_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 98 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_MT32x32x16_SN_SU32_SUM3_TT2_2_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 99 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_MT64x64x16_SN_SU32_SUM3_TT4_4_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 100 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_MT16x16x32_SN_SU32_SUM3_TT2_2_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 101 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_MT32x32x32_SN_SU32_SUM3_TT4_4_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 8 + LSPB: 16 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 102 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_MT32x16x32_SN_SU32_SUM3_TT2_2_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 103 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_MT64x32x32_SN_SU32_SUM3_TT4_4_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 104 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_MT32x32x32_SN_SU32_SUM3_TT2_2_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 105 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_MT64x64x32_SN_SU32_SUM3_TT4_4_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 106 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_MT32x32x8_SN_SU0_SUM0_TT4_4_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 896 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 107 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_MT32x16x8_SN_SU0_SUM0_TT2_2_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 108 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_MT64x32x8_SN_SU0_SUM0_TT4_4_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 109 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_MT32x32x8_SN_SU0_SUM0_TT2_2_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 110 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_MT16x16x16_SN_SU0_SUM0_TT2_2_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 111 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_MT32x32x16_SN_SU0_SUM0_TT4_4_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 8 + LSPB: 16 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 112 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_MT32x16x16_SN_SU0_SUM0_TT2_2_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 113 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_MT64x32x16_SN_SU0_SUM0_TT4_4_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 114 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_MT32x32x16_SN_SU0_SUM0_TT2_2_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 115 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_MT64x64x16_SN_SU0_SUM0_TT4_4_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 116 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_MT32x32x32_SN_SU0_SUM0_TT4_4_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 8 + LSPB: 16 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 117 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_MT32x16x32_SN_SU0_SUM0_TT2_2_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 118 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_MT64x32x32_SN_SU0_SUM0_TT4_4_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 119 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_MT32x32x32_SN_SU0_SUM0_TT2_2_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 120 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_MT64x64x32_SN_SU0_SUM0_TT4_4_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 121 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_MT32x32x8_SN_SU32_SUM3_TT4_4_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 122 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_MT64x64x8_SN_SU32_SUM3_TT4_4_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 123 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_MT16x16x16_SN_SU32_SUM3_TT2_2_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 124 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_MT32x32x16_SN_SU32_SUM3_TT4_4_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 8 + LSPB: 16 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 125 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_MT32x16x16_SN_SU32_SUM3_TT2_2_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 126 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_MT64x32x16_SN_SU32_SUM3_TT4_4_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 127 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_MT64x64x16_SN_SU32_SUM3_TT4_4_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 128 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_MT32x32x32_SN_SU32_SUM3_TT4_4_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 8 + LSPB: 16 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 129 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_MT32x16x32_SN_SU32_SUM3_TT2_2_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 130 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_MT64x32x32_SN_SU32_SUM3_TT4_4_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 131 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_MT32x32x32_SN_SU32_SUM3_TT2_2_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 132 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_MT64x64x32_SN_SU32_SUM3_TT4_4_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 133 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_MT32x32x8_SN_SU0_SUM0_TT4_4_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 896 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 134 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_MT32x16x8_SN_SU0_SUM0_TT2_2_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 135 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_MT64x32x8_SN_SU0_SUM0_TT4_4_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 136 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_MT32x32x8_SN_SU0_SUM0_TT2_2_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 137 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_MT16x16x16_SN_SU0_SUM0_TT2_2_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 138 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_MT32x32x16_SN_SU0_SUM0_TT4_4_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 8 + LSPB: 16 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 139 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_MT32x16x16_SN_SU0_SUM0_TT2_2_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 140 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_MT64x32x16_SN_SU0_SUM0_TT4_4_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 141 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_MT64x64x16_SN_SU0_SUM0_TT4_4_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 142 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_MT32x32x32_SN_SU0_SUM0_TT4_4_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 8 + LSPB: 16 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 143 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_MT32x16x32_SN_SU0_SUM0_TT2_2_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 144 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_MT64x32x32_SN_SU0_SUM0_TT4_4_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 145 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_MT32x32x32_SN_SU0_SUM0_TT2_2_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 146 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_MT64x64x32_SN_SU0_SUM0_TT4_4_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 147 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_MT32x32x8_SN_SU32_SUM3_TT4_4_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 148 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_MT64x32x8_SN_SU32_SUM3_TT4_4_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 149 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_MT16x16x16_SN_SU32_SUM3_TT2_2_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 150 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_MT32x32x16_SN_SU32_SUM3_TT4_4_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 8 + LSPB: 16 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 151 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_MT32x16x16_SN_SU32_SUM3_TT2_2_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 152 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_MT64x32x16_SN_SU32_SUM3_TT4_4_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 153 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_MT16x16x32_SN_SU32_SUM3_TT2_2_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 154 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_MT64x32x32_SN_SU32_SUM3_TT4_4_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 155 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_MT32x32x32_SN_SU32_SUM3_TT2_2_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 156 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_MT64x64x32_SN_SU32_SUM3_TT4_4_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 157 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_MT16x16x32_SN_SU0_SUM0_TT2_2_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 16 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 8 + LSPB: 16 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 158 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_MT32x16x32_SN_SU0_SUM0_TT2_2_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 16 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 159 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_MT64x64x32_SN_SU0_SUM0_TT4_4_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 16 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 160 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_MT16x16x32_SN_SU32_SUM3_TT2_2_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 16 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 8 + LSPB: 16 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 161 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_MT32x16x32_SN_SU32_SUM3_TT2_2_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 16 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 162 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_MT16x16x32_SN_SU0_SUM0_TT2_2_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 16 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 8 + LSPB: 16 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 163 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_MT32x16x32_SN_SU0_SUM0_TT2_2_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 16 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 164 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_MT64x32x16_SN_SU0_SUM0_TT4_4_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 16 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 8 + LSPB: 16 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 165 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_MT32x16x32_SN_SU0_SUM0_TT2_2_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 16 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 8 + LSPA: 4 + LSPB: 8 + LVCA: 16 + LVCB: 8 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 832 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 166 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_MT32x8x8_SN_SU0_SUM0_TT2_2_WG16_4_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 8 + LSPA: 4 + LSPB: 8 + LVCA: 16 + LVCB: 8 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 832 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 167 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_MT32x8x8_SN_SU0_SUM0_TT2_2_WG16_4_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 +- [2, 3, 0, 1] +- - - [4096, 7133, 1, 4096, 4096, 4096, 4096, 7133] + - [19, 23743.0] + - - [2048, 7133, 1, 2048, 2048, 2048, 2048, 7133] + - [14, 23504.0] + - - [2560, 7133, 1, 2560, 2560, 2560, 2560, 7133] + - [14, 23722.0] + - - [3072, 7435, 1, 1024, 3072, 3072, 3072, 7435] + - [24, 23269.0] + - - [1760, 7133, 1, 1760, 1760, 1760, 1760, 7133] + - [24, 22755.0] + - - [7680, 5481, 1, 2560, 7680, 7680, 7680, 5481] + - [14, 23833.0] + - - [3136, 256, 64, 64, 3136, 3136, 3136, 256] + - [0, 14948.0] + - - [784, 512, 64, 128, 784, 784, 784, 512] + - [9, 18660.0] + - - [784, 128, 64, 512, 784, 784, 784, 128] + - [2, 19341.0] + - - [196, 256, 128, 1024, 196, 196, 196, 256] + - [2, 17344.0] + - - [196, 256, 64, 1024, 196, 196, 196, 256] + - [9, 16900.0] + - - [196, 1024, 128, 256, 196, 196, 196, 1024] + - [14, 16702.0] + - - [784, 128, 256, 512, 784, 784, 784, 128] + - [2, 20273.0] + - - [3136, 256, 256, 64, 3136, 3136, 3136, 256] + - [0, 15469.0] + - - [784, 128, 128, 512, 784, 784, 784, 128] + - [14, 19934.0] + - - [784, 512, 128, 128, 784, 784, 784, 512] + - [14, 18990.0] + - - [784, 512, 256, 128, 784, 784, 784, 512] + - [9, 19215.0] + - - [196, 1024, 64, 256, 196, 196, 196, 1024] + - [2, 16448.0] + - - [196, 1024, 256, 256, 196, 196, 196, 1024] + - [19, 16957.0] + - - [196, 256, 256, 1024, 196, 196, 196, 256] + - [2, 17659.0] + - - [3136, 256, 128, 64, 3136, 3136, 3136, 256] + - [0, 18093.0] + - - [1024, 4096, 1, 2048, 1024, 1024, 1024, 4096] + - [2, 22829.0] + - - [1024, 4096, 1, 4096, 1024, 1024, 1024, 4096] + - [2, 22959.0] + - - [1024, 30528, 1, 2048, 1024, 1024, 1024, 30528] + - [2, 23749.0] + - - [1024, 30528, 1, 4096, 1024, 1024, 1024, 30528] + - [9, 23837.0] + - - [4096, 1024, 1, 2048, 4096, 4096, 4096, 1024] + - [2, 22748.0] + - - [4096, 1024, 1, 4096, 4096, 4096, 4096, 1024] + - [2, 22984.0] + - - [256, 4864, 1, 8976, 256, 256, 256, 4864] + - [9, 20827.0] + - - [256, 5120, 1, 8976, 256, 256, 256, 5120] + - [29, 22034.0] + - - [256, 5632, 1, 8976, 256, 256, 256, 5632] + - [29, 19751.0] + - - [256, 5888, 1, 8976, 256, 256, 256, 5888] + - [29, 20730.0] + - - [256, 6144, 1, 8976, 256, 256, 256, 6144] + - [19, 21544.0] + - - [256, 7168, 1, 8976, 256, 256, 256, 7168] + - [14, 21324.0] + - - [256, 8192, 1, 8976, 256, 256, 256, 8192] + - [14, 21197.0] + - - [256, 8960, 1, 8976, 256, 256, 256, 8960] + - [2, 22899.0] + - - [256, 9728, 1, 8976, 256, 256, 256, 9728] + - [14, 22058.0] + - - [256, 9984, 1, 8976, 256, 256, 256, 9984] + - [9, 22512.0] + - - [256, 10240, 1, 8976, 256, 256, 256, 10240] + - [19, 23042.0] + - - [256, 10496, 1, 8976, 256, 256, 256, 10496] + - [2, 21388.0] + - - [256, 11008, 1, 8976, 256, 256, 256, 11008] + - [14, 22335.0] + - - [256, 11264, 1, 8976, 256, 256, 256, 11264] + - [2, 22697.0] + - - [256, 11520, 1, 8976, 256, 256, 256, 11520] + - [9, 23217.0] + - - [256, 11776, 1, 8976, 256, 256, 256, 11776] + - [9, 21658.0] + - - [256, 12544, 1, 8976, 256, 256, 256, 12544] + - [19, 22861.0] + - - [256, 12800, 1, 8976, 256, 256, 256, 12800] + - [19, 23345.0] + - - [256, 13312, 1, 8976, 256, 256, 256, 13312] + - [14, 22286.0] + - - [256, 13568, 1, 8976, 256, 256, 256, 13568] + - [14, 22699.0] + - - [256, 14336, 1, 8976, 256, 256, 256, 14336] + - [9, 22064.0] + - - [256, 14848, 1, 8976, 256, 256, 256, 14848] + - [9, 22776.0] + - - [256, 15104, 1, 8976, 256, 256, 256, 15104] + - [14, 23143.0] + - - [256, 15872, 1, 8976, 256, 256, 256, 15872] + - [2, 22613.0] + - - [256, 16128, 1, 8976, 256, 256, 256, 16128] + - [2, 22930.0] + - - [256, 17152, 1, 8976, 256, 256, 256, 17152] + - [9, 22763.0] + - - [256, 17408, 1, 8976, 256, 256, 256, 17408] + - [19, 23046.0] + - - [256, 18688, 1, 8976, 256, 256, 256, 18688] + - [9, 23060.0] + - - [256, 19968, 1, 8976, 256, 256, 256, 19968] + - [29, 23198.0] + - - [256, 20480, 1, 8976, 256, 256, 256, 20480] + - [19, 23699.0] + - - [256, 20992, 1, 8976, 256, 256, 256, 20992] + - [9, 23008.0] + - - [256, 21248, 1, 8976, 256, 256, 256, 21248] + - [19, 23217.0] + - - [256, 22016, 1, 8976, 256, 256, 256, 22016] + - [2, 22847.0] + - - [256, 26112, 1, 8976, 256, 256, 256, 26112] + - [14, 23269.0] + - - [256, 32512, 1, 8976, 256, 256, 256, 32512] + - [14, 23461.0] + - - [256, 32768, 1, 1, 256, 256, 256, 32768] + - [0, 428.0] + - - [256, 33536, 1, 8976, 256, 256, 256, 33536] + - [14, 23372.0] + - - [256, 44505, 1, 8976, 256, 256, 256, 44505] + - [29, 23846.0] + - - [768, 2048, 1, 256, 768, 768, 768, 2048] + - [15, 18220.0] + - - [1600, 1024, 1, 512, 1600, 1600, 1600, 1024] + - [2, 17549.0] + - - [1600, 1024, 1, 960, 1600, 1600, 1600, 1024] + - [2, 18209.0] + - - [2048, 960, 1, 1, 2048, 2048, 2048, 960] + - [1, 287.0] + - - [2048, 2048, 1, 512, 2048, 2048, 2048, 2048] + - [14, 22153.0] + - - [2048, 2048, 1, 960, 2048, 2048, 2048, 2048] + - [2, 22636.0] + - - [2048, 2048, 1, 1024, 2048, 2048, 2048, 2048] + - [19, 22194.0] + - - [3200, 2048, 1, 1024, 3200, 3200, 3200, 2048] + - [24, 23355.0] + - - [4096, 4096, 1, 1024, 4096, 4096, 4096, 4096] + - [19, 23535.0] + - - [1024, 4096, 1, 3840, 1024, 1024, 1024, 4096] + - [2, 23087.0] + - - [1024, 4096, 1, 3968, 1024, 1024, 1024, 4096] + - [9, 22895.0] + - - [1024, 4096, 1, 6528, 1024, 1024, 1024, 4096] + - [2, 23126.0] + - - [1024, 4096, 1, 7104, 1024, 1024, 1024, 4096] + - [24, 22799.0] + - - [1024, 4096, 1, 7200, 1024, 1024, 1024, 4096] + - [24, 22763.0] + - - [1024, 4096, 1, 8064, 1024, 1024, 1024, 4096] + - [21, 22159.0] + - - [1024, 4096, 1, 8160, 1024, 1024, 1024, 4096] + - [26, 22255.0] + - - [1024, 4096, 1, 9216, 1024, 1024, 1024, 4096] + - [19, 23193.0] + - - [1024, 4096, 1, 9520, 1024, 1024, 1024, 4096] + - [19, 23178.0] + - - [1024, 4096, 1, 10064, 1024, 1024, 1024, 4096] + - [29, 23184.0] + - - [1024, 4096, 1, 10080, 1024, 1024, 1024, 4096] + - [9, 23179.0] + - - [1024, 4096, 1, 10200, 1024, 1024, 1024, 4096] + - [29, 23179.0] + - - [1024, 42720, 1, 3968, 1024, 1024, 1024, 42720] + - [2, 24100.0] + - - [1024, 42720, 1, 6528, 1024, 1024, 1024, 42720] + - [2, 24114.0] + - - [1024, 42720, 1, 7104, 1024, 1024, 1024, 42720] + - [9, 24047.0] + - - [1024, 42720, 1, 7200, 1024, 1024, 1024, 42720] + - [9, 24037.0] + - - [1024, 42720, 1, 9520, 1024, 1024, 1024, 42720] + - [14, 23518.0] + - - [1024, 42720, 1, 10080, 1024, 1024, 1024, 42720] + - [2, 23554.0] + - - [4096, 1024, 1, 3840, 4096, 4096, 4096, 1024] + - [2, 22909.0] + - - [4096, 1024, 1, 3968, 4096, 4096, 4096, 1024] + - [14, 22892.0] + - - [4096, 1024, 1, 6528, 4096, 4096, 4096, 1024] + - [2, 23064.0] + - - [4096, 1024, 1, 7104, 4096, 4096, 4096, 1024] + - [16, 22583.0] + - - [4096, 1024, 1, 7200, 4096, 4096, 4096, 1024] + - [16, 22583.0] + - - [4096, 1024, 1, 8064, 4096, 4096, 4096, 1024] + - [19, 23097.0] + - - [4096, 1024, 1, 8160, 4096, 4096, 4096, 1024] + - [19, 23114.0] + - - [4096, 1024, 1, 9216, 4096, 4096, 4096, 1024] + - [19, 23105.0] + - - [4096, 1024, 1, 9520, 4096, 4096, 4096, 1024] + - [19, 23196.0] + - - [4096, 1024, 1, 10064, 4096, 4096, 4096, 1024] + - [9, 23205.0] + - - [4096, 1024, 1, 10080, 4096, 4096, 4096, 1024] + - [24, 23241.0] + - - [4096, 1024, 1, 10200, 4096, 4096, 4096, 1024] + - [9, 23176.0] + - - [1024, 4096, 1, 3240, 1024, 1024, 1024, 4096] + - [14, 22981.0] + - - [1024, 4096, 1, 3960, 1024, 1024, 1024, 4096] + - [24, 22831.0] + - - [1024, 42720, 1, 3960, 1024, 1024, 1024, 42720] + - [9, 24082.0] + - - [4096, 1024, 1, 3240, 4096, 4096, 4096, 1024] + - [2, 22924.0] + - - [4096, 1024, 1, 3960, 4096, 4096, 4096, 1024] + - [2, 23048.0] + - - [1225, 192, 64, 32, 1225, 1225, 1225, 192] + - [13, 9940.0] + - - [1225, 192, 64, 48, 1225, 1225, 1225, 192] + - [13, 14520.0] + - - [1225, 192, 64, 64, 1225, 1225, 1225, 192] + - [23, 16089.0] + - - [1225, 256, 64, 48, 1225, 1225, 1225, 256] + - [13, 15429.0] + - - [1225, 256, 64, 64, 1225, 1225, 1225, 256] + - [25, 16725.0] + - - [1225, 288, 64, 48, 1225, 1225, 1225, 288] + - [8, 12541.0] + - - [1225, 288, 64, 64, 1225, 1225, 1225, 288] + - [13, 14512.0] + - - [289, 768, 64, 128, 289, 289, 289, 768] + - [19, 15414.0] + - - [289, 768, 64, 160, 289, 289, 289, 768] + - [19, 15898.0] + - - [289, 768, 64, 192, 289, 289, 289, 768] + - [9, 16219.0] + - - [1225, 192, 32, 32, 1225, 1225, 1225, 192] + - [18, 10490.0] + - - [1225, 192, 32, 48, 1225, 1225, 1225, 192] + - [28, 14156.0] + - - [1225, 192, 32, 64, 1225, 1225, 1225, 192] + - [23, 16067.0] + - - [1225, 256, 32, 48, 1225, 1225, 1225, 256] + - [23, 14209.0] + - - [1225, 256, 32, 64, 1225, 1225, 1225, 256] + - [13, 16752.0] + - - [1225, 288, 32, 48, 1225, 1225, 1225, 288] + - [18, 12596.0] + - - [1225, 288, 32, 64, 1225, 1225, 1225, 288] + - [23, 14561.0] + - - [289, 768, 32, 128, 289, 289, 289, 768] + - [9, 14735.0] + - - [289, 768, 32, 160, 289, 289, 289, 768] + - [9, 15315.0] + - - [289, 768, 32, 192, 289, 289, 289, 768] + - [19, 15638.0] + - - [3136, 256, 32, 64, 3136, 3136, 3136, 256] + - [30, 19021.0] + - - [784, 128, 32, 512, 784, 784, 784, 128] + - [14, 18132.0] + - - [784, 512, 32, 128, 784, 784, 784, 512] + - [2, 18256.0] + - - [196, 1024, 32, 256, 196, 196, 196, 1024] + - [19, 15904.0] + - - [3136, 128, 64, 256, 3136, 3136, 3136, 128] + - [14, 22163.0] + - - [784, 256, 64, 512, 784, 784, 784, 256] + - [2, 20202.0] + - - [3136, 256, 64, 128, 3136, 3136, 3136, 256] + - [2, 21491.0] + - - [3136, 256, 64, 256, 3136, 3136, 3136, 256] + - [2, 22551.0] + - - [196, 512, 64, 1024, 196, 196, 196, 512] + - [14, 17560.0] + - - [784, 512, 64, 256, 784, 784, 784, 512] + - [2, 19838.0] + - - [784, 512, 64, 512, 784, 784, 784, 512] + - [14, 20464.0] + - - [196, 1024, 64, 512, 196, 196, 196, 1024] + - [14, 17319.0] + - - [196, 1024, 64, 1024, 196, 196, 196, 1024] + - [24, 17824.0] + - - [3136, 128, 32, 256, 3136, 3136, 3136, 128] + - [19, 21943.0] + - - [784, 256, 32, 512, 784, 784, 784, 256] + - [24, 19555.0] + - - [3136, 256, 32, 128, 3136, 3136, 3136, 256] + - [29, 21033.0] + - - [3136, 256, 32, 256, 3136, 3136, 3136, 256] + - [29, 22255.0] + - - [196, 512, 32, 1024, 196, 196, 196, 512] + - [14, 17081.0] + - - [784, 512, 32, 256, 784, 784, 784, 512] + - [2, 19553.0] + - - [784, 512, 32, 512, 784, 784, 784, 512] + - [24, 20224.0] + - - [196, 1024, 32, 512, 196, 196, 196, 1024] + - [14, 16994.0] + - - [196, 1024, 32, 1024, 196, 196, 196, 1024] + - [14, 17478.0] + - - [7680, 8192, 1, 8192, 7680, 7680, 7680, 8192] + - [14, 23989.0] + - - [3840, 4096, 1, 4096, 3840, 3840, 3840, 4096] + - [2, 23967.0] + - - [1920, 2048, 1, 2048, 1920, 1920, 1920, 2048] + - [14, 22744.0] + - - [8192, 7680, 1, 8192, 8192, 8192, 8192, 7680] + - [19, 23990.0] + - - [4096, 3840, 1, 4096, 4096, 4096, 4096, 3840] + - [2, 23995.0] + - - [2048, 1920, 1, 2048, 2048, 2048, 2048, 1920] + - [14, 22821.0] + - - [8192, 8192, 1, 8192, 8192, 8192, 8192, 8192] + - [14, 23937.0] + - - [4096, 4096, 1, 4096, 4096, 4096, 4096, 4096] + - [2, 23738.0] + - - [2048, 2048, 1, 2048, 2048, 2048, 2048, 2048] + - [14, 22328.0] + - - [1024, 4096, 1, 512, 1024, 1024, 1024, 4096] + - [14, 21479.0] + - - [1024, 30522, 1, 77, 1024, 1024, 1024, 30522] + - [23, 17195.0] + - - [4096, 1024, 1, 512, 4096, 4096, 4096, 1024] + - [14, 22194.0] + - - [1024, 4096, 1, 1280, 1024, 1024, 1024, 4096] + - [9, 22751.0] + - - [1024, 30522, 1, 200, 1024, 1024, 1024, 30522] + - [12, 22377.0] + - - [4096, 1024, 1, 1280, 4096, 4096, 4096, 1024] + - [2, 22855.0] + - - [1024, 4096, 1, 4992, 1024, 1024, 1024, 4096] + - [9, 23119.0] + - - [1024, 30522, 1, 780, 1024, 1024, 1024, 30522] + - [24, 23473.0] + - - [4096, 1024, 1, 4992, 4096, 4096, 4096, 1024] + - [9, 23088.0] + - - [1024, 30522, 1, 308, 1024, 1024, 1024, 30522] + - [24, 22726.0] + - - [1024, 4096, 1, 5120, 1024, 1024, 1024, 4096] + - [2, 23119.0] + - - [1024, 30522, 1, 800, 1024, 1024, 1024, 30522] + - [29, 23666.0] + - - [4096, 1024, 1, 5120, 4096, 4096, 4096, 1024] + - [9, 23092.0] + - - [1024, 4096, 1, 5248, 1024, 1024, 1024, 4096] + - [2, 23057.0] + - - [1024, 30522, 1, 820, 1024, 1024, 1024, 30522] + - [9, 23505.0] + - - [4096, 1024, 1, 5248, 4096, 4096, 4096, 1024] + - [9, 23111.0] + - - [1024, 4096, 1, 2560, 1024, 1024, 1024, 4096] + - [29, 22677.0] + - - [1024, 30522, 1, 385, 1024, 1024, 1024, 30522] + - [14, 22919.0] + - - [4096, 1024, 1, 2560, 4096, 4096, 4096, 1024] + - [9, 23021.0] + - - [1024, 4096, 1, 3072, 1024, 1024, 1024, 4096] + - [2, 22855.0] + - - [1024, 30522, 1, 462, 1024, 1024, 1024, 30522] + - [14, 23039.0] + - - [4096, 1024, 1, 3072, 4096, 4096, 4096, 1024] + - [2, 23000.0] + - - [1024, 4096, 1, 1024, 1024, 1024, 1024, 4096] + - [14, 22155.0] + - - [1024, 30522, 1, 160, 1024, 1024, 1024, 30522] + - [14, 21931.0] + - - [4096, 1024, 1, 1024, 4096, 4096, 4096, 1024] + - [9, 22596.0] + - - [1024, 4096, 1, 1152, 1024, 1024, 1024, 4096] + - [29, 22717.0] + - - [1024, 30522, 1, 180, 1024, 1024, 1024, 30522] + - [29, 21841.0] + - - [4096, 1024, 1, 1152, 4096, 4096, 4096, 1024] + - [9, 22715.0] + - - [1024, 4096, 1, 8192, 1024, 1024, 1024, 4096] + - [9, 23165.0] + - - [1024, 4096, 1, 9600, 1024, 1024, 1024, 4096] + - [29, 23207.0] + - - [1024, 33712, 1, 8192, 1024, 1024, 1024, 33712] + - [24, 24030.0] + - - [1024, 33712, 1, 9600, 1024, 1024, 1024, 33712] + - [2, 24031.0] + - - [4096, 1024, 1, 8192, 4096, 4096, 4096, 1024] + - [2, 23122.0] + - - [4096, 1024, 1, 9600, 4096, 4096, 4096, 1024] + - [2, 23169.0] + - - [1024, 1600, 1, 1, 1024, 1024, 1024, 1600] + - [13, 246.0] + - - [2560, 1920, 1, 2048, 2560, 2560, 2560, 1920] + - [24, 23359.0] + - - [1024, 3072, 1, 4096, 1024, 1024, 1024, 3072] + - [9, 22308.0] + - - [2560, 2560, 1, 2048, 2560, 2560, 2560, 2560] + - [19, 23584.0] + - - [2048, 2048, 1, 2, 2048, 2048, 2048, 2048] + - [1, 639.0] + - - [1024, 30592, 1, 2048, 1024, 1024, 1024, 30592] + - [2, 24011.0] + - - [1024, 3072, 1, 16384, 1024, 1024, 1024, 3072] + - [2, 22444.0] + - - [6144, 1536, 1, 4096, 6144, 6144, 6144, 1536] + - [2, 23703.0] + - - [1536, 4608, 1, 8192, 1536, 1536, 1536, 4608] + - [29, 23439.0] + - - [640, 2560, 1, 2048, 640, 640, 640, 2560] + - [9, 21419.0] + - - [1024, 4096, 1, 16384, 1024, 1024, 1024, 4096] + - [29, 23227.0] + - - [1536, 6144, 1, 4096, 1536, 1536, 1536, 6144] + - [2, 23736.0] + - - [1024, 30592, 1, 4096, 1024, 1024, 1024, 30592] + - [29, 24106.0] + - - [2560, 2560, 1, 4, 2560, 2560, 2560, 2560] + - [15, 1736.0] + - - [1536, 1536, 1, 4096, 1536, 1536, 1536, 1536] + - [2, 20883.0] + - - [2560, 7680, 1, 2048, 2560, 2560, 2560, 7680] + - [24, 23986.0] + - - [1536, 50304, 1, 4096, 1536, 1536, 1536, 50304] + - [9, 24179.0] + - - [2048, 8192, 1, 1024, 2048, 2048, 2048, 8192] + - [14, 23492.0] + - - [1024, 30592, 1, 8192, 1024, 1024, 1024, 30592] + - [24, 24092.0] + - - [4096, 1024, 1, 16384, 4096, 4096, 4096, 1024] + - [24, 23297.0] + - - [8192, 2048, 1, 1024, 8192, 8192, 8192, 2048] + - [19, 23609.0] + - - [1024, 50304, 1, 4096, 1024, 1024, 1024, 50304] + - [2, 24136.0] + - - [1536, 4608, 1, 4096, 1536, 1536, 1536, 4608] + - [2, 23433.0] + - - [6144, 1536, 1, 8192, 6144, 6144, 6144, 1536] + - [24, 23812.0] + - - [1024, 3072, 1, 8192, 1024, 1024, 1024, 3072] + - [2, 22356.0] + - - [1536, 1536, 1, 8192, 1536, 1536, 1536, 1536] + - [9, 20914.0] + - - [1536, 50304, 1, 8192, 1536, 1536, 1536, 50304] + - [2, 23840.0] + - - [2048, 6144, 1, 1024, 2048, 2048, 2048, 6144] + - [2, 23331.0] + - - [2048, 30592, 1, 1024, 2048, 2048, 2048, 30592] + - [14, 23975.0] + - - [1536, 6144, 1, 8192, 1536, 1536, 1536, 6144] + - [24, 23765.0] + - - [1024, 50304, 1, 2048, 1024, 1024, 1024, 50304] + - [29, 24097.0] + - - [1024, 50304, 1, 8192, 1024, 1024, 1024, 50304] + - [0, 23653.0] + - - [1024, 3072, 1, 2048, 1024, 1024, 1024, 3072] + - [2, 21918.0] + - - [1024, 50304, 1, 16384, 1024, 1024, 1024, 50304] + - [14, 23906.0] + - - [1024, 30528, 1, 8192, 1024, 1024, 1024, 30528] + - [9, 23994.0] + - - [256, 6912, 1, 1, 256, 256, 256, 6912] + - [3, 140.0] + - - [30528, 1024, 1, 640, 30528, 30528, 30528, 1024] + - [19, 23554.0] + - - [30528, 1024, 1, 1280, 30528, 30528, 30528, 1024] + - [14, 23837.0] + - - [4096, 1024, 1, 10240, 4096, 4096, 4096, 1024] + - [19, 23206.0] + - - [1024, 4096, 1, 10240, 1024, 1024, 1024, 4096] + - [9, 23147.0] + - - [30528, 1024, 1, 1600, 30528, 30528, 30528, 1024] + - [29, 23874.0] + - - [1024, 4096, 1, 10496, 1024, 1024, 1024, 4096] + - [2, 23169.0] + - - [30528, 1024, 1, 1640, 30528, 30528, 30528, 1024] + - [14, 23797.0] + - - [4096, 1024, 1, 10496, 4096, 4096, 4096, 1024] + - [29, 23198.0] + - - [30528, 1024, 1, 160, 30528, 30528, 30528, 1024] + - [14, 22065.0] + - - [1024, 4096, 1, 6144, 1024, 1024, 1024, 4096] + - [2, 23153.0] + - - [30528, 1024, 1, 240, 30528, 30528, 30528, 1024] + - [19, 22792.0] + - - [4096, 1024, 1, 6144, 4096, 4096, 4096, 1024] + - [2, 23119.0] + - - [1024, 4096, 1, 10224, 1024, 1024, 1024, 4096] + - [24, 23219.0] + - - [4096, 1024, 1, 10224, 4096, 4096, 4096, 1024] + - [14, 23232.0] + - - [1024, 3072, 1, 10224, 1024, 1024, 1024, 3072] + - [14, 22463.0] + - - [1024, 3072, 1, 10240, 1024, 1024, 1024, 3072] + - [24, 22083.0] + - - [4096, 1024, 1, 10192, 4096, 4096, 4096, 1024] + - [19, 23239.0] + - - [1024, 3072, 1, 10192, 1024, 1024, 1024, 3072] + - [2, 22492.0] + - - [1024, 4096, 1, 10192, 1024, 1024, 1024, 4096] + - [14, 23199.0] + - - [1024, 3072, 1, 10200, 1024, 1024, 1024, 3072] + - [2, 22474.0] + - - [4096, 1024, 1, 10208, 4096, 4096, 4096, 1024] + - [19, 23188.0] + - - [1024, 3072, 1, 10208, 1024, 1024, 1024, 3072] + - [2, 22535.0] + - - [1024, 4096, 1, 10208, 1024, 1024, 1024, 4096] + - [29, 23211.0] + - - [1024, 2048, 1, 10224, 1024, 1024, 1024, 2048] + - [2, 21160.0] + - - [1024, 2048, 1, 10240, 1024, 1024, 1024, 2048] + - [14, 21011.0] + - - [1024, 2048, 1, 10192, 1024, 1024, 1024, 2048] + - [2, 21035.0] + - - [1024, 3072, 1, 10080, 1024, 1024, 1024, 3072] + - [24, 22046.0] + - - [100352, 256, 1, 512, 100352, 100352, 100352, 256] + - [24, 23355.0] + - - [12544, 1024, 1, 2048, 12544, 12544, 12544, 1024] + - [2, 23546.0] + - - [12544, 147, 1, 64, 12544, 12544, 12544, 147] + - [23, 9721.0] + - - [200704, 256, 1, 512, 200704, 200704, 200704, 256] + - [29, 23656.0] + - - [25088, 512, 1, 1024, 25088, 25088, 25088, 512] + - [2, 23352.0] + - - [3136, 576, 1, 64, 3136, 3136, 3136, 576] + - [8, 12457.0] + - - [50176, 512, 1, 1024, 50176, 50176, 50176, 512] + - [14, 23768.0] + - - [6272, 1024, 1, 2048, 6272, 6272, 6272, 1024] + - [2, 23166.0] + - - [3136, 256, 128, 128, 3136, 3136, 3136, 256] + - [9, 21715.0] + - - [3136, 256, 256, 128, 3136, 3136, 3136, 256] + - [19, 21922.0] + - - [784, 512, 128, 256, 784, 784, 784, 512] + - [2, 20077.0] + - - [784, 512, 256, 256, 784, 784, 784, 512] + - [19, 20211.0] + - - [30528, 1024, 1, 2560, 30528, 30528, 30528, 1024] + - [24, 23953.0] + - - [1024, 4096, 1, 12288, 1024, 1024, 1024, 4096] + - [29, 23262.0] + - - [30528, 1024, 1, 1920, 30528, 30528, 30528, 1024] + - [24, 23910.0] + - - [4096, 1024, 1, 12288, 4096, 4096, 4096, 1024] + - [29, 23276.0] + - - [25600, 128, 25, 128, 25600, 25600, 25600, 128] + - [2, 20621.0] + - - [12544, 128, 36, 128, 12544, 12544, 12544, 128] + - [16, 21289.0] + - - [9216, 128, 49, 128, 9216, 9216, 9216, 128] + - [21, 20785.0] + - - [6400, 128, 64, 128, 6400, 6400, 6400, 128] + - [16, 21402.0] + - - [6400, 256, 25, 256, 6400, 6400, 6400, 256] + - [2, 23140.0] + - - [4096, 256, 36, 256, 4096, 4096, 4096, 256] + - [2, 23024.0] + - - [2304, 256, 49, 256, 2304, 2304, 2304, 256] + - [2, 22912.0] + - - [2304, 256, 64, 256, 2304, 2304, 2304, 256] + - [9, 23086.0] + - - [2304, 512, 25, 512, 2304, 2304, 2304, 512] + - [2, 23551.0] + - - [1024, 512, 36, 512, 1024, 1024, 1024, 512] + - [2, 23339.0] + - - [1024, 512, 49, 512, 1024, 1024, 1024, 512] + - [2, 23466.0] + - - [1024, 512, 64, 512, 1024, 1024, 1024, 512] + - [2, 23570.0] + - - [3072, 768, 1, 2048, 3072, 3072, 3072, 768] + - [9, 20647.0] + - - [768, 3072, 1, 2048, 768, 768, 768, 3072] + - [14, 20720.0] + - - [3072, 768, 1, 4608, 3072, 3072, 3072, 768] + - [9, 20955.0] + - - [768, 3072, 1, 4608, 768, 768, 768, 3072] + - [19, 20825.0] + - - [4096, 1024, 1, 4608, 4096, 4096, 4096, 1024] + - [2, 23160.0] + - - [1024, 4096, 1, 4608, 1024, 1024, 1024, 4096] + - [29, 22950.0] + - - [196, 1024, 128, 512, 196, 196, 196, 1024] + - [29, 17614.0] + - - [196, 1024, 256, 512, 196, 196, 196, 1024] + - [29, 17775.0] + - - [4880, 256, 49, 256, 4880, 4880, 4880, 256] + - [19, 22596.0] + - - [3128, 256, 64, 256, 3128, 3128, 3128, 256] + - [14, 22520.0] + - - [4680, 256, 49, 256, 4680, 4680, 4680, 256] + - [24, 22800.0] + - - [5280, 256, 36, 256, 5280, 5280, 5280, 256] + - [2, 22619.0] + - - [2640, 256, 64, 256, 2640, 2640, 2640, 256] + - [24, 22482.0] + - - [5304, 256, 49, 256, 5304, 5304, 5304, 256] + - [2, 22797.0] + - - [2760, 256, 64, 256, 2760, 2760, 2760, 256] + - [9, 22455.0] + - - [6440, 256, 36, 256, 6440, 6440, 6440, 256] + - [24, 22782.0] + - - [5704, 256, 36, 256, 5704, 5704, 5704, 256] + - [9, 22793.0] + - - [2128, 256, 64, 256, 2128, 2128, 2128, 256] + - [2, 22337.0] + - - [1160, 256, 49, 256, 1160, 1160, 1160, 256] + - [2, 20153.0] + - - [4056, 256, 49, 256, 4056, 4056, 4056, 256] + - [2, 22755.0] + - - [6144, 256, 36, 256, 6144, 6144, 6144, 256] + - [2, 23168.0] + - - [6336, 256, 36, 256, 6336, 6336, 6336, 256] + - [9, 22911.0] + - - [13600, 512, 2, 128, 13600, 13600, 13600, 512] + - [9, 20575.0] + - - [15200, 512, 2, 128, 15200, 15200, 15200, 512] + - [2, 21208.0] + - - [15200, 128, 2, 512, 15200, 15200, 15200, 128] + - [9, 21363.0] + - - [13600, 128, 2, 512, 13600, 13600, 13600, 128] + - [2, 20762.0] + - - [5632, 256, 36, 256, 5632, 5632, 5632, 256] + - [14, 23163.0] + - - [12288, 128, 2, 512, 12288, 12288, 12288, 128] + - [2, 21076.0] + - - [12880, 128, 2, 512, 12880, 12880, 12880, 128] + - [2, 19681.0] + - - [11408, 128, 2, 512, 11408, 11408, 11408, 128] + - [2, 20681.0] + - - [13824, 512, 2, 128, 13824, 13824, 13824, 512] + - [24, 21418.0] + - - [13824, 128, 2, 512, 13824, 13824, 13824, 128] + - [24, 21931.0] + - - [10560, 128, 2, 512, 10560, 10560, 10560, 128] + - [29, 19364.0] + - - [10752, 128, 2, 512, 10752, 10752, 10752, 128] + - [29, 20811.0] + - - [13600, 512, 2, 256, 13600, 13600, 13600, 512] + - [2, 22316.0] + - - [15200, 512, 2, 256, 15200, 15200, 15200, 512] + - [9, 22345.0] + - - [768, 2048, 2, 512, 768, 768, 768, 2048] + - [2, 21418.0] + - - [12880, 512, 2, 128, 12880, 12880, 12880, 512] + - [29, 20291.0] + - - [11616, 128, 2, 512, 11616, 11616, 11616, 128] + - [29, 19258.0] + - - [14208, 512, 2, 128, 14208, 14208, 14208, 512] + - [19, 21524.0] + - - [11408, 512, 2, 128, 11408, 11408, 11408, 512] + - [29, 20103.0] + - - [6912, 256, 36, 256, 6912, 6912, 6912, 256] + - [29, 23259.0] + - - [13824, 512, 2, 256, 13824, 13824, 13824, 512] + - [19, 22389.0] + - - [11616, 512, 2, 128, 11616, 11616, 11616, 512] + - [24, 20420.0] + - - [12288, 512, 2, 128, 12288, 12288, 12288, 512] + - [2, 21395.0] + - - [14208, 128, 2, 512, 14208, 14208, 14208, 128] + - [2, 21024.0] + - - [11968, 128, 2, 512, 11968, 11968, 11968, 128] + - [2, 19958.0] + - - [864, 2048, 2, 512, 864, 864, 864, 2048] + - [2, 19947.0] + - - [10560, 512, 2, 128, 10560, 10560, 10560, 512] + - [5, 20159.0] + - - [672, 2048, 2, 512, 672, 672, 672, 2048] + - [14, 18307.0] + - - [9408, 128, 2, 512, 9408, 9408, 9408, 128] + - [2, 19041.0] + - - [10752, 512, 2, 128, 10752, 10752, 10752, 512] + - [19, 20266.0] + - - [11968, 512, 2, 128, 11968, 11968, 11968, 512] + - [29, 19463.0] + - - [1240, 256, 49, 256, 1240, 1240, 1240, 256] + - [14, 21459.0] + - - [4032, 256, 2, 1024, 4032, 4032, 4032, 256] + - [2, 19508.0] + - - [888, 2048, 2, 512, 888, 888, 888, 2048] + - [2, 20532.0] + - - [12880, 512, 2, 256, 12880, 12880, 12880, 512] + - [19, 21316.0] + - - [12288, 512, 2, 256, 12288, 12288, 12288, 512] + - [14, 21889.0] + - - [13440, 128, 2, 512, 13440, 13440, 13440, 128] + - [9, 20696.0] + - - [864, 2048, 2, 256, 864, 864, 864, 2048] + - [9, 17778.0] + - - [12672, 128, 2, 512, 12672, 12672, 12672, 128] + - [14, 21094.0] + - - [11264, 128, 2, 512, 11264, 11264, 11264, 128] + - [2, 20912.0] + - - [11776, 128, 2, 512, 11776, 11776, 11776, 128] + - [2, 19809.0] + - - [16128, 128, 2, 512, 16128, 16128, 16128, 128] + - [29, 21237.0] + - - [4032, 1024, 2, 256, 4032, 4032, 4032, 1024] + - [2, 20656.0] + - - [14000, 128, 2, 512, 14000, 14000, 14000, 128] + - [2, 20335.0] + - - [13440, 512, 2, 128, 13440, 13440, 13440, 512] + - [29, 20774.0] + - - [768, 2048, 2, 256, 768, 768, 768, 2048] + - [24, 19119.0] + - - [3264, 1024, 2, 256, 3264, 3264, 3264, 1024] + - [2, 20514.0] + - - [4200, 256, 2, 1024, 4200, 4200, 4200, 256] + - [24, 19608.0] + - - [2352, 1024, 2, 256, 2352, 2352, 2352, 1024] + - [2, 18769.0] + - - [2400, 1024, 2, 256, 2400, 2400, 2400, 1024] + - [9, 19152.0] + - - [15200, 256, 2, 12, 15200, 15200, 15200, 256] + - [8, 4288.0] + - - [12880, 256, 2, 12, 12880, 12880, 12880, 256] + - [23, 4310.0] + - - [2520, 1024, 2, 256, 2520, 2520, 2520, 1024] + - [24, 19976.0] + - - [13600, 256, 2, 12, 13600, 13600, 13600, 256] + - [1, 4536.0] + - - [15200, 256, 2, 3, 15200, 15200, 15200, 256] + - [1, 1205.0] + - - [12880, 256, 2, 3, 12880, 12880, 12880, 256] + - [8, 1136.0] + - - [4200, 1024, 2, 256, 4200, 4200, 4200, 1024] + - [2, 20692.0] + - - [12288, 256, 2, 12, 12288, 12288, 12288, 256] + - [23, 4410.0] + - - [13824, 256, 2, 12, 13824, 13824, 13824, 256] + - [20, 4973.0] + - - [13600, 256, 2, 3, 13600, 13600, 13600, 256] + - [1, 1163.0] + - - [7600, 512, 1, 256, 7600, 7600, 7600, 512] + - [2, 18774.0] + - - [6144, 512, 1, 256, 6144, 6144, 6144, 512] + - [14, 19011.0] + - - [12544, 1024, 1, 1024, 12544, 12544, 12544, 1024] + - [14, 23414.0] + - - [3800, 256, 2, 3, 3800, 3800, 3800, 256] + - [1, 829.0] + - - [13824, 256, 2, 3, 13824, 13824, 13824, 256] + - [31, 1017.0] + - - [12288, 256, 2, 3, 12288, 12288, 12288, 256] + - [8, 917.0] + - - [2688, 256, 2, 1024, 2688, 2688, 2688, 256] + - [2, 18432.0] + - - [3072, 256, 2, 12, 3072, 3072, 3072, 256] + - [15, 3457.0] + - - [3800, 256, 2, 12, 3800, 3800, 3800, 256] + - [1, 2417.0] + - - [3072, 256, 2, 3, 3072, 3072, 3072, 256] + - [27, 605.0] + - - [2520, 256, 2, 1024, 2520, 2520, 2520, 256] + - [29, 19820.0] + - - [16128, 512, 2, 128, 16128, 16128, 16128, 512] + - [14, 20852.0] + - - [2400, 256, 2, 1024, 2400, 2400, 2400, 256] + - [14, 18831.0] + - - [2352, 256, 2, 1024, 2352, 2352, 2352, 256] + - [27, 18070.0] + - - [2944, 256, 2, 1024, 2944, 2944, 2944, 256] + - [9, 19518.0] + - - [2992, 1024, 2, 256, 2992, 2992, 2992, 1024] + - [5, 19643.0] + - - [2816, 256, 2, 1024, 2816, 2816, 2816, 256] + - [2, 19021.0] + - - [2904, 1024, 2, 256, 2904, 2904, 2904, 1024] + - [2, 19850.0] + - - [3456, 256, 2, 3, 3456, 3456, 3456, 256] + - [1, 665.0] + - - [3400, 256, 2, 3, 3400, 3400, 3400, 256] + - [23, 610.0] + - - [2816, 1024, 2, 256, 2816, 2816, 2816, 1024] + - [29, 20719.0] + - - [3456, 256, 2, 12, 3456, 3456, 3456, 256] + - [25, 2469.0] + - - [2944, 1024, 2, 256, 2944, 2944, 2944, 1024] + - [14, 20718.0] + - - [3168, 256, 2, 1024, 3168, 3168, 3168, 256] + - [9, 20633.0] + - - [2992, 256, 2, 1024, 2992, 2992, 2992, 256] + - [9, 19560.0] + - - [51520, 256, 2, 12, 51520, 51520, 51520, 256] + - [30, 7957.0] + - - [3072, 256, 2, 1024, 3072, 3072, 3072, 256] + - [14, 19570.0] + - - [2640, 1024, 2, 256, 2640, 2640, 2640, 1024] + - [2, 19853.0] + - - [2688, 1024, 2, 256, 2688, 2688, 2688, 1024] + - [29, 20780.0] + - - [2904, 256, 2, 1024, 2904, 2904, 2904, 256] + - [19, 18631.0] + - - [3264, 256, 2, 1024, 3264, 3264, 3264, 256] + - [14, 18132.0] + - - [54400, 256, 2, 12, 54400, 54400, 54400, 256] + - [23, 8870.0] + - - [55296, 256, 2, 3, 55296, 55296, 55296, 256] + - [23, 2117.0] + - - [60800, 256, 2, 12, 60800, 60800, 60800, 256] + - [18, 8432.0] + - - [51520, 256, 2, 3, 51520, 51520, 51520, 256] + - [22, 2257.0] + - - [55296, 256, 2, 12, 55296, 55296, 55296, 256] + - [15, 7875.0] + - - [3600, 1024, 2, 256, 3600, 3600, 3600, 1024] + - [24, 19897.0] + - - [60800, 256, 2, 3, 60800, 60800, 60800, 256] + - [13, 2640.0] + - - [952, 256, 64, 256, 952, 952, 952, 256] + - [29, 20536.0] + - - [49152, 256, 2, 12, 49152, 49152, 49152, 256] + - [28, 6161.0] + - - [3360, 256, 2, 1024, 3360, 3360, 3360, 256] + - [2, 18983.0] + - - [736, 256, 64, 256, 736, 736, 736, 256] + - [2, 20977.0] + - - [600, 256, 64, 256, 600, 600, 600, 256] + - [2, 20357.0] + - - [1440, 256, 49, 256, 1440, 1440, 1440, 256] + - [24, 21109.0] + - - [3168, 1024, 2, 256, 3168, 3168, 3168, 1024] + - [2, 21376.0] + - - [1368, 256, 49, 256, 1368, 1368, 1368, 256] + - [14, 21634.0] + - - [49152, 256, 2, 3, 49152, 49152, 49152, 256] + - [13, 1610.0] + - - [3600, 256, 2, 1024, 3600, 3600, 3600, 256] + - [2, 20414.0] + - - [3360, 1024, 2, 256, 3360, 3360, 3360, 1024] + - [9, 20843.0] + - - [54400, 256, 2, 3, 54400, 54400, 54400, 256] + - [23, 2139.0] + - - [3072, 1024, 2, 256, 3072, 3072, 3072, 1024] + - [2, 21406.0] + - - [2640, 256, 2, 1024, 2640, 2640, 2640, 256] + - [2, 17596.0] + - - [616, 256, 64, 256, 616, 616, 616, 256] + - [24, 20965.0] + - - [3008, 256, 64, 256, 3008, 3008, 3008, 256] + - [9, 22478.0] + - - [896, 256, 64, 256, 896, 896, 896, 256] + - [14, 22574.0] + - - [768, 256, 64, 256, 768, 768, 768, 256] + - [14, 22488.0] + - - [3552, 256, 2, 1024, 3552, 3552, 3552, 256] + - [2, 20291.0] + - - [3552, 1024, 2, 256, 3552, 3552, 3552, 1024] + - [2, 21356.0] + - - [800, 256, 64, 256, 800, 800, 800, 256] + - [24, 19923.0] + - - [1120, 256, 49, 256, 1120, 1120, 1120, 256] + - [2, 21568.0] + - - [2408, 256, 64, 256, 2408, 2408, 2408, 256] + - [24, 22652.0] + - - [3456, 256, 2, 1024, 3456, 3456, 3456, 256] + - [2, 19785.0] + - - [672, 256, 64, 256, 672, 672, 672, 256] + - [2, 19266.0] + - - [3456, 1024, 2, 256, 3456, 3456, 3456, 1024] + - [24, 22027.0] + - - [1064, 256, 49, 256, 1064, 1064, 1064, 256] + - [19, 20487.0] + - - [3400, 256, 2, 1024, 3400, 3400, 3400, 256] + - [9, 19275.0] + - - [704, 256, 64, 256, 704, 704, 704, 256] + - [2, 20082.0] + - - [3400, 1024, 2, 256, 3400, 3400, 3400, 1024] + - [16, 21121.0] + - - [3264, 256, 64, 256, 3264, 3264, 3264, 256] + - [2, 22614.0] + - - [3800, 1024, 2, 256, 3800, 3800, 3800, 1024] + - [2, 21455.0] + - - [3800, 256, 2, 1024, 3800, 3800, 3800, 256] + - [2, 21520.0] + - - [6440, 512, 1, 256, 6440, 6440, 6440, 512] + - [2, 18867.0] + - - [6912, 512, 1, 256, 6912, 6912, 6912, 512] + - [2, 20991.0] + - - [6800, 512, 1, 256, 6800, 6800, 6800, 512] + - [2, 19693.0] + - - [6800, 512, 1, 1024, 6800, 6800, 6800, 512] + - [29, 21310.0] + - - [6440, 512, 1, 1024, 6440, 6440, 6440, 512] + - [14, 20141.0] + - - [6912, 512, 1, 1024, 6912, 6912, 6912, 512] + - [24, 22022.0] + - - [1728, 1024, 1, 512, 1728, 1728, 1728, 1024] + - [4, 17681.0] + - - [1536, 1024, 1, 512, 1536, 1536, 1536, 1024] + - [19, 18685.0] + - - [7600, 512, 1, 1024, 7600, 7600, 7600, 512] + - [24, 21812.0] + - - [6144, 512, 1, 1024, 6144, 6144, 6144, 512] + - [29, 21458.0] + - - [1728, 1024, 1, 2048, 1728, 1728, 1728, 1024] + - [14, 19521.0] + - - [1536, 1024, 1, 2048, 1536, 1536, 1536, 1024] + - [14, 20790.0] + - - [4524, 256, 49, 256, 4524, 4524, 4524, 256] + - [29, 22611.0] + - - [2666, 256, 64, 256, 2666, 2666, 2666, 256] + - [2, 22721.0] + - - [950, 2048, 2, 512, 950, 950, 950, 2048] + - [2, 19987.0] + - - [3220, 1024, 2, 256, 3220, 3220, 3220, 1024] + - [19, 20827.0] + - - [782, 128, 64, 128, 782, 782, 782, 128] + - [9, 17299.0] + - - [850, 2048, 2, 512, 850, 850, 850, 2048] + - [24, 19593.0] + - - [805, 2048, 2, 512, 805, 805, 805, 2048] + - [2, 18720.0] + - - [713, 2048, 2, 512, 713, 713, 713, 2048] + - [9, 19334.0] + - - [660, 2048, 2, 512, 660, 660, 660, 2048] + - [14, 18022.0] + - - [726, 2048, 2, 512, 726, 726, 726, 2048] + - [2, 19630.0] + - - [805, 2048, 2, 256, 805, 805, 805, 2048] + - [29, 17505.0] + - - [1251, 256, 49, 256, 1251, 1251, 1251, 256] + - [2, 21582.0] + - - [1900, 1024, 1, 2048, 1900, 1900, 1900, 1024] + - [2, 21703.0] + - - [1610, 1024, 1, 2048, 1610, 1610, 1610, 1024] + - [2, 18463.0] + - - [1900, 1024, 1, 512, 1900, 1900, 1900, 1024] + - [6, 19411.0] + - - [3220, 256, 2, 12, 3220, 3220, 3220, 256] + - [15, 1910.0] + - - [3220, 256, 2, 3, 3220, 3220, 3220, 256] + - [3, 565.0] + - - [3036, 1024, 2, 256, 3036, 3036, 3036, 1024] + - [2, 20200.0] + - - [3036, 256, 2, 1024, 3036, 3036, 3036, 256] + - [9, 19229.0] + - - [850, 2048, 2, 256, 850, 850, 850, 2048] + - [30, 17594.0] + - - [2852, 1024, 2, 256, 2852, 2852, 2852, 1024] + - [2, 20422.0] + - - [950, 2048, 2, 256, 950, 950, 950, 2048] + - [12, 19105.0] + - - [3700, 1024, 2, 256, 3700, 3700, 3700, 1024] + - [14, 21099.0] + - - [2852, 256, 2, 1024, 2852, 2852, 2852, 256] + - [29, 18728.0] + - - [3700, 256, 2, 1024, 3700, 3700, 3700, 256] + - [9, 20615.0] + - - [1269, 256, 49, 256, 1269, 1269, 1269, 256] + - [24, 21961.0] + - - [1467, 256, 49, 256, 1467, 1467, 1467, 256] + - [14, 21413.0] + - - [3500, 256, 2, 1024, 3500, 3500, 3500, 256] + - [5, 19463.0] + - - [1449, 256, 49, 256, 1449, 1449, 1449, 256] + - [9, 21133.0] + - - [1278, 256, 49, 256, 1278, 1278, 1278, 256] + - [2, 21935.0] + - - [1413, 256, 49, 256, 1413, 1413, 1413, 256] + - [24, 20712.0] + - - [1341, 256, 49, 256, 1341, 1341, 1341, 256] + - [24, 21527.0] + - - [1287, 256, 49, 256, 1287, 1287, 1287, 256] + - [19, 20359.0] + - - [1332, 256, 49, 256, 1332, 1332, 1332, 256] + - [2, 21177.0] + - - [1359, 256, 49, 256, 1359, 1359, 1359, 256] + - [2, 21622.0] + - - [1395, 256, 49, 256, 1395, 1395, 1395, 256] + - [24, 22046.0] + - - [1323, 256, 49, 256, 1323, 1323, 1323, 256] + - [2, 20997.0] + - - [1404, 256, 49, 256, 1404, 1404, 1404, 256] + - [2, 22182.0] + - - [1386, 256, 49, 256, 1386, 1386, 1386, 256] + - [2, 21938.0] + - - [1350, 256, 49, 256, 1350, 1350, 1350, 256] + - [2, 21371.0] + - - [3500, 1024, 2, 256, 3500, 3500, 3500, 1024] + - [9, 20914.0] + - - [3220, 256, 2, 1024, 3220, 3220, 3220, 256] + - [14, 17827.0] + - - [690, 256, 64, 256, 690, 690, 690, 256] + - [2, 19866.0] + - - [660, 256, 64, 256, 660, 660, 660, 256] + - [2, 18891.0] + - - [782, 256, 64, 256, 782, 782, 782, 256] + - [14, 19484.0] + - - [884, 256, 64, 256, 884, 884, 884, 256] + - [2, 21960.0] + - - [1610, 1024, 1, 512, 1610, 1610, 1610, 1024] + - [2, 17368.0] + - - [1700, 1024, 1, 512, 1700, 1700, 1700, 1024] + - [2, 18377.0] + - - [1700, 1024, 1, 2048, 1700, 1700, 1700, 1024] + - [9, 19615.0] + - - [1444, 128, 120, 256, 1444, 1444, 1444, 128] + - [19, 20994.0] + - - [1444, 128, 18, 256, 1444, 1444, 1444, 128] + - [2, 18621.0] + - - [1444, 128, 19, 256, 1444, 1444, 1444, 128] + - [2, 18943.0] + - - [1444, 256, 120, 256, 1444, 1444, 1444, 256] + - [19, 21496.0] + - - [1444, 256, 18, 256, 1444, 1444, 1444, 256] + - [24, 19890.0] + - - [1444, 256, 19, 256, 1444, 1444, 1444, 256] + - [24, 20312.0] + - - [361, 512, 120, 256, 361, 361, 361, 512] + - [9, 20832.0] + - - [361, 512, 18, 256, 361, 361, 361, 512] + - [10, 18037.0] + - - [361, 512, 19, 256, 361, 361, 361, 512] + - [14, 18567.0] + - - [1920, 25216, 1, 16384, 1920, 1920, 1920, 25216] + - [9, 24010.0] + - - [3840, 1920, 1, 16384, 3840, 3840, 3840, 1920] + - [9, 23464.0] + - - [1920, 3840, 1, 16384, 1920, 1920, 1920, 3840] + - [29, 23447.0] + - - [960, 1920, 1, 16384, 960, 960, 960, 1920] + - [9, 21188.0] + - - [1920, 2880, 1, 16384, 1920, 1920, 1920, 2880] + - [29, 22370.0] + - - [1920, 25216, 1, 4096, 1920, 1920, 1920, 25216] + - [24, 24137.0] + - - [3840, 1920, 1, 4096, 3840, 3840, 3840, 1920] + - [2, 23288.0] + - - [1920, 3840, 1, 4096, 1920, 1920, 1920, 3840] + - [2, 23348.0] + - - [960, 1920, 1, 4096, 960, 960, 960, 1920] + - [14, 20708.0] + - - [1920, 2880, 1, 4096, 1920, 1920, 1920, 2880] + - [29, 22169.0] + - - [1920, 25216, 1, 8192, 1920, 1920, 1920, 25216] + - [24, 23866.0] + - - [3840, 1920, 1, 8192, 3840, 3840, 3840, 1920] + - [2, 23412.0] + - - [1920, 3840, 1, 8192, 1920, 1920, 1920, 3840] + - [29, 23428.0] + - - [960, 1920, 1, 8192, 960, 960, 960, 1920] + - [2, 21295.0] + - - [1920, 2880, 1, 8192, 1920, 1920, 1920, 2880] + - [14, 22021.0] + - - [2304, 12672, 1, 16384, 2304, 2304, 2304, 12672] + - [2, 23580.0] + - - [2304, 2304, 1, 16384, 2304, 2304, 2304, 2304] + - [29, 22806.0] + - - [576, 2304, 1, 16384, 576, 576, 576, 2304] + - [2, 18426.0] + - - [2304, 1728, 1, 16384, 2304, 2304, 2304, 1728] + - [24, 22080.0] + - - [2304, 12672, 1, 4096, 2304, 2304, 2304, 12672] + - [9, 23999.0] + - - [2304, 2304, 1, 4096, 2304, 2304, 2304, 2304] + - [9, 22678.0] + - - [576, 2304, 1, 4096, 576, 576, 576, 2304] + - [14, 17789.0] + - - [2304, 1728, 1, 4096, 2304, 2304, 2304, 1728] + - [29, 21766.0] + - - [2304, 12672, 1, 8192, 2304, 2304, 2304, 12672] + - [29, 24027.0] + - - [2304, 2304, 1, 8192, 2304, 2304, 2304, 2304] + - [2, 22779.0] + - - [576, 2304, 1, 8192, 576, 576, 576, 2304] + - [24, 18184.0] + - - [2304, 1728, 1, 8192, 2304, 2304, 2304, 1728] + - [19, 21905.0] + - - [3072, 6400, 1, 4096, 3072, 3072, 3072, 6400] + - [14, 24009.0] + - - [1536, 3072, 1, 4096, 1536, 1536, 1536, 3072] + - [2, 22743.0] + - - [3072, 1536, 1, 4096, 3072, 3072, 3072, 1536] + - [14, 22556.0] + - - [384, 3072, 1, 4096, 384, 384, 384, 3072] + - [29, 19667.0] + - - [3072, 1152, 1, 4096, 3072, 3072, 3072, 1152] + - [29, 22712.0] + - - [3072, 6400, 1, 8192, 3072, 3072, 3072, 6400] + - [9, 24033.0] + - - [1536, 3072, 1, 8192, 1536, 1536, 1536, 3072] + - [2, 22810.0] + - - [3072, 1536, 1, 8192, 3072, 3072, 3072, 1536] + - [2, 22812.0] + - - [384, 3072, 1, 8192, 384, 384, 384, 3072] + - [14, 20024.0] + - - [3072, 1152, 1, 8192, 3072, 3072, 3072, 1152] + - [2, 23047.0] + - - [2048, 2048, 1, 4096, 2048, 2048, 2048, 2048] + - [2, 22923.0] + - - [2048, 2048, 1, 8, 2048, 2048, 2048, 2048] + - [0, 2980.0] + - - [2048, 29000, 1, 199, 2048, 2048, 2048, 29000] + - [19, 22083.0] + - - [2048, 29000, 1, 221, 2048, 2048, 2048, 29000] + - [14, 22260.0] + - - [2048, 29000, 1, 224, 2048, 2048, 2048, 29000] + - [14, 23054.0] + - - [2048, 29000, 1, 229, 2048, 2048, 2048, 29000] + - [14, 22374.0] + - - [2048, 29000, 1, 234, 2048, 2048, 2048, 29000] + - [14, 22490.0] + - - [2048, 29000, 1, 242, 2048, 2048, 2048, 29000] + - [2, 22605.0] + - - [2048, 29000, 1, 246, 2048, 2048, 2048, 29000] + - [9, 22570.0] + - - [2048, 29000, 1, 247, 2048, 2048, 2048, 29000] + - [19, 22505.0] + - - [2048, 29000, 1, 256, 2048, 2048, 2048, 29000] + - [14, 23147.0] + - - [2048, 29000, 1, 262, 2048, 2048, 2048, 29000] + - [14, 22736.0] + - - [2048, 29000, 1, 264, 2048, 2048, 2048, 29000] + - [7, 22819.0] + - - [2048, 29000, 1, 265, 2048, 2048, 2048, 29000] + - [19, 22532.0] + - - [2048, 29000, 1, 274, 2048, 2048, 2048, 29000] + - [9, 22763.0] + - - [2048, 29000, 1, 277, 2048, 2048, 2048, 29000] + - [9, 22689.0] + - - [2048, 29000, 1, 279, 2048, 2048, 2048, 29000] + - [2, 22697.0] + - - [2048, 29000, 1, 288, 2048, 2048, 2048, 29000] + - [14, 23245.0] + - - [2048, 29000, 1, 296, 2048, 2048, 2048, 29000] + - [7, 23017.0] + - - [2048, 29000, 1, 315, 2048, 2048, 2048, 29000] + - [14, 22813.0] + - - [2048, 29000, 1, 335, 2048, 2048, 2048, 29000] + - [14, 22873.0] + - - [2048, 4096, 1, 4096, 2048, 2048, 2048, 4096] + - [2, 23518.0] + - - [4096, 2048, 1, 4096, 4096, 4096, 4096, 2048] + - [2, 23474.0] + - - [1024, 29000, 1, 2283, 1024, 1024, 1024, 29000] + - [24, 23816.0] + - - [1024, 29000, 1, 2296, 1024, 1024, 1024, 29000] + - [14, 23838.0] + - - [1024, 29000, 1, 2306, 1024, 1024, 1024, 29000] + - [9, 23834.0] + - - [1024, 29000, 1, 2309, 1024, 1024, 1024, 29000] + - [14, 23830.0] + - - [1024, 29000, 1, 2318, 1024, 1024, 1024, 29000] + - [14, 23818.0] + - - [1024, 29000, 1, 2320, 1024, 1024, 1024, 29000] + - [29, 23884.0] + - - [1024, 29000, 1, 2324, 1024, 1024, 1024, 29000] + - [14, 23828.0] + - - [1024, 29000, 1, 2325, 1024, 1024, 1024, 29000] + - [14, 23825.0] + - - [1024, 29000, 1, 2329, 1024, 1024, 1024, 29000] + - [24, 23829.0] + - - [1024, 29000, 1, 2338, 1024, 1024, 1024, 29000] + - [19, 23818.0] + - - [1024, 29000, 1, 2345, 1024, 1024, 1024, 29000] + - [2, 23813.0] + - - [1024, 29000, 1, 2350, 1024, 1024, 1024, 29000] + - [14, 23835.0] + - - [1024, 29000, 1, 2362, 1024, 1024, 1024, 29000] + - [24, 23840.0] + - - [1024, 29000, 1, 2366, 1024, 1024, 1024, 29000] + - [14, 23827.0] + - - [1024, 29000, 1, 2368, 1024, 1024, 1024, 29000] + - [14, 23895.0] + - - [1024, 29000, 1, 2374, 1024, 1024, 1024, 29000] + - [14, 23832.0] + - - [1024, 29000, 1, 2390, 1024, 1024, 1024, 29000] + - [24, 23835.0] + - - [1024, 29000, 1, 561, 1024, 1024, 1024, 29000] + - [29, 23209.0] + - - [1024, 29000, 1, 574, 1024, 1024, 1024, 29000] + - [19, 23275.0] + - - [1024, 29000, 1, 600, 1024, 1024, 1024, 29000] + - [19, 23325.0] + - - [1024, 29000, 1, 608, 1024, 1024, 1024, 29000] + - [9, 23524.0] + - - [1024, 29000, 1, 615, 1024, 1024, 1024, 29000] + - [14, 23304.0] + - - [1024, 29000, 1, 622, 1024, 1024, 1024, 29000] + - [19, 23309.0] + - - [1024, 29000, 1, 625, 1024, 1024, 1024, 29000] + - [14, 23321.0] + - - [1024, 29000, 1, 626, 1024, 1024, 1024, 29000] + - [14, 23340.0] + - - [1024, 29000, 1, 628, 1024, 1024, 1024, 29000] + - [14, 23344.0] + - - [1024, 29000, 1, 636, 1024, 1024, 1024, 29000] + - [9, 23335.0] + - - [1024, 29000, 1, 651, 1024, 1024, 1024, 29000] + - [2, 23316.0] + - - [1024, 29000, 1, 658, 1024, 1024, 1024, 29000] + - [14, 23402.0] + - - [1024, 29000, 1, 669, 1024, 1024, 1024, 29000] + - [14, 23347.0] + - - [1024, 29000, 1, 670, 1024, 1024, 1024, 29000] + - [14, 23360.0] + - - [1024, 29000, 1, 672, 1024, 1024, 1024, 29000] + - [14, 23576.0] + - - [1024, 29000, 1, 684, 1024, 1024, 1024, 29000] + - [14, 23370.0] + - - [1024, 29000, 1, 716, 1024, 1024, 1024, 29000] + - [14, 23408.0] + - - [1024, 29000, 1, 730, 1024, 1024, 1024, 29000] + - [19, 23419.0] + - - [2560, 2560, 1, 1024, 2560, 2560, 2560, 2560] + - [14, 23268.0] + - - [2560, 2560, 1, 2, 2560, 2560, 2560, 2560] + - [12, 602.0] + - - [2560, 29000, 1, 109, 2560, 2560, 2560, 29000] + - [19, 20354.0] + - - [2560, 29000, 1, 121, 2560, 2560, 2560, 29000] + - [14, 20839.0] + - - [2560, 29000, 1, 27, 2560, 2560, 2560, 29000] + - [0, 8999.0] + - - [2560, 29000, 1, 35, 2560, 2560, 2560, 29000] + - [27, 11181.0] + - - [2560, 29000, 1, 36, 2560, 2560, 2560, 29000] + - [31, 11426.0] + - - [2560, 29000, 1, 39, 2560, 2560, 2560, 29000] + - [27, 12247.0] + - - [2560, 29000, 1, 40, 2560, 2560, 2560, 29000] + - [27, 12551.0] + - - [2560, 29000, 1, 42, 2560, 2560, 2560, 29000] + - [27, 12933.0] + - - [2560, 29000, 1, 43, 2560, 2560, 2560, 29000] + - [17, 13173.0] + - - [2560, 29000, 1, 44, 2560, 2560, 2560, 29000] + - [27, 13444.0] + - - [2560, 29000, 1, 46, 2560, 2560, 2560, 29000] + - [15, 13730.0] + - - [2560, 29000, 1, 48, 2560, 2560, 2560, 29000] + - [25, 14423.0] + - - [2560, 29000, 1, 49, 2560, 2560, 2560, 29000] + - [13, 14560.0] + - - [2560, 29000, 1, 50, 2560, 2560, 2560, 29000] + - [18, 14827.0] + - - [2560, 29000, 1, 51, 2560, 2560, 2560, 29000] + - [5, 14843.0] + - - [2560, 29000, 1, 53, 2560, 2560, 2560, 29000] + - [6, 15377.0] + - - [2560, 29000, 1, 54, 2560, 2560, 2560, 29000] + - [18, 15549.0] + - - [2560, 29000, 1, 55, 2560, 2560, 2560, 29000] + - [6, 15621.0] + - - [2560, 29000, 1, 56, 2560, 2560, 2560, 29000] + - [6, 15810.0] + - - [2560, 29000, 1, 57, 2560, 2560, 2560, 29000] + - [3, 15866.0] + - - [2560, 29000, 1, 58, 2560, 2560, 2560, 29000] + - [3, 16132.0] + - - [2560, 29000, 1, 59, 2560, 2560, 2560, 29000] + - [11, 16045.0] + - - [2560, 29000, 1, 61, 2560, 2560, 2560, 29000] + - [18, 16281.0] + - - [2560, 29000, 1, 63, 2560, 2560, 2560, 29000] + - [3, 16505.0] + - - [2560, 29000, 1, 65, 2560, 2560, 2560, 29000] + - [18, 17748.0] + - - [2560, 29000, 1, 66, 2560, 2560, 2560, 29000] + - [18, 17947.0] + - - [2560, 29000, 1, 67, 2560, 2560, 2560, 29000] + - [13, 17804.0] + - - [2560, 29000, 1, 69, 2560, 2560, 2560, 29000] + - [13, 17931.0] + - - [2560, 29000, 1, 70, 2560, 2560, 2560, 29000] + - [13, 18095.0] + - - [2560, 29000, 1, 71, 2560, 2560, 2560, 29000] + - [6, 17898.0] + - - [2560, 29000, 1, 73, 2560, 2560, 2560, 29000] + - [6, 17972.0] + - - [2560, 29000, 1, 74, 2560, 2560, 2560, 29000] + - [6, 18184.0] + - - [2560, 29000, 1, 75, 2560, 2560, 2560, 29000] + - [6, 17945.0] + - - [2560, 29000, 1, 77, 2560, 2560, 2560, 29000] + - [6, 17932.0] + - - [2560, 29000, 1, 78, 2560, 2560, 2560, 29000] + - [6, 18196.0] + - - [2560, 29000, 1, 80, 2560, 2560, 2560, 29000] + - [2, 20258.0] + - - [2560, 29000, 1, 81, 2560, 2560, 2560, 29000] + - [14, 19024.0] + - - [2560, 29000, 1, 82, 2560, 2560, 2560, 29000] + - [14, 19191.0] + - - [2560, 29000, 1, 83, 2560, 2560, 2560, 29000] + - [14, 19192.0] + - - [2560, 29000, 1, 84, 2560, 2560, 2560, 29000] + - [14, 19307.0] + - - [2560, 29000, 1, 88, 2560, 2560, 2560, 29000] + - [14, 19601.0] + - - [2560, 29000, 1, 89, 2560, 2560, 2560, 29000] + - [14, 19435.0] + - - [2560, 29000, 1, 90, 2560, 2560, 2560, 29000] + - [19, 19632.0] + - - [2560, 29000, 1, 92, 2560, 2560, 2560, 29000] + - [14, 19732.0] + - - [2560, 29000, 1, 95, 2560, 2560, 2560, 29000] + - [14, 19692.0] + - - [2560, 29000, 1, 98, 2560, 2560, 2560, 29000] + - [9, 20342.0] + - - [2560, 4096, 1, 1024, 2560, 2560, 2560, 4096] + - [9, 23622.0] + - - [4096, 2560, 1, 1024, 4096, 4096, 4096, 2560] + - [19, 23716.0] + - - [1024, 3072, 1, 32768, 1024, 1024, 1024, 3072] + - [19, 22573.0] + - - [1024, 4096, 1, 32768, 1024, 1024, 1024, 4096] + - [2, 23320.0] + - - [1024, 50304, 1, 32768, 1024, 1024, 1024, 50304] + - [2, 24005.0] + - - [4096, 1024, 1, 32768, 4096, 4096, 4096, 1024] + - [29, 23300.0] + - - [1024, 128, 24, 1024, 1024, 1024, 1024, 128] + - [14, 21709.0] + - - [128, 1024, 24, 1024, 128, 128, 128, 1024] + - [24, 21821.0] + - - [256, 2560, 1, 8976, 256, 256, 256, 2560] + - [32, 21113.0] + - - [256, 2816, 1, 8976, 256, 256, 256, 2816] + - [32, 21155.0] + - - [256, 3328, 1, 8976, 256, 256, 256, 3328] + - [36, 20565.0] + - - [256, 3584, 1, 8976, 256, 256, 256, 3584] + - [35, 20433.0] + - - [256, 3840, 1, 8976, 256, 256, 256, 3840] + - [2, 21395.0] + - - [256, 4096, 1, 8976, 256, 256, 256, 4096] + - [35, 20988.0] + - - [256, 4352, 1, 8976, 256, 256, 256, 4352] + - [35, 21169.0] + - - [480, 1024, 1, 32768, 480, 480, 480, 1024] + - [32, 19284.0] + - - [1024, 256, 1, 21248, 1024, 1024, 1024, 256] + - [35, 17325.0] + - - [1024, 256, 1, 21504, 1024, 1024, 1024, 256] + - [32, 17145.0] + - - [1024, 256, 1, 22016, 1024, 1024, 1024, 256] + - [35, 17105.0] + - - [1024, 256, 1, 28672, 1024, 1024, 1024, 256] + - [33, 16900.0] + - - [1024, 256, 1, 33536, 1024, 1024, 1024, 256] + - [32, 17524.0] + - - [1024, 512, 1, 32768, 1024, 1024, 1024, 512] + - [32, 20365.0] + - - [1024, 1024, 1, 32768, 1024, 1024, 1024, 1024] + - [36, 22591.0] + - - [1024, 1024, 1, 9216, 1024, 1024, 1024, 1024] + - [32, 21778.0] + - - [1024, 1024, 1, 9520, 1024, 1024, 1024, 1024] + - [32, 21670.0] + - - [1024, 1024, 1, 10064, 1024, 1024, 1024, 1024] + - [32, 21826.0] + - - [1024, 1024, 1, 10080, 1024, 1024, 1024, 1024] + - [32, 21781.0] + - - [1024, 1024, 1, 10200, 1024, 1024, 1024, 1024] + - [32, 21926.0] + - - [479, 1024, 1, 32768, 479, 479, 479, 1024] + - [32, 19011.0] + - - [1024, 1024, 1, 8192, 1024, 1024, 1024, 1024] + - [32, 21453.0] + - - [1024, 1024, 1, 9600, 1024, 1024, 1024, 1024] + - [32, 21743.0] + - - [1024, 1024, 1, 16384, 1024, 1024, 1024, 1024] + - [32, 22184.0] + - - [512, 256, 1, 55296, 512, 512, 512, 256] + - [34, 15439.0] + - - [1024, 1024, 1, 10240, 1024, 1024, 1024, 1024] + - [32, 21779.0] + - - [1024, 1024, 1, 10496, 1024, 1024, 1024, 1024] + - [32, 21791.0] + - - [1024, 1024, 1, 10224, 1024, 1024, 1024, 1024] + - [32, 21927.0] + - - [1024, 1024, 1, 10192, 1024, 1024, 1024, 1024] + - [32, 21928.0] + - - [1024, 1024, 1, 10208, 1024, 1024, 1024, 1024] + - [32, 21897.0] + - - [1024, 1024, 1, 10184, 1024, 1024, 1024, 1024] + - [32, 21834.0] + - - [1024, 1024, 1, 10120, 1024, 1024, 1024, 1024] + - [32, 21918.0] + - - [1024, 1024, 1, 10152, 1024, 1024, 1024, 1024] + - [32, 21902.0] + - - [1024, 1024, 1, 12288, 1024, 1024, 1024, 1024] + - [32, 22085.0] + - - [64, 5888, 1, 1280, 64, 64, 64, 5888] + - [49, 11969.0] + - - [64, 5056, 1, 256, 64, 64, 64, 5056] + - [38, 10566.0] + - - [5888, 64, 1, 1280, 5888, 5888, 5888, 64] + - [56, 13259.0] + - - [5888, 64, 1, 3328, 5888, 5888, 5888, 64] + - [52, 13854.0] + - - [6784, 64, 1, 256, 6784, 6784, 6784, 64] + - [50, 12108.0] + - - [64, 6784, 1, 3328, 64, 64, 64, 6784] + - [49, 14829.0] + - - [64, 5056, 1, 3328, 64, 64, 64, 5056] + - [49, 15362.0] + - - [5056, 64, 1, 1280, 5056, 5056, 5056, 64] + - [49, 14502.0] + - - [64, 6784, 1, 1280, 64, 64, 64, 6784] + - [46, 14077.0] + - - [64, 6784, 1, 256, 64, 64, 64, 6784] + - [59, 9325.0] + - - [64, 5056, 1, 1280, 64, 64, 64, 5056] + - [55, 13378.0] + - - [5888, 64, 1, 256, 5888, 5888, 5888, 64] + - [39, 8447.0] + - - [64, 5888, 1, 3328, 64, 64, 64, 5888] + - [67, 12870.0] + - - [5056, 64, 1, 3328, 5056, 5056, 5056, 64] + - [41, 15603.0] + - - [6784, 64, 1, 3328, 6784, 6784, 6784, 64] + - [68, 15329.0] + - - [64, 5888, 1, 256, 64, 64, 64, 5888] + - [43, 8245.0] + - - [6784, 64, 1, 1280, 6784, 6784, 6784, 64] + - [60, 14156.0] + - - [5056, 64, 1, 256, 5056, 5056, 5056, 64] + - [38, 8218.0] + - - [1024, 1024, 1, 1024, 1024, 1024, 1024, 1024] + - [65, 16699.0] + - - [3136, 64, 128, 64, 3136, 3136, 3136, 64] + - [49, 17615.0] + - - [3136, 64, 64, 256, 3136, 3136, 3136, 64] + - [54, 18818.0] + - - [3136, 64, 256, 256, 3136, 3136, 3136, 64] + - [37, 17032.0] + - - [3136, 64, 128, 256, 3136, 3136, 3136, 64] + - [58, 17079.0] + - - [3136, 64, 64, 64, 3136, 3136, 3136, 64] + - [55, 17491.0] + - - [3136, 64, 256, 64, 3136, 3136, 3136, 64] + - [37, 14684.0] + - - [64, 128, 512, 128, 64, 64, 64, 128] + - [54, 17860.0] + - - [64, 512, 64, 512, 64, 64, 64, 512] + - [58, 20681.0] + - - [128, 64, 512, 128, 128, 128, 128, 64] + - [49, 18462.0] + - - [512, 64, 64, 512, 512, 512, 512, 64] + - [39, 18622.0] + - - [1024, 1024, 1, 4, 1024, 1024, 1024, 1024] + - [48, 1205.0] + - - [1024, 1024, 1, 32, 1024, 1024, 1024, 1024] + - [48, 8347.0] + - - [1024, 1024, 1, 2048, 1024, 1024, 1024, 1024] + - [39, 17425.0] + - - [1024, 1024, 1, 4096, 1024, 1024, 1024, 1024] + - [60, 17605.0] + - - [256, 1280, 1, 8976, 256, 256, 256, 1280] + - [40, 17608.0] + - - [257, 4096, 1, 1024, 257, 257, 257, 4096] + - [42, 12763.0] + - - [512, 2048, 1, 256, 512, 512, 512, 2048] + - [39, 13682.0] + - - [560, 1024, 1, 200, 560, 560, 560, 1024] + - [40, 9370.0] + - - [560, 1024, 1, 1600, 560, 560, 560, 1024] + - [40, 16074.0] + - - [1024, 1024, 1, 200, 1024, 1024, 1024, 1024] + - [39, 12710.0] + - - [1024, 1024, 1, 512, 1024, 1024, 1024, 1024] + - [39, 15543.0] + - - [1024, 1024, 1, 960, 1024, 1024, 1024, 1024] + - [60, 16975.0] + - - [1024, 1024, 1, 1600, 1024, 1024, 1024, 1024] + - [50, 16855.0] + - - [2048, 256, 1, 1024, 2048, 2048, 2048, 256] + - [61, 14263.0] + - - [1024, 1024, 1, 3840, 1024, 1024, 1024, 1024] + - [60, 17304.0] + - - [1024, 1024, 1, 3968, 1024, 1024, 1024, 1024] + - [60, 17316.0] + - - [1024, 1024, 1, 6528, 1024, 1024, 1024, 1024] + - [60, 17445.0] + - - [1024, 1024, 1, 7104, 1024, 1024, 1024, 1024] + - [50, 17498.0] + - - [1024, 1024, 1, 7200, 1024, 1024, 1024, 1024] + - [62, 17521.0] + - - [1024, 1024, 1, 8064, 1024, 1024, 1024, 1024] + - [50, 17526.0] + - - [1024, 1024, 1, 8160, 1024, 1024, 1024, 1024] + - [50, 17532.0] + - - [1024, 1024, 1, 3240, 1024, 1024, 1024, 1024] + - [50, 17179.0] + - - [1024, 1024, 1, 3960, 1024, 1024, 1024, 1024] + - [59, 19587.0] + - - [64, 1280, 64, 192, 64, 64, 64, 1280] + - [49, 19381.0] + - - [64, 1280, 64, 320, 64, 64, 64, 1280] + - [48, 19414.0] + - - [64, 1280, 64, 384, 64, 64, 64, 1280] + - [44, 20303.0] + - - [64, 1280, 64, 448, 64, 64, 64, 1280] + - [49, 18997.0] + - - [64, 2048, 64, 192, 64, 64, 64, 2048] + - [55, 19443.0] + - - [64, 2048, 64, 320, 64, 64, 64, 2048] + - [58, 18912.0] + - - [64, 2048, 64, 384, 64, 64, 64, 2048] + - [64, 18789.0] + - - [64, 2048, 64, 448, 64, 64, 64, 2048] + - [59, 19412.0] + - - [5329, 64, 64, 80, 5329, 5329, 5329, 64] + - [66, 18084.0] + - - [64, 1280, 32, 192, 64, 64, 64, 1280] + - [44, 17963.0] + - - [64, 1280, 32, 320, 64, 64, 64, 1280] + - [37, 17856.0] + - - [64, 1280, 32, 384, 64, 64, 64, 1280] + - [55, 18648.0] + - - [64, 1280, 32, 448, 64, 64, 64, 1280] + - [38, 18695.0] + - - [64, 2048, 32, 192, 64, 64, 64, 2048] + - [49, 18886.0] + - - [64, 2048, 32, 320, 64, 64, 64, 2048] + - [55, 19196.0] + - - [64, 2048, 32, 384, 64, 64, 64, 2048] + - [49, 20032.0] + - - [64, 2048, 32, 448, 64, 64, 64, 2048] + - [38, 20097.0] + - - [5329, 64, 32, 80, 5329, 5329, 5329, 64] + - [49, 18250.0] + - - [3136, 64, 32, 256, 3136, 3136, 3136, 64] + - [49, 19458.0] + - - [3136, 64, 32, 64, 3136, 3136, 3136, 64] + - [58, 14733.0] + - - [196, 256, 32, 1024, 196, 196, 196, 256] + - [42, 14555.0] + - - [3136, 64, 64, 128, 3136, 3136, 3136, 64] + - [38, 18637.0] + - - [3136, 64, 32, 128, 3136, 3136, 3136, 64] + - [38, 18482.0] + - - [960, 1024, 1, 1024, 960, 960, 960, 1024] + - [38, 16363.0] + - - [1024, 960, 1, 1024, 1024, 1024, 1024, 960] + - [50, 18579.0] + - - [64, 512, 16, 512, 64, 64, 64, 512] + - [43, 13422.0] + - - [1024, 1024, 1, 1, 1024, 1024, 1024, 1024] + - [53, 151.0] + - - [1024, 1024, 1, 77, 1024, 1024, 1024, 1024] + - [56, 8393.0] + - - [64, 128, 160, 128, 64, 64, 64, 128] + - [49, 10980.0] + - - [1024, 1024, 1, 10, 1024, 1024, 1024, 1024] + - [50, 1638.0] + - - [1024, 1024, 1, 1280, 1024, 1024, 1024, 1024] + - [39, 16673.0] + - - [64, 128, 624, 128, 64, 64, 64, 128] + - [49, 15774.0] + - - [1024, 1024, 1, 39, 1024, 1024, 1024, 1024] + - [54, 5112.0] + - - [1024, 1024, 1, 780, 1024, 1024, 1024, 1024] + - [39, 16012.0] + - - [1024, 1024, 1, 4992, 1024, 1024, 1024, 1024] + - [50, 17386.0] + - - [1024, 1024, 1, 308, 1024, 1024, 1024, 1024] + - [39, 14115.0] + - - [64, 128, 640, 128, 64, 64, 64, 128] + - [49, 16070.0] + - - [1024, 1024, 1, 40, 1024, 1024, 1024, 1024] + - [54, 5825.0] + - - [1024, 1024, 1, 800, 1024, 1024, 1024, 1024] + - [50, 16232.0] + - - [1024, 1024, 1, 5120, 1024, 1024, 1024, 1024] + - [56, 17459.0] + - - [64, 128, 656, 128, 64, 64, 64, 128] + - [49, 16567.0] + - - [1024, 1024, 1, 41, 1024, 1024, 1024, 1024] + - [54, 5387.0] + - - [1024, 1024, 1, 820, 1024, 1024, 1024, 1024] + - [50, 16175.0] + - - [1024, 1024, 1, 5248, 1024, 1024, 1024, 1024] + - [60, 17401.0] + - - [64, 512, 80, 512, 64, 64, 64, 512] + - [48, 18457.0] + - - [1024, 1024, 1, 5, 1024, 1024, 1024, 1024] + - [58, 1507.0] + - - [1024, 1024, 1, 385, 1024, 1024, 1024, 1024] + - [39, 16187.0] + - - [1024, 1024, 1, 2560, 1024, 1024, 1024, 1024] + - [50, 17517.0] + - - [64, 512, 96, 512, 64, 64, 64, 512] + - [48, 17687.0] + - - [1024, 1024, 1, 6, 1024, 1024, 1024, 1024] + - [50, 1011.0] + - - [1024, 1024, 1, 462, 1024, 1024, 1024, 1024] + - [50, 15101.0] + - - [1024, 1024, 1, 3072, 1024, 1024, 1024, 1024] + - [56, 17365.0] + - - [64, 128, 128, 128, 64, 64, 64, 128] + - [38, 9840.0] + - - [1024, 1024, 1, 8, 1024, 1024, 1024, 1024] + - [48, 1422.0] + - - [1024, 1024, 1, 160, 1024, 1024, 1024, 1024] + - [38, 12633.0] + - - [64, 128, 144, 128, 64, 64, 64, 128] + - [59, 10120.0] + - - [1024, 1024, 1, 9, 1024, 1024, 1024, 1024] + - [60, 1508.0] + - - [1024, 1024, 1, 180, 1024, 1024, 1024, 1024] + - [45, 12369.0] + - - [1024, 1024, 1, 1152, 1024, 1024, 1024, 1024] + - [39, 16593.0] + - - [2048, 512, 1, 1, 2048, 2048, 2048, 512] + - [55, 180.0] + - - [64, 1024, 32, 1024, 64, 64, 64, 1024] + - [49, 19215.0] + - - [1024, 64, 128, 1024, 1024, 1024, 1024, 64] + - [39, 19376.0] + - - [1024, 64, 32, 1024, 1024, 1024, 1024, 64] + - [50, 18801.0] + - - [1024, 96, 64, 1024, 1024, 1024, 1024, 96] + - [47, 15551.0] + - - [1024, 1024, 1, 16, 1024, 1024, 1024, 1024] + - [48, 4132.0] + - - [64, 512, 40, 512, 64, 64, 64, 512] + - [59, 16202.0] + - - [64, 1024, 256, 1024, 64, 64, 64, 1024] + - [65, 19028.0] + - - [96, 1024, 64, 1024, 96, 96, 96, 1024] + - [49, 16107.0] + - - [512, 64, 256, 512, 512, 512, 512, 64] + - [56, 17993.0] + - - [1024, 96, 128, 1024, 1024, 1024, 1024, 96] + - [53, 15856.0] + - - [64, 512, 128, 512, 64, 64, 64, 512] + - [49, 19288.0] + - - [64, 1024, 64, 1024, 64, 64, 64, 1024] + - [58, 19097.0] + - - [512, 64, 128, 512, 512, 512, 512, 64] + - [55, 19109.0] + - - [64, 1024, 128, 1024, 64, 64, 64, 1024] + - [37, 19096.0] + - - [1024, 64, 64, 1024, 1024, 1024, 1024, 64] + - [60, 19006.0] + - - [96, 1024, 128, 1024, 96, 96, 96, 1024] + - [42, 15906.0] + - - [64, 512, 256, 512, 64, 64, 64, 512] + - [58, 17342.0] + - - [1024, 64, 256, 1024, 1024, 1024, 1024, 64] + - [60, 19721.0] + - - [512, 64, 40, 512, 512, 512, 512, 64] + - [50, 17963.0] + - - [1024, 1024, 1, 64, 1024, 1024, 1024, 1024] + - [50, 11984.0] + - - [64, 128, 1024, 128, 64, 64, 64, 128] + - [48, 18506.0] + - - [128, 64, 1024, 128, 128, 128, 128, 64] + - [65, 18991.0] + - - [1024, 1024, 1, 3456, 1024, 1024, 1024, 1024] + - [43, 18989.0] + - - [1024, 1024, 1, 6912, 1024, 1024, 1024, 1024] + - [39, 17695.0] + - - [1024, 1024, 1, 864, 1024, 1024, 1024, 1024] + - [56, 16903.0] + - - [1024, 512, 1, 3456, 1024, 1024, 1024, 512] + - [49, 15905.0] + - - [1024, 512, 1, 4096, 1024, 1024, 1024, 512] + - [53, 15886.0] + - - [1024, 512, 1, 6912, 1024, 1024, 1024, 512] + - [65, 16187.0] + - - [1024, 512, 1, 864, 1024, 1024, 1024, 512] + - [59, 14191.0] + - - [256, 3456, 1, 1, 256, 256, 256, 3456] + - [61, 133.0] + - - [256, 4096, 1, 1, 256, 256, 256, 4096] + - [60, 168.0] + - - [480, 1024, 1, 3456, 480, 480, 480, 1024] + - [42, 14720.0] + - - [480, 1024, 1, 4096, 480, 480, 480, 1024] + - [42, 14821.0] + - - [480, 1024, 1, 6912, 480, 480, 480, 1024] + - [63, 15030.0] + - - [480, 1024, 1, 864, 480, 480, 480, 1024] + - [40, 12791.0] + - - [1024, 1024, 1, 80, 1024, 1024, 1024, 1024] + - [56, 9425.0] + - - [64, 128, 1280, 128, 64, 64, 64, 128] + - [49, 16900.0] + - - [128, 64, 1280, 128, 128, 128, 128, 64] + - [50, 19125.0] + - - [1024, 1024, 1, 82, 1024, 1024, 1024, 1024] + - [48, 11619.0] + - - [128, 64, 1312, 128, 128, 128, 128, 64] + - [56, 18955.0] + - - [64, 128, 1312, 128, 64, 64, 64, 128] + - [44, 17384.0] + - - [1024, 1024, 1, 12, 1024, 1024, 1024, 1024] + - [55, 3294.0] + - - [1024, 1024, 1, 6144, 1024, 1024, 1024, 1024] + - [44, 17834.0] + - - [64, 512, 192, 512, 64, 64, 64, 512] + - [48, 17740.0] + - - [512, 64, 192, 512, 512, 512, 512, 64] + - [48, 19115.0] + - - [784, 1152, 1, 128, 784, 784, 784, 1152] + - [48, 11446.0] + - - [64, 128, 2048, 128, 64, 64, 64, 128] + - [37, 13853.0] + - - [128, 64, 2048, 128, 128, 128, 128, 64] + - [37, 12583.0] + - - [1024, 1024, 1, 128, 1024, 1024, 1024, 1024] + - [39, 13449.0] + - - [128, 64, 1536, 128, 128, 128, 128, 64] + - [66, 18659.0] + - - [64, 128, 1536, 128, 64, 64, 64, 128] + - [64, 18191.0] + - - [1024, 1024, 1, 96, 1024, 1024, 1024, 1024] + - [50, 13245.0] + - - [92416, 64, 25, 64, 92416, 92416, 92416, 64] + - [56, 9907.0] + - - [50176, 64, 36, 64, 50176, 50176, 50176, 64] + - [56, 9978.0] + - - [36864, 64, 49, 64, 36864, 36864, 36864, 64] + - [56, 10102.0] + - - [25600, 64, 64, 64, 25600, 25600, 25600, 64] + - [45, 10096.0] + - - [64, 128, 192, 128, 64, 64, 64, 128] + - [38, 13789.0] + - - [128, 64, 192, 128, 128, 128, 128, 64] + - [56, 14738.0] + - - [768, 768, 1, 2048, 768, 768, 768, 768] + - [42, 17890.0] + - - [64, 384, 144, 384, 64, 64, 64, 384] + - [49, 19758.0] + - - [384, 64, 144, 384, 384, 384, 384, 64] + - [65, 19525.0] + - - [768, 768, 1, 4608, 768, 768, 768, 768] + - [42, 18168.0] + - - [64, 512, 48, 512, 64, 64, 64, 512] + - [54, 18154.0] + - - [512, 64, 48, 512, 512, 512, 512, 64] + - [39, 17568.0] + - - [64, 128, 256, 128, 64, 64, 64, 128] + - [49, 13069.0] + - - [128, 64, 256, 128, 128, 128, 128, 64] + - [50, 13435.0] + - - [64, 384, 192, 384, 64, 64, 64, 384] + - [44, 19417.0] + - - [384, 64, 192, 384, 384, 384, 384, 64] + - [58, 20173.0] + - - [1024, 1024, 1, 4608, 1024, 1024, 1024, 1024] + - [50, 17618.0] + - - [196, 2304, 1, 256, 196, 196, 196, 2304] + - [51, 7790.0] + - - [768, 512, 2, 2048, 768, 768, 768, 512] + - [39, 17420.0] + - - [672, 512, 2, 2048, 672, 672, 672, 512] + - [59, 16448.0] + - - [1008, 512, 2, 2048, 1008, 1008, 1008, 512] + - [39, 16650.0] + - - [864, 512, 2, 2048, 864, 864, 864, 512] + - [53, 17712.0] + - - [888, 512, 2, 2048, 888, 888, 888, 512] + - [40, 18193.0] + - - [840, 512, 2, 2048, 840, 840, 840, 512] + - [63, 17210.0] + - - [768, 256, 2, 12, 768, 768, 768, 256] + - [51, 717.0] + - - [864, 256, 2, 3, 864, 864, 864, 256] + - [50, 215.0] + - - [864, 256, 2, 12, 864, 864, 864, 256] + - [41, 843.0] + - - [768, 256, 2, 3, 768, 768, 768, 256] + - [44, 237.0] + - - [1024, 320, 1, 1024, 1024, 1024, 1024, 320] + - [39, 13729.0] + - - [173280, 64, 1, 128, 173280, 173280, 173280, 64] + - [45, 9809.0] + - - [25992, 64, 1, 128, 25992, 25992, 25992, 64] + - [56, 11790.0] + - - [713, 512, 2, 2048, 713, 713, 713, 512] + - [50, 15938.0] + - - [660, 512, 2, 2048, 660, 660, 660, 512] + - [59, 16057.0] + - - [726, 512, 2, 2048, 726, 726, 726, 512] + - [39, 16273.0] + - - [748, 512, 2, 2048, 748, 748, 748, 512] + - [39, 16741.0] + - - [805, 512, 2, 2048, 805, 805, 805, 512] + - [53, 16528.0] + - - [850, 512, 2, 2048, 850, 850, 850, 512] + - [57, 17435.0] + - - [850, 256, 2, 3, 850, 850, 850, 256] + - [41, 216.0] + - - [805, 256, 2, 12, 805, 805, 805, 256] + - [50, 778.0] + - - [805, 256, 2, 3, 805, 805, 805, 256] + - [48, 212.0] + - - [850, 256, 2, 12, 850, 850, 850, 256] + - [52, 819.0] + - - [950, 256, 2, 12, 950, 950, 950, 256] + - [50, 915.0] + - - [950, 256, 2, 3, 950, 950, 950, 256] + - [60, 240.0] + - - [100, 512, 120, 128, 100, 100, 100, 512] + - [59, 11434.0] + - - [100, 512, 18, 128, 100, 100, 100, 512] + - [58, 7670.0] + - - [100, 512, 19, 128, 100, 100, 100, 512] + - [64, 8659.0] + - - [1444, 576, 1, 128, 1444, 1444, 1444, 576] + - [39, 9489.0] + - - [27436, 64, 1, 128, 27436, 27436, 27436, 64] + - [48, 11817.0] + - - [361, 2304, 1, 512, 361, 361, 361, 2304] + - [40, 14205.0] + - - [96, 1024, 160, 1024, 96, 96, 96, 1024] + - [38, 16432.0] + - - [1024, 96, 160, 1024, 1024, 1024, 1024, 96] + - [47, 16022.0] + - - [96, 1024, 40, 1024, 96, 96, 96, 1024] + - [42, 15792.0] + - - [1024, 96, 40, 1024, 1024, 1024, 1024, 96] + - [47, 15556.0] + - - [96, 1024, 80, 1024, 96, 96, 96, 1024] + - [42, 16016.0] + - - [1024, 96, 80, 1024, 1024, 1024, 1024, 96] + - [42, 15823.0] + - - [96, 1024, 96, 1024, 96, 96, 96, 1024] + - [65, 16564.0] + - - [1024, 96, 96, 1024, 1024, 1024, 1024, 96] + - [53, 15744.0] + - - [96, 1024, 24, 1024, 96, 96, 96, 1024] + - [51, 15015.0] + - - [1024, 96, 24, 1024, 1024, 1024, 1024, 96] + - [57, 14882.0] + - - [96, 1024, 48, 1024, 96, 96, 96, 1024] + - [49, 15588.0] + - - [1024, 96, 48, 1024, 1024, 1024, 1024, 96] + - [44, 15503.0] + - - [96, 1024, 16, 1024, 96, 96, 96, 1024] + - [59, 14381.0] + - - [1024, 96, 16, 1024, 1024, 1024, 1024, 96] + - [55, 14345.0] + - - [96, 1024, 32, 1024, 96, 96, 96, 1024] + - [51, 15479.0] + - - [1024, 96, 32, 1024, 1024, 1024, 1024, 96] + - [51, 15140.0] + - - [512, 64, 320, 512, 512, 512, 512, 64] + - [66, 18055.0] + - - [64, 512, 320, 512, 64, 64, 64, 512] + - [48, 17275.0] + - - [1024, 1024, 1, 20, 1024, 1024, 1024, 1024] + - [69, 4161.0] + - - [512, 64, 80, 512, 512, 512, 512, 64] + - [60, 18904.0] + - - [1024, 64, 512, 1024, 1024, 1024, 1024, 64] + - [60, 19886.0] + - - [64, 1024, 512, 1024, 64, 64, 64, 1024] + - [37, 20104.0] + - - [512, 256, 1, 32768, 512, 512, 512, 256] + - [72, 15803.0] + - - [1024, 256, 1, 8192, 1024, 1024, 1024, 256] + - [74, 16995.0] + - - [1024, 256, 1, 8448, 1024, 1024, 1024, 256] + - [70, 15515.0] + - - [1024, 256, 1, 9728, 1024, 1024, 1024, 256] + - [75, 15796.0] + - - [1024, 256, 1, 9984, 1024, 1024, 1024, 256] + - [70, 15791.0] + - - [1024, 256, 1, 10496, 1024, 1024, 1024, 256] + - [75, 15864.0] + - - [1024, 256, 1, 11520, 1024, 1024, 1024, 256] + - [76, 16038.0] + - - [1024, 256, 1, 12032, 1024, 1024, 1024, 256] + - [70, 16029.0] + - - [1024, 256, 1, 13568, 1024, 1024, 1024, 256] + - [76, 16277.0] + - - [1024, 256, 1, 14336, 1024, 1024, 1024, 256] + - [75, 16320.0] + - - [1024, 256, 1, 14848, 1024, 1024, 1024, 256] + - [71, 16236.0] + - - [1024, 256, 1, 15104, 1024, 1024, 1024, 256] + - [76, 16359.0] + - - [1024, 256, 1, 15872, 1024, 1024, 1024, 256] + - [75, 16387.0] + - - [1024, 256, 1, 16128, 1024, 1024, 1024, 256] + - [75, 16428.0] + - - [1024, 256, 1, 17152, 1024, 1024, 1024, 256] + - [76, 16498.0] + - - [1024, 256, 1, 17408, 1024, 1024, 1024, 256] + - [70, 16496.0] + - - [1024, 256, 1, 18944, 1024, 1024, 1024, 256] + - [73, 16621.0] + - - [1024, 256, 1, 19712, 1024, 1024, 1024, 256] + - [76, 16693.0] + - - [1024, 256, 1, 19968, 1024, 1024, 1024, 256] + - [70, 16635.0] + - - [256, 128, 1, 55296, 256, 256, 256, 128] + - [76, 9383.0] + - - [3584, 64, 1, 1280, 3584, 3584, 3584, 64] + - [120, 10202.0] + - - [64, 4288, 1, 1280, 64, 64, 64, 4288] + - [88, 10716.0] + - - [3584, 64, 1, 3328, 3584, 3584, 3584, 64] + - [120, 11316.0] + - - [64, 4288, 1, 3328, 64, 64, 64, 4288] + - [88, 11065.0] + - - [1856, 64, 1, 3328, 1856, 1856, 1856, 64] + - [118, 9723.0] + - - [2944, 64, 1, 256, 2944, 2944, 2944, 64] + - [84, 5557.0] + - - [64, 1856, 1, 256, 64, 64, 64, 1856] + - [152, 4295.0] + - - [2368, 64, 1, 256, 2368, 2368, 2368, 64] + - [84, 5215.0] + - - [2368, 64, 1, 3328, 2368, 2368, 2368, 64] + - [156, 10369.0] + - - [64, 1408, 1, 3328, 64, 64, 64, 1408] + - [88, 7468.0] + - - [1856, 64, 1, 1280, 1856, 1856, 1856, 64] + - [144, 8148.0] + - - [64, 3584, 1, 1280, 64, 64, 64, 3584] + - [146, 9953.0] + - - [4288, 64, 1, 3328, 4288, 4288, 4288, 64] + - [130, 10914.0] + - - [64, 2944, 1, 256, 64, 64, 64, 2944] + - [97, 5570.0] + - - [64, 2368, 1, 1280, 64, 64, 64, 2368] + - [86, 9014.0] + - - [64, 3584, 1, 3328, 64, 64, 64, 3584] + - [105, 10874.0] + - - [3584, 64, 1, 256, 3584, 3584, 3584, 64] + - [113, 8000.0] + - - [64, 1856, 1, 3328, 64, 64, 64, 1856] + - [113, 10023.0] + - - [4288, 64, 1, 1280, 4288, 4288, 4288, 64] + - [78, 11137.0] + - - [1408, 64, 1, 256, 1408, 1408, 1408, 64] + - [112, 4047.0] + - - [64, 1408, 1, 256, 64, 64, 64, 1408] + - [87, 4320.0] + - - [64, 2368, 1, 3328, 64, 64, 64, 2368] + - [105, 10408.0] + - - [64, 1856, 1, 1280, 64, 64, 64, 1856] + - [152, 9148.0] + - - [64, 4288, 1, 256, 64, 64, 64, 4288] + - [113, 8760.0] + - - [64, 1408, 1, 1280, 64, 64, 64, 1408] + - [118, 7076.0] + - - [64, 2944, 1, 3328, 64, 64, 64, 2944] + - [88, 10010.0] + - - [1856, 64, 1, 256, 1856, 1856, 1856, 64] + - [89, 4905.0] + - - [2944, 64, 1, 1280, 2944, 2944, 2944, 64] + - [88, 9852.0] + - - [4288, 64, 1, 256, 4288, 4288, 4288, 64] + - [84, 9172.0] + - - [64, 2944, 1, 1280, 64, 64, 64, 2944] + - [118, 9788.0] + - - [1408, 64, 1, 1280, 1408, 1408, 1408, 64] + - [128, 6982.0] + - - [64, 2368, 1, 256, 64, 64, 64, 2368] + - [88, 5773.0] + - - [64, 3584, 1, 256, 64, 64, 64, 3584] + - [152, 8201.0] + - - [2944, 64, 1, 3328, 2944, 2944, 2944, 64] + - [118, 10039.0] + - - [2368, 64, 1, 1280, 2368, 2368, 2368, 64] + - [156, 9948.0] + - - [1408, 64, 1, 3328, 1408, 1408, 1408, 64] + - [103, 7456.0] + - - [33, 32, 200, 33, 33, 33, 33, 32] + - [77, 1345.0] + - - [33, 32, 1600, 33, 33, 33, 33, 32] + - [82, 3223.0] + - - [67, 2048, 1, 512, 67, 67, 67, 2048] + - [78, 4779.0] + - - [74, 2048, 1, 512, 74, 74, 74, 2048] + - [78, 5337.0] + - - [74, 2048, 1, 960, 74, 74, 74, 2048] + - [128, 6524.0] + - - [100, 2048, 1, 512, 100, 100, 100, 2048] + - [140, 6363.0] + - - [512, 512, 1, 200, 512, 512, 512, 512] + - [103, 5813.0] + - - [512, 512, 1, 1600, 512, 512, 512, 512] + - [84, 9832.0] + - - [1024, 256, 1, 1024, 1024, 1024, 1024, 256] + - [113, 9334.0] + - - [1024, 256, 1, 1280, 1024, 1024, 1024, 256] + - [144, 9582.0] + - - [1024, 256, 1, 2304, 1024, 1024, 1024, 256] + - [144, 10103.0] + - - [1024, 256, 1, 2816, 1024, 1024, 1024, 256] + - [118, 10468.0] + - - [1024, 256, 1, 3072, 1024, 1024, 1024, 256] + - [88, 10619.0] + - - [1024, 256, 1, 3328, 1024, 1024, 1024, 256] + - [118, 10290.0] + - - [1024, 256, 1, 3584, 1024, 1024, 1024, 256] + - [118, 10331.0] + - - [1024, 256, 1, 4096, 1024, 1024, 1024, 256] + - [154, 10606.0] + - - [1024, 256, 1, 4352, 1024, 1024, 1024, 256] + - [144, 10712.0] + - - [1024, 256, 1, 4608, 1024, 1024, 1024, 256] + - [144, 10464.0] + - - [1024, 256, 1, 5120, 1024, 1024, 1024, 256] + - [118, 10492.0] + - - [1024, 256, 1, 5376, 1024, 1024, 1024, 256] + - [118, 10676.0] + - - [1024, 256, 1, 5632, 1024, 1024, 1024, 256] + - [154, 10671.0] + - - [1024, 256, 1, 6144, 1024, 1024, 1024, 256] + - [144, 10575.0] + - - [1024, 256, 1, 6400, 1024, 1024, 1024, 256] + - [147, 11770.0] + - - [1024, 256, 1, 7680, 1024, 1024, 1024, 256] + - [88, 10603.0] + - - [1024, 256, 1, 7936, 1024, 1024, 1024, 256] + - [118, 10760.0] + - - [32, 64, 4608, 32, 32, 32, 32, 64] + - [138, 10100.0] + - - [32, 64, 4608, 35, 32, 32, 32, 64] + - [82, 9647.0] + - - [34, 64, 4736, 24, 34, 34, 34, 64] + - [121, 4740.0] + - - [34, 64, 4736, 34, 34, 34, 34, 64] + - [82, 5534.0] + - - [35, 64, 4608, 35, 35, 35, 35, 64] + - [90, 5505.0] + - - [64, 32, 4608, 32, 64, 64, 64, 32] + - [84, 10832.0] + - - [64, 32, 4608, 35, 64, 64, 64, 32] + - [126, 10499.0] + - - [64, 34, 4736, 24, 64, 64, 64, 34] + - [80, 5287.0] + - - [64, 34, 4736, 34, 64, 64, 64, 34] + - [80, 5949.0] + - - [64, 35, 4608, 35, 64, 64, 64, 35] + - [80, 5956.0] + - - [33, 64, 1920, 33, 33, 33, 33, 64] + - [78, 4982.0] + - - [64, 33, 1920, 33, 64, 64, 64, 33] + - [84, 5203.0] + - - [49, 512, 64, 2048, 49, 49, 49, 512] + - [90, 9851.0] + - - [49, 2048, 64, 512, 49, 49, 49, 2048] + - [156, 9956.0] + - - [49, 512, 32, 2048, 49, 49, 49, 512] + - [90, 9696.0] + - - [49, 2048, 32, 512, 49, 49, 49, 2048] + - [105, 9809.0] + - - [49, 1024, 64, 2048, 49, 49, 49, 1024] + - [120, 9952.0] + - - [49, 2048, 64, 1024, 49, 49, 49, 2048] + - [105, 10076.0] + - - [49, 1024, 32, 2048, 49, 49, 49, 1024] + - [156, 9852.0] + - - [49, 2048, 32, 1024, 49, 49, 49, 2048] + - [90, 9881.0] + - - [480, 512, 1, 512, 480, 480, 480, 512] + - [84, 8678.0] + - - [512, 480, 1, 512, 512, 512, 512, 480] + - [88, 10247.0] + - - [512, 512, 1, 512, 512, 512, 512, 512] + - [84, 9698.0] + - - [256, 864, 1, 1, 256, 256, 256, 864] + - [135, 118.0] + - - [512, 256, 1, 3456, 512, 512, 512, 256] + - [84, 9060.0] + - - [512, 256, 1, 4096, 512, 512, 512, 256] + - [90, 9171.0] + - - [512, 256, 1, 6912, 512, 512, 512, 256] + - [120, 9479.0] + - - [512, 256, 1, 864, 512, 512, 512, 256] + - [99, 7122.0] + - - [49, 4608, 1, 512, 49, 49, 49, 4608] + - [132, 6561.0] + - - [49, 2048, 128, 512, 49, 49, 49, 2048] + - [106, 10104.0] + - - [49, 2048, 256, 512, 49, 49, 49, 2048] + - [120, 10128.0] + - - [49, 512, 128, 2048, 49, 49, 49, 512] + - [111, 10619.0] + - - [49, 512, 256, 2048, 49, 49, 49, 512] + - [156, 10108.0] + - - [56, 512, 64, 512, 56, 56, 56, 512] + - [120, 10996.0] + - - [176, 256, 2, 3, 176, 176, 176, 256] + - [81, 159.0] + - - [176, 256, 2, 12, 176, 176, 176, 256] + - [79, 563.0] + - - [216, 256, 2, 3, 216, 216, 216, 256] + - [109, 80.0] + - - [192, 256, 2, 12, 192, 192, 192, 256] + - [145, 284.0] + - - [192, 256, 2, 3, 192, 192, 192, 256] + - [93, 73.0] + - - [216, 256, 2, 12, 216, 216, 216, 256] + - [109, 306.0] + - - [228, 256, 2, 12, 228, 228, 228, 256] + - [148, 539.0] + - - [228, 256, 2, 3, 228, 228, 228, 256] + - [137, 172.0] + - - [187, 256, 2, 12, 187, 187, 187, 256] + - [116, 452.0] + - - [247, 256, 2, 12, 247, 247, 247, 256] + - [119, 671.0] + - - [187, 256, 2, 3, 187, 187, 187, 256] + - [112, 148.0] + - - [221, 256, 2, 3, 221, 221, 221, 256] + - [123, 152.0] + - - [221, 256, 2, 12, 221, 221, 221, 256] + - [109, 317.0] + - - [247, 256, 2, 3, 247, 247, 247, 256] + - [127, 149.0] + - - [100, 2304, 1, 512, 100, 100, 100, 2304] + - [146, 6687.0] + - - [256, 128, 1, 32768, 256, 256, 256, 128] + - [159, 9225.0] + - - [2048, 2, 1, 2, 2048, 2048, 2048, 2] + - [166, 4.0] + - - [2560, 2, 1, 4, 2560, 2560, 2560, 2] + - [166, 5.0] + - - [2048, 2, 1, 8, 2048, 2048, 2048, 2] + - [167, 19.0] + - - [2560, 2, 1, 2, 2560, 2560, 2560, 2] + - [91, 3.0] + - - [25, 1152, 1, 256, 25, 25, 25, 1152] + - [131, 1241.0] + - - [9, 1152, 1, 256, 9, 9, 9, 1152] + - [151, 461.0] + - - [13, 512, 1, 32768, 13, 13, 13, 512] + - [157, 3099.0] + - - [1024, 2, 1, 4992, 1024, 1024, 1024, 2] + - [158, 416.0] + - - [1024, 2, 1, 5120, 1024, 1024, 1024, 2] + - [163, 422.0] + - - [1024, 2, 1, 5248, 1024, 1024, 1024, 2] + - [165, 423.0] + - - [256, 128, 1, 6912, 256, 256, 256, 128] + - [164, 6505.0] + - - [13, 512, 1, 55296, 13, 13, 13, 512] + - [160, 3179.0] + - - [13, 512, 1, 6912, 13, 13, 13, 512] + - [162, 1954.0] + - - [768, 2, 1, 4608, 768, 768, 768, 2] + - [161, 405.0] + - - [1024, 2, 1, 4608, 1024, 1024, 1024, 2] + - [165, 398.0] + - - [64, 448, 1, 3328, 64, 64, 64, 448] + - [117, 4476.0] + - - [1, 64, 1, 256, 1, 1, 1, 64] + - [81, 3.0] + - - [64, 128, 1, 256, 64, 64, 64, 128] + - [96, 718.0] + - - [64, 1024, 1, 3328, 64, 64, 64, 1024] + - [105, 7685.0] + - - [1, 64, 1, 1280, 1, 1, 1, 64] + - [81, 4.0] + - - [704, 64, 1, 3328, 704, 704, 704, 64] + - [132, 4884.0] + - - [64, 448, 1, 1280, 64, 64, 64, 448] + - [143, 3225.0] + - - [64, 704, 1, 3328, 64, 64, 64, 704] + - [90, 4881.0] + - - [64, 64, 1, 1280, 64, 64, 64, 64] + - [131, 523.0] + - - [1, 64, 1, 1, 1, 1, 1, 64] + - [81, 0.02] + - - [448, 64, 1, 1280, 448, 448, 448, 64] + - [117, 3306.0] + - - [64, 1024, 1, 1280, 64, 64, 64, 1024] + - [120, 5490.0] + - - [64, 256, 1, 1280, 64, 64, 64, 256] + - [104, 2068.0] + - - [704, 64, 1, 1280, 704, 704, 704, 64] + - [117, 3956.0] + - - [64, 128, 1, 1280, 64, 64, 64, 128] + - [155, 1032.0] + - - [448, 64, 1, 3328, 448, 448, 448, 64] + - [87, 4241.0] + - - [128, 64, 1, 256, 128, 128, 128, 64] + - [155, 393.0] + - - [64, 128, 1, 3328, 64, 64, 64, 128] + - [89, 1391.0] + - - [64, 256, 1, 3328, 64, 64, 64, 256] + - [131, 2751.0] + - - [1024, 64, 1, 1280, 1024, 1024, 1024, 64] + - [146, 5468.0] + - - [448, 64, 1, 256, 448, 448, 448, 64] + - [117, 1349.0] + - - [256, 64, 1, 256, 256, 256, 256, 64] + - [94, 757.0] + - - [1, 1, 1, 1, 1, 1, 1, 1] + - [136, 0.00024271843644509215] + - - [1024, 64, 1, 3328, 1024, 1024, 1024, 64] + - [105, 7013.0] + - - [64, 448, 1, 256, 64, 64, 64, 448] + - [125, 1335.0] + - - [128, 64, 1, 1280, 128, 128, 128, 64] + - [119, 1317.0] + - - [64, 1024, 1, 256, 64, 64, 64, 1024] + - [124, 2482.0] + - - [256, 64, 1, 1280, 256, 256, 256, 64] + - [155, 2036.0] + - - [64, 256, 1, 256, 64, 64, 64, 256] + - [123, 800.0] + - - [704, 64, 1, 256, 704, 704, 704, 64] + - [125, 1922.0] + - - [1, 1, 1, 256, 1, 1, 1, 1] + - [81, 0.04] + - - [64, 704, 1, 256, 64, 64, 64, 704] + - [102, 1948.0] + - - [64, 64, 1, 256, 64, 64, 64, 64] + - [151, 196.0] + - - [128, 64, 1, 3328, 128, 128, 128, 64] + - [104, 1380.0] + - - [1, 1, 1, 1280, 1, 1, 1, 1] + - [81, 0.1] + - - [1024, 64, 1, 256, 1024, 1024, 1024, 64] + - [125, 2431.0] + - - [256, 64, 1, 3328, 256, 256, 256, 64] + - [104, 2726.0] + - - [64, 64, 1, 3328, 64, 64, 64, 64] + - [131, 696.0] + - - [1, 1, 1, 3328, 1, 1, 1, 1] + - [81, 0.12] + - - [64, 704, 1, 1280, 64, 64, 64, 704] + - [117, 3918.0] + - - [512, 16, 1, 512, 512, 512, 512, 16] + - [100, 606.0] + - - [1024, 32, 1, 512, 1024, 1024, 1024, 32] + - [89, 2292.0] + - - [1024, 16, 1, 512, 1024, 1024, 1024, 16] + - [131, 1651.0] + - - [512, 32, 1, 512, 512, 512, 512, 32] + - [104, 1230.0] + - - [14, 64, 1, 14, 14, 14, 14, 64] + - [81, 3.0] + - - [15, 64, 1, 14, 15, 15, 15, 64] + - [77, 3.0] + - - [15, 64, 1, 15, 15, 15, 15, 64] + - [77, 3.0] + - - [17, 64, 1, 15, 17, 17, 17, 64] + - [79, 4.0] + - - [17, 64, 1, 17, 17, 17, 17, 64] + - [77, 4.0] + - - [21, 64, 1, 17, 21, 21, 21, 64] + - [77, 5.0] + - - [21, 64, 1, 21, 21, 21, 21, 64] + - [117, 7.0] + - - [24, 64, 1, 24, 24, 24, 24, 64] + - [100, 9.0] + - - [30, 64, 1, 30, 30, 30, 30, 64] + - [141, 19.0] + - - [30, 64, 1, 31, 30, 30, 30, 64] + - [87, 14.0] + - - [31, 64, 1, 31, 31, 31, 31, 64] + - [134, 14.0] + - - [32, 64, 1, 32, 32, 32, 32, 64] + - [77, 16.0] + - - [32, 64, 1, 35, 32, 32, 32, 64] + - [79, 16.0] + - - [34, 64, 1, 24, 34, 34, 34, 64] + - [81, 12.0] + - - [34, 64, 1, 34, 34, 34, 34, 64] + - [79, 16.0] + - - [35, 64, 1, 35, 35, 35, 35, 64] + - [142, 22.0] + - - [64, 14, 1, 14, 64, 64, 64, 14] + - [77, 3.0] + - - [64, 15, 1, 14, 64, 64, 64, 15] + - [103, 5.0] + - - [64, 15, 1, 15, 64, 64, 64, 15] + - [143, 6.0] + - - [64, 17, 1, 15, 64, 64, 64, 17] + - [77, 4.0] + - - [64, 17, 1, 17, 64, 64, 64, 17] + - [77, 4.0] + - - [64, 21, 1, 17, 64, 64, 64, 21] + - [77, 5.0] + - - [64, 21, 1, 21, 64, 64, 64, 21] + - [79, 7.0] + - - [64, 24, 1, 24, 64, 64, 64, 24] + - [81, 9.0] + - - [64, 30, 1, 30, 64, 64, 64, 30] + - [104, 23.0] + - - [64, 30, 1, 31, 64, 64, 64, 30] + - [147, 19.0] + - - [64, 31, 1, 31, 64, 64, 64, 31] + - [110, 26.0] + - - [64, 32, 1, 32, 64, 64, 64, 32] + - [122, 21.0] + - - [64, 32, 1, 35, 64, 64, 64, 32] + - [110, 16.0] + - - [64, 34, 1, 24, 64, 64, 64, 34] + - [113, 17.0] + - - [64, 34, 1, 34, 64, 64, 64, 34] + - [139, 17.0] + - - [64, 35, 1, 35, 64, 64, 64, 35] + - [79, 17.0] + - - [64, 512, 1, 512, 64, 64, 64, 512] + - [119, 3355.0] + - - [512, 64, 1, 512, 512, 512, 512, 64] + - [145, 3329.0] + - - [1024, 2, 1, 4, 1024, 1024, 1024, 2] + - [81, 5.0] + - - [1024, 2, 1, 32, 1024, 1024, 1024, 2] + - [114, 31.0] + - - [1024, 2, 1, 2048, 1024, 1024, 1024, 2] + - [117, 322.0] + - - [3, 64, 512, 3, 3, 3, 3, 64] + - [77, 49.0] + - - [5, 64, 512, 5, 5, 5, 5, 64] + - [110, 207.0] + - - [5, 64, 960, 5, 5, 5, 5, 64] + - [77, 263.0] + - - [9, 64, 512, 9, 9, 9, 9, 64] + - [81, 462.0] + - - [27, 128, 32768, 27, 27, 27, 27, 128] + - [116, 4019.0] + - - [512, 32, 1, 200, 512, 512, 512, 32] + - [125, 853.0] + - - [512, 32, 1, 1600, 512, 512, 512, 32] + - [89, 2900.0] + - - [1024, 64, 1, 512, 1024, 1024, 1024, 64] + - [103, 4835.0] + - - [1024, 64, 1, 960, 1024, 1024, 1024, 64] + - [120, 6026.0] + - - [14, 64, 10880, 14, 14, 14, 14, 64] + - [82, 2306.0] + - - [15, 64, 10880, 14, 15, 15, 15, 64] + - [82, 2481.0] + - - [15, 64, 7680, 15, 15, 15, 15, 64] + - [82, 2485.0] + - - [15, 64, 10880, 15, 15, 15, 15, 64] + - [82, 2517.0] + - - [17, 64, 7680, 15, 17, 17, 17, 64] + - [78, 2281.0] + - - [17, 64, 6144, 17, 17, 17, 17, 64] + - [150, 2553.0] + - - [17, 64, 7680, 17, 17, 17, 17, 64] + - [111, 2596.0] + - - [21, 64, 6144, 17, 21, 21, 21, 64] + - [115, 2326.0] + - - [21, 64, 6144, 21, 21, 21, 21, 64] + - [95, 2725.0] + - - [24, 64, 4736, 24, 24, 24, 24, 64] + - [106, 3750.0] + - - [30, 64, 2048, 30, 30, 30, 30, 64] + - [116, 4204.0] + - - [30, 64, 2048, 31, 30, 30, 30, 64] + - [142, 4320.0] + - - [31, 64, 2048, 31, 31, 31, 31, 64] + - [101, 4467.0] + - - [64, 14, 10880, 14, 64, 64, 64, 14] + - [108, 2677.0] + - - [64, 15, 10880, 14, 64, 64, 64, 15] + - [106, 2626.0] + - - [64, 15, 7680, 15, 64, 64, 64, 15] + - [108, 2711.0] + - - [64, 15, 10880, 15, 64, 64, 64, 15] + - [80, 2763.0] + - - [64, 17, 7680, 15, 64, 64, 64, 17] + - [152, 2864.0] + - - [64, 17, 6144, 17, 64, 64, 64, 17] + - [140, 3083.0] + - - [64, 17, 7680, 17, 64, 64, 64, 17] + - [126, 3141.0] + - - [64, 21, 6144, 17, 64, 64, 64, 21] + - [150, 3516.0] + - - [64, 21, 6144, 21, 64, 64, 64, 21] + - [80, 4652.0] + - - [64, 24, 4736, 24, 64, 64, 64, 24] + - [108, 5770.0] + - - [64, 30, 2048, 30, 64, 64, 64, 30] + - [84, 5916.0] + - - [64, 30, 2048, 31, 64, 64, 64, 30] + - [84, 6017.0] + - - [64, 31, 2048, 31, 64, 64, 64, 31] + - [92, 6073.0] + - - [27, 64, 1920, 27, 27, 27, 27, 64] + - [81, 3546.0] + - - [27, 64, 1920, 33, 27, 27, 27, 64] + - [86, 4182.0] + - - [64, 27, 1920, 27, 64, 64, 64, 27] + - [148, 5214.0] + - - [64, 27, 1920, 33, 64, 64, 64, 27] + - [80, 5887.0] + - - [1024, 2, 1, 1, 1024, 1024, 1024, 2] + - [77, 1.0] + - - [1024, 2, 1, 512, 1024, 1024, 1024, 2] + - [102, 219.0] + - - [1024, 2, 1, 10, 1024, 1024, 1024, 2] + - [77, 5.0] + - - [1024, 2, 1, 1280, 1024, 1024, 1024, 2] + - [117, 237.0] + - - [1024, 2, 1, 39, 1024, 1024, 1024, 2] + - [134, 18.0] + - - [1024, 2, 1, 40, 1024, 1024, 1024, 2] + - [136, 19.0] + - - [1024, 2, 1, 41, 1024, 1024, 1024, 2] + - [80, 28.0] + - - [1024, 2, 1, 5, 1024, 1024, 1024, 2] + - [107, 3.0] + - - [1024, 2, 1, 2560, 1024, 1024, 1024, 2] + - [129, 293.0] + - - [1024, 2, 1, 6, 1024, 1024, 1024, 2] + - [77, 3.0] + - - [1024, 2, 1, 3072, 1024, 1024, 1024, 2] + - [102, 303.0] + - - [1024, 2, 1, 8, 1024, 1024, 1024, 2] + - [77, 4.0] + - - [1024, 2, 1, 1024, 1024, 1024, 1024, 2] + - [102, 216.0] + - - [1024, 2, 1, 9, 1024, 1024, 1024, 2] + - [77, 4.0] + - - [1024, 2, 1, 1152, 1024, 1024, 1024, 2] + - [87, 224.0] + - - [4, 64, 32768, 4, 4, 4, 4, 64] + - [133, 300.0] + - - [4, 64, 38400, 4, 4, 4, 4, 64] + - [78, 303.0] + - - [64, 4, 32768, 4, 64, 64, 64, 4] + - [80, 310.0] + - - [64, 4, 38400, 4, 64, 64, 64, 4] + - [80, 310.0] + - - [64, 128, 1, 128, 64, 64, 64, 128] + - [96, 448.0] + - - [128, 64, 1, 128, 128, 128, 128, 64] + - [98, 433.0] + - - [5, 64, 1, 5, 5, 5, 5, 64] + - [151, 1.0] + - - [33, 32, 1, 33, 33, 33, 33, 32] + - [81, 8.0] + - - [1024, 2, 1, 16, 1024, 1024, 1024, 2] + - [81, 8.0] + - - [1024, 2, 1, 64, 1024, 1024, 1024, 2] + - [110, 28.0] + - - [256, 128, 1, 3456, 256, 256, 256, 128] + - [89, 4290.0] + - - [256, 128, 1, 4096, 256, 256, 256, 128] + - [131, 4403.0] + - - [256, 128, 1, 864, 256, 256, 256, 128] + - [119, 2955.0] + - - [1024, 2, 1, 80, 1024, 1024, 1024, 2] + - [77, 58.0] + - - [1024, 2, 1, 82, 1024, 1024, 1024, 2] + - [85, 35.0] + - - [1024, 2, 1, 12, 1024, 1024, 1024, 2] + - [79, 6.0] + - - [13, 512, 1, 3456, 13, 13, 13, 512] + - [145, 1118.0] + - - [13, 512, 1, 4096, 13, 13, 13, 512] + - [145, 1141.0] + - - [13, 512, 1, 864, 13, 13, 13, 512] + - [145, 664.0] + - - [64, 24, 6816, 24, 64, 64, 64, 24] + - [84, 5843.0] + - - [64, 26, 6272, 26, 64, 64, 64, 26] + - [84, 5843.0] + - - [1024, 2, 1, 128, 1024, 1024, 1024, 2] + - [125, 52.0] + - - [1024, 2, 1, 96, 1024, 1024, 1024, 2] + - [104, 66.0] + - - [768, 2, 1, 2048, 768, 768, 768, 2] + - [117, 208.0] + - - [1024, 81, 1, 1024, 1024, 1024, 1024, 81] + - [84, 5289.0] + - - [25, 256, 120, 128, 25, 25, 25, 256] + - [111, 4950.0] + - - [25, 256, 18, 128, 25, 25, 25, 256] + - [117, 3206.0] + - - [25, 256, 19, 128, 25, 25, 25, 256] + - [89, 2256.0] + - - [9, 256, 120, 128, 9, 9, 9, 256] + - [137, 2198.0] + - - [9, 256, 18, 128, 9, 9, 9, 256] + - [153, 912.0] + - - [9, 256, 19, 128, 9, 9, 9, 256] + - [149, 947.0] + - - [1024, 2, 1, 20, 1024, 1024, 1024, 2] + - [83, 10.0] +- null +- null +- DeviceEfficiency +... diff --git a/library/src/blas3/Tensile/Logic/asm_full/navi22_Cijk_Ailk_Bjlk_HBH_GB.yaml b/library/src/blas3/Tensile/Logic/asm_full/navi22_Cijk_Ailk_Bjlk_HBH_GB.yaml new file mode 100644 index 000000000..6fc46d70b --- /dev/null +++ b/library/src/blas3/Tensile/Logic/asm_full/navi22_Cijk_Ailk_Bjlk_HBH_GB.yaml @@ -0,0 +1,40898 @@ +--- +- {MinimumRequiredVersion: 4.28.0} +- navi22 +- gfx1031 +- [Device 73df] +- AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] +- - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 0 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_GB_MT128x128x8_SN_SU0_SUM0_TT8_16_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 64 + LSPA: 8 + LSPB: 16 + LVCA: 16 + LVCB: 8 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 1 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_GB_MT128x64x16_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 2 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_GB_MT128x128x16_SN_SU0_SUM0_TT8_16_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 3 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_GB_MT128x128x16_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 4 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_GB_MT128x128x32_SN_SU0_SUM0_TT8_16_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 5 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_GB_MT128x128x32_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 6 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_GB_MT128x128x8_SN_SU32_SUM3_TT8_16_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 64 + LSPA: 8 + LSPB: 16 + LVCA: 16 + LVCB: 8 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 7 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_GB_MT128x64x16_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 8 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_GB_MT128x128x16_SN_SU32_SUM3_TT8_16_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 9 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_GB_MT128x128x16_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 10 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_GB_MT128x128x32_SN_SU32_SUM3_TT8_16_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 11 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_GB_MT128x128x32_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 64 + LSPA: 8 + LSPB: 16 + LVCA: 16 + LVCB: 8 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 12 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_GB_MT128x64x16_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 13 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_GB_MT128x128x16_SN_SU0_SUM0_TT8_16_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 14 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_GB_MT128x128x16_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 15 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_GB_MT128x128x32_SN_SU0_SUM0_TT8_16_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 16 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_GB_MT128x128x32_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 17 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_GB_MT128x128x8_SN_SU32_SUM3_TT8_16_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 64 + LSPA: 8 + LSPB: 16 + LVCA: 16 + LVCB: 8 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 18 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_GB_MT128x64x16_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 19 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_GB_MT128x128x16_SN_SU32_SUM3_TT8_16_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 20 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_GB_MT128x128x16_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 21 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_GB_MT128x128x8_SN_SU0_SUM0_TT8_16_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 64 + LSPA: 8 + LSPB: 16 + LVCA: 16 + LVCB: 8 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 22 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_GB_MT128x64x16_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 23 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_GB_MT128x128x16_SN_SU0_SUM0_TT8_16_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 24 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_GB_MT128x128x16_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 64 + LSPA: 8 + LSPB: 16 + LVCA: 16 + LVCB: 8 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 25 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_GB_MT128x64x32_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 26 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_GB_MT128x128x32_SN_SU0_SUM0_TT8_16_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 27 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_GB_MT128x128x32_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 28 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_GB_MT128x128x8_SN_SU32_SUM3_TT8_16_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 64 + LSPA: 8 + LSPB: 16 + LVCA: 16 + LVCB: 8 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 29 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_GB_MT128x64x16_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 30 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_GB_MT128x128x16_SN_SU32_SUM3_TT8_16_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 31 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_GB_MT128x128x16_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 64 + LSPA: 8 + LSPB: 16 + LVCA: 16 + LVCB: 8 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 32 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_GB_MT128x64x32_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 33 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_GB_MT128x128x32_SN_SU32_SUM3_TT8_16_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 34 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_GB_MT128x128x32_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 256 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 28672 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 8192 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 20480 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 256 + MacroTileA: 128 + MacroTileB: 256 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 35 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_GB_MT128x256x32_SN_SU32_SUM3_TT8_16_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 36 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_GB_MT128x128x16_SN_SU0_SUM0_TT8_16_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 16 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 37 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_GB_MT128x128x32_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 16 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 38 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_GB_MT128x128x16_SN_SU32_SUM3_TT8_16_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 16 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 39 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_GB_MT128x128x16_SN_SU0_SUM0_TT8_16_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 16 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 40 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_GB_MT128x128x16_SN_SU32_SUM3_TT8_16_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 16 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 41 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_GB_MT128x128x16_SN_SU32_SUM3_TT8_16_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 16 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 42 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_GB_MT64x64x8_SN_SU0_SUM0_TT8_8_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 43 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_GB_MT64x64x16_SN_SU0_SUM0_TT8_8_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 64 + LSPA: 8 + LSPB: 16 + LVCA: 16 + LVCB: 8 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 44 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_GB_MT128x64x16_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 45 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_GB_MT128x128x16_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 46 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_GB_MT64x64x32_SN_SU0_SUM0_TT8_8_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 64 + LSPA: 8 + LSPB: 16 + LVCA: 16 + LVCB: 8 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 47 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_GB_MT128x64x32_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 48 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_GB_MT128x128x32_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 49 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_GB_MT64x64x8_SN_SU32_SUM3_TT8_8_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 50 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_GB_MT64x64x16_SN_SU32_SUM3_TT8_8_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 64 + LSPA: 8 + LSPB: 16 + LVCA: 16 + LVCB: 8 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 51 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_GB_MT128x64x16_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 52 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_GB_MT128x128x16_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 53 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_GB_MT64x64x8_SN_SU0_SUM0_TT8_8_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 54 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_GB_MT64x64x16_SN_SU0_SUM0_TT8_8_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 64 + LSPA: 8 + LSPB: 16 + LVCA: 16 + LVCB: 8 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 55 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_GB_MT128x64x16_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 56 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_GB_MT128x128x16_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 64 + LSPA: 8 + LSPB: 16 + LVCA: 16 + LVCB: 8 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 57 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_GB_MT128x64x32_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 58 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_GB_MT128x128x32_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 59 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_GB_MT64x64x8_SN_SU32_SUM3_TT8_8_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 60 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_GB_MT64x64x16_SN_SU32_SUM3_TT8_8_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 64 + LSPA: 8 + LSPB: 16 + LVCA: 16 + LVCB: 8 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 61 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_GB_MT128x64x16_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 62 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_GB_MT128x128x16_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 63 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_GB_MT64x64x32_SN_SU32_SUM3_TT8_8_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 64 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_GB_MT64x64x8_SN_SU0_SUM0_TT8_8_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 65 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_GB_MT64x64x16_SN_SU0_SUM0_TT8_8_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 64 + LSPA: 8 + LSPB: 16 + LVCA: 16 + LVCB: 8 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 66 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_GB_MT128x64x16_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 67 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_GB_MT128x128x16_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 64 + LSPA: 8 + LSPB: 16 + LVCA: 16 + LVCB: 8 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 68 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_GB_MT128x64x32_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 69 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_GB_MT128x128x32_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 70 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_GB_MT64x64x8_SN_SU32_SUM3_TT8_8_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 71 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_GB_MT64x64x16_SN_SU32_SUM3_TT8_8_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 64 + LSPA: 8 + LSPB: 16 + LVCA: 16 + LVCB: 8 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 72 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_GB_MT128x64x16_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 73 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_GB_MT128x128x16_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 64 + LSPA: 8 + LSPB: 16 + LVCA: 16 + LVCB: 8 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 74 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_GB_MT128x64x32_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 75 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_GB_MT128x128x32_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 76 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_GB_MT64x64x16_SN_SU0_SUM0_TT8_8_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 16 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 64 + LSPA: 8 + LSPB: 16 + LVCA: 16 + LVCB: 8 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 77 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_GB_MT128x64x16_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 16 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 78 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_GB_MT128x128x16_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 16 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 79 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_GB_MT64x64x8_SN_SU32_SUM3_TT8_8_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 16 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 64 + LSPA: 8 + LSPB: 16 + LVCA: 16 + LVCB: 8 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 80 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_GB_MT128x64x16_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 16 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 64 + LSPA: 8 + LSPB: 16 + LVCA: 16 + LVCB: 8 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 81 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_GB_MT128x64x32_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 16 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 64 + LSPA: 8 + LSPB: 16 + LVCA: 16 + LVCB: 8 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 82 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_GB_MT128x64x16_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 16 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 83 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_GB_MT64x64x16_SN_SU32_SUM3_TT8_8_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 16 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 819 + LdsNumElementsAlignedA: 128 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 256 + LdsOffsetB: 128 + LdsOffsetB_Blk: 384 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 84 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_GB_MT16x16x8_SN_SU0_SUM0_TT2_2_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 85 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_GB_MT32x32x8_SN_SU0_SUM0_TT4_4_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 896 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 86 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_GB_MT32x16x8_SN_SU0_SUM0_TT2_2_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 87 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_GB_MT64x32x8_SN_SU0_SUM0_TT4_4_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 88 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_GB_MT32x32x8_SN_SU0_SUM0_TT2_2_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 89 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_GB_MT16x16x16_SN_SU0_SUM0_TT2_2_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 90 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_GB_MT32x32x16_SN_SU0_SUM0_TT4_4_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 8 + LSPB: 16 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 91 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_GB_MT32x16x16_SN_SU0_SUM0_TT2_2_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 92 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_GB_MT64x32x16_SN_SU0_SUM0_TT4_4_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 93 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_GB_MT32x32x16_SN_SU0_SUM0_TT2_2_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 94 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_GB_MT64x64x16_SN_SU0_SUM0_TT4_4_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 95 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_GB_MT16x16x32_SN_SU0_SUM0_TT2_2_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 96 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_GB_MT32x32x32_SN_SU0_SUM0_TT4_4_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 8 + LSPB: 16 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 97 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_GB_MT32x16x32_SN_SU0_SUM0_TT2_2_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 98 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_GB_MT64x32x32_SN_SU0_SUM0_TT4_4_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 99 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_GB_MT32x32x32_SN_SU0_SUM0_TT2_2_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 100 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_GB_MT64x64x32_SN_SU0_SUM0_TT4_4_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 101 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_GB_MT32x32x8_SN_SU32_SUM3_TT4_4_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 102 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_GB_MT64x32x8_SN_SU32_SUM3_TT4_4_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 103 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_GB_MT16x16x16_SN_SU32_SUM3_TT2_2_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 104 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_GB_MT32x32x16_SN_SU32_SUM3_TT4_4_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 8 + LSPB: 16 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 105 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_GB_MT32x16x16_SN_SU32_SUM3_TT2_2_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 106 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_GB_MT64x32x16_SN_SU32_SUM3_TT4_4_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 107 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_GB_MT64x64x16_SN_SU32_SUM3_TT4_4_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 108 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_GB_MT16x16x32_SN_SU32_SUM3_TT2_2_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 8 + LSPB: 16 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 109 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_GB_MT32x16x32_SN_SU32_SUM3_TT2_2_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 110 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_GB_MT64x32x32_SN_SU32_SUM3_TT4_4_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 111 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_GB_MT32x32x32_SN_SU32_SUM3_TT2_2_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 112 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_GB_MT64x64x32_SN_SU32_SUM3_TT4_4_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 113 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_GB_MT32x32x8_SN_SU0_SUM0_TT4_4_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 896 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 114 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_GB_MT32x16x8_SN_SU0_SUM0_TT2_2_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 115 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_GB_MT64x32x8_SN_SU0_SUM0_TT4_4_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 116 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_GB_MT32x32x8_SN_SU0_SUM0_TT2_2_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 117 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_GB_MT16x16x16_SN_SU0_SUM0_TT2_2_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 118 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_GB_MT32x32x16_SN_SU0_SUM0_TT4_4_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 8 + LSPB: 16 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 119 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_GB_MT32x16x16_SN_SU0_SUM0_TT2_2_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 120 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_GB_MT64x32x16_SN_SU0_SUM0_TT4_4_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 121 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_GB_MT64x64x16_SN_SU0_SUM0_TT4_4_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 122 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_GB_MT32x32x32_SN_SU0_SUM0_TT4_4_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 8 + LSPB: 16 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 123 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_GB_MT32x16x32_SN_SU0_SUM0_TT2_2_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 124 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_GB_MT64x32x32_SN_SU0_SUM0_TT4_4_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 125 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_GB_MT32x32x32_SN_SU0_SUM0_TT2_2_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 126 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_GB_MT64x64x32_SN_SU0_SUM0_TT4_4_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 127 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_GB_MT64x32x8_SN_SU32_SUM3_TT4_4_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 128 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_GB_MT32x32x8_SN_SU32_SUM3_TT2_2_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 129 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_GB_MT16x16x16_SN_SU32_SUM3_TT2_2_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 130 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_GB_MT32x32x16_SN_SU32_SUM3_TT4_4_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 131 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_GB_MT64x32x16_SN_SU32_SUM3_TT4_4_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 132 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_GB_MT32x32x16_SN_SU32_SUM3_TT2_2_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 8 + LSPB: 16 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 133 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_GB_MT32x16x32_SN_SU32_SUM3_TT2_2_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 134 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_GB_MT32x32x32_SN_SU32_SUM3_TT2_2_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 135 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_GB_MT64x64x32_SN_SU32_SUM3_TT4_4_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 819 + LdsNumElementsAlignedA: 128 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 256 + LdsOffsetB: 128 + LdsOffsetB_Blk: 384 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 136 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_GB_MT16x16x8_SN_SU0_SUM0_TT2_2_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 137 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_GB_MT32x32x8_SN_SU0_SUM0_TT4_4_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 896 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 138 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_GB_MT32x16x8_SN_SU0_SUM0_TT2_2_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 139 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_GB_MT64x32x8_SN_SU0_SUM0_TT4_4_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 140 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_GB_MT32x32x8_SN_SU0_SUM0_TT2_2_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 141 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_GB_MT32x32x16_SN_SU0_SUM0_TT4_4_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 8 + LSPB: 16 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 142 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_GB_MT32x16x16_SN_SU0_SUM0_TT2_2_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 143 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_GB_MT64x32x16_SN_SU0_SUM0_TT4_4_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 144 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_GB_MT32x32x16_SN_SU0_SUM0_TT2_2_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 145 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_GB_MT32x32x32_SN_SU0_SUM0_TT4_4_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 8 + LSPB: 16 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 146 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_GB_MT32x16x32_SN_SU0_SUM0_TT2_2_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 147 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_GB_MT64x32x32_SN_SU0_SUM0_TT4_4_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 148 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_GB_MT32x32x32_SN_SU0_SUM0_TT2_2_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 149 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_GB_MT64x64x32_SN_SU0_SUM0_TT4_4_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 819 + LdsNumElementsAlignedA: 128 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 256 + LdsOffsetB: 128 + LdsOffsetB_Blk: 384 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 150 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_GB_MT16x16x8_SN_SU32_SUM3_TT2_2_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 151 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_GB_MT32x32x8_SN_SU32_SUM3_TT4_4_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 152 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_GB_MT32x32x16_SN_SU32_SUM3_TT4_4_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 8 + LSPB: 16 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 153 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_GB_MT32x16x16_SN_SU32_SUM3_TT2_2_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 154 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_GB_MT64x32x16_SN_SU32_SUM3_TT4_4_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 155 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_GB_MT32x32x16_SN_SU32_SUM3_TT2_2_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 156 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_GB_MT32x32x32_SN_SU32_SUM3_TT4_4_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 8 + LSPB: 16 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 157 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_GB_MT32x16x32_SN_SU32_SUM3_TT2_2_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 158 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_GB_MT64x32x32_SN_SU32_SUM3_TT4_4_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 159 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_GB_MT32x32x32_SN_SU32_SUM3_TT2_2_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 160 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_GB_MT64x64x32_SN_SU32_SUM3_TT4_4_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 161 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_GB_MT32x32x16_SN_SU0_SUM0_TT4_4_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 16 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 162 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_GB_MT16x16x32_SN_SU0_SUM0_TT2_2_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 16 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 8 + LSPB: 16 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 163 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_GB_MT32x16x32_SN_SU0_SUM0_TT2_2_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 16 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 164 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_GB_MT16x16x32_SN_SU32_SUM3_TT2_2_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 16 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 165 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_GB_MT32x32x32_SN_SU0_SUM0_TT4_4_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 16 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 8 + LSPB: 16 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 166 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_GB_MT32x16x32_SN_SU0_SUM0_TT2_2_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 16 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 8 + LSPB: 16 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 167 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_GB_MT32x16x32_SN_SU0_SUM0_TT2_2_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 16 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 168 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_GB_MT16x16x32_SN_SU32_SUM3_TT2_2_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 16 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 8 + LSPA: 4 + LSPB: 8 + LVCA: 16 + LVCB: 8 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 832 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 169 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_GB_MT32x8x8_SN_SU0_SUM0_TT2_2_WG16_4_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 4 + LSPB: 32 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 6400 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 8 + MacroTileA: 64 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 8 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 170 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_GB_MT64x8x32_SN_SU0_SUM0_TT2_2_WG32_4_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 32 + SubGroup1: 4 + SubGroupA: 32 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 8 + LSPA: 4 + LSPB: 8 + LVCA: 16 + LVCB: 8 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 832 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 171 + SolutionNameMin: Cijk_Ailk_Bjlk_HBH_GB_MT32x8x8_SN_SU0_SUM0_TT2_2_WG16_4_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 +- [2, 3, 0, 1] +- - - [4096, 7133, 1, 4096, 4096, 4096, 4096, 7133] + - [19, 23740.0] + - - [2048, 7133, 1, 2048, 2048, 2048, 2048, 7133] + - [13, 23537.0] + - - [2560, 7133, 1, 2560, 2560, 2560, 2560, 7133] + - [19, 23743.0] + - - [3072, 7435, 1, 1024, 3072, 3072, 3072, 7435] + - [23, 23301.0] + - - [1760, 7133, 1, 1760, 1760, 1760, 1760, 7133] + - [30, 22742.0] + - - [7680, 5481, 1, 2560, 7680, 7680, 7680, 5481] + - [13, 23836.0] + - - [3136, 256, 64, 64, 3136, 3136, 3136, 256] + - [0, 14882.0] + - - [784, 512, 64, 128, 784, 784, 784, 512] + - [30, 18756.0] + - - [784, 128, 64, 512, 784, 784, 784, 128] + - [2, 19357.0] + - - [196, 256, 128, 1024, 196, 196, 196, 256] + - [23, 17341.0] + - - [196, 256, 64, 1024, 196, 196, 196, 256] + - [19, 16908.0] + - - [196, 1024, 128, 256, 196, 196, 196, 1024] + - [19, 16755.0] + - - [784, 128, 256, 512, 784, 784, 784, 128] + - [23, 20264.0] + - - [3136, 256, 256, 64, 3136, 3136, 3136, 256] + - [0, 15396.0] + - - [784, 128, 128, 512, 784, 784, 784, 128] + - [23, 19958.0] + - - [784, 512, 128, 128, 784, 784, 784, 512] + - [8, 18985.0] + - - [784, 512, 256, 128, 784, 784, 784, 512] + - [8, 19256.0] + - - [196, 1024, 64, 256, 196, 196, 196, 1024] + - [23, 16448.0] + - - [196, 1024, 256, 256, 196, 196, 196, 1024] + - [30, 17003.0] + - - [196, 256, 256, 1024, 196, 196, 196, 256] + - [2, 17674.0] + - - [3136, 256, 128, 64, 3136, 3136, 3136, 256] + - [0, 18124.0] + - - [1024, 4096, 1, 2048, 1024, 1024, 1024, 4096] + - [2, 22848.0] + - - [1024, 4096, 1, 4096, 1024, 1024, 1024, 4096] + - [2, 23077.0] + - - [1024, 30528, 1, 2048, 1024, 1024, 1024, 30528] + - [13, 23772.0] + - - [1024, 30528, 1, 4096, 1024, 1024, 1024, 30528] + - [19, 23839.0] + - - [4096, 1024, 1, 2048, 4096, 4096, 4096, 1024] + - [2, 22810.0] + - - [4096, 1024, 1, 4096, 4096, 4096, 4096, 1024] + - [8, 23053.0] + - - [256, 4864, 1, 8976, 256, 256, 256, 4864] + - [19, 21204.0] + - - [256, 5120, 1, 8976, 256, 256, 256, 5120] + - [19, 22161.0] + - - [256, 5632, 1, 8976, 256, 256, 256, 5632] + - [30, 19684.0] + - - [256, 5888, 1, 8976, 256, 256, 256, 5888] + - [30, 20692.0] + - - [256, 6144, 1, 8976, 256, 256, 256, 6144] + - [19, 21617.0] + - - [256, 7168, 1, 8976, 256, 256, 256, 7168] + - [30, 21277.0] + - - [256, 8192, 1, 8976, 256, 256, 256, 8192] + - [30, 21159.0] + - - [256, 8960, 1, 8976, 256, 256, 256, 8960] + - [30, 23017.0] + - - [256, 9728, 1, 8976, 256, 256, 256, 9728] + - [8, 22038.0] + - - [256, 9984, 1, 8976, 256, 256, 256, 9984] + - [8, 22611.0] + - - [256, 10240, 1, 8976, 256, 256, 256, 10240] + - [19, 23107.0] + - - [256, 10496, 1, 8976, 256, 256, 256, 10496] + - [8, 21327.0] + - - [256, 11008, 1, 8976, 256, 256, 256, 11008] + - [13, 22267.0] + - - [256, 11264, 1, 8976, 256, 256, 256, 11264] + - [30, 22679.0] + - - [256, 11520, 1, 8976, 256, 256, 256, 11520] + - [19, 23279.0] + - - [256, 11776, 1, 8976, 256, 256, 256, 11776] + - [8, 21658.0] + - - [256, 12544, 1, 8976, 256, 256, 256, 12544] + - [19, 22899.0] + - - [256, 12800, 1, 8976, 256, 256, 256, 12800] + - [19, 23319.0] + - - [256, 13312, 1, 8976, 256, 256, 256, 13312] + - [13, 22253.0] + - - [256, 13568, 1, 8976, 256, 256, 256, 13568] + - [13, 22634.0] + - - [256, 14336, 1, 8976, 256, 256, 256, 14336] + - [2, 22013.0] + - - [256, 14848, 1, 8976, 256, 256, 256, 14848] + - [8, 22780.0] + - - [256, 15104, 1, 8976, 256, 256, 256, 15104] + - [19, 23126.0] + - - [256, 15872, 1, 8976, 256, 256, 256, 15872] + - [23, 22585.0] + - - [256, 16128, 1, 8976, 256, 256, 256, 16128] + - [30, 22912.0] + - - [256, 17152, 1, 8976, 256, 256, 256, 17152] + - [8, 22695.0] + - - [256, 17408, 1, 8976, 256, 256, 256, 17408] + - [30, 23009.0] + - - [256, 18688, 1, 8976, 256, 256, 256, 18688] + - [8, 23053.0] + - - [256, 19968, 1, 8976, 256, 256, 256, 19968] + - [19, 23144.0] + - - [256, 20480, 1, 8976, 256, 256, 256, 20480] + - [30, 23642.0] + - - [256, 20992, 1, 8976, 256, 256, 256, 20992] + - [13, 22974.0] + - - [256, 21248, 1, 8976, 256, 256, 256, 21248] + - [8, 23216.0] + - - [256, 22016, 1, 8976, 256, 256, 256, 22016] + - [2, 22794.0] + - - [256, 26112, 1, 8976, 256, 256, 256, 26112] + - [19, 23225.0] + - - [256, 32512, 1, 8976, 256, 256, 256, 32512] + - [23, 23412.0] + - - [256, 32768, 1, 1, 256, 256, 256, 32768] + - [1, 437.0] + - - [256, 33536, 1, 8976, 256, 256, 256, 33536] + - [2, 23299.0] + - - [256, 44505, 1, 8976, 256, 256, 256, 44505] + - [19, 23739.0] + - - [768, 2048, 1, 256, 768, 768, 768, 2048] + - [3, 18154.0] + - - [1600, 1024, 1, 512, 1600, 1600, 1600, 1024] + - [2, 17282.0] + - - [1600, 1024, 1, 960, 1600, 1600, 1600, 1024] + - [8, 18075.0] + - - [2048, 960, 1, 1, 2048, 2048, 2048, 960] + - [1, 284.0] + - - [2048, 2048, 1, 512, 2048, 2048, 2048, 2048] + - [13, 22148.0] + - - [2048, 2048, 1, 960, 2048, 2048, 2048, 2048] + - [13, 22613.0] + - - [2048, 2048, 1, 1024, 2048, 2048, 2048, 2048] + - [2, 22608.0] + - - [3200, 2048, 1, 1024, 3200, 3200, 3200, 2048] + - [30, 23320.0] + - - [4096, 4096, 1, 1024, 4096, 4096, 4096, 4096] + - [2, 23570.0] + - - [1024, 4096, 1, 3840, 1024, 1024, 1024, 4096] + - [2, 23113.0] + - - [1024, 4096, 1, 3968, 1024, 1024, 1024, 4096] + - [23, 22936.0] + - - [1024, 4096, 1, 6528, 1024, 1024, 1024, 4096] + - [2, 23136.0] + - - [1024, 4096, 1, 7104, 1024, 1024, 1024, 4096] + - [23, 22694.0] + - - [1024, 4096, 1, 7200, 1024, 1024, 1024, 4096] + - [23, 22689.0] + - - [1024, 4096, 1, 8064, 1024, 1024, 1024, 4096] + - [33, 22182.0] + - - [1024, 4096, 1, 8160, 1024, 1024, 1024, 4096] + - [26, 22131.0] + - - [1024, 4096, 1, 9216, 1024, 1024, 1024, 4096] + - [30, 23194.0] + - - [1024, 4096, 1, 9520, 1024, 1024, 1024, 4096] + - [23, 23185.0] + - - [1024, 4096, 1, 10064, 1024, 1024, 1024, 4096] + - [30, 23187.0] + - - [1024, 4096, 1, 10080, 1024, 1024, 1024, 4096] + - [30, 23179.0] + - - [1024, 4096, 1, 10200, 1024, 1024, 1024, 4096] + - [13, 23164.0] + - - [1024, 42720, 1, 3968, 1024, 1024, 1024, 42720] + - [2, 23989.0] + - - [1024, 42720, 1, 6528, 1024, 1024, 1024, 42720] + - [13, 24011.0] + - - [1024, 42720, 1, 7104, 1024, 1024, 1024, 42720] + - [8, 24051.0] + - - [1024, 42720, 1, 7200, 1024, 1024, 1024, 42720] + - [8, 24026.0] + - - [1024, 42720, 1, 9520, 1024, 1024, 1024, 42720] + - [2, 23520.0] + - - [1024, 42720, 1, 10080, 1024, 1024, 1024, 42720] + - [2, 23506.0] + - - [4096, 1024, 1, 3840, 4096, 4096, 4096, 1024] + - [2, 22918.0] + - - [4096, 1024, 1, 3968, 4096, 4096, 4096, 1024] + - [19, 22914.0] + - - [4096, 1024, 1, 6528, 4096, 4096, 4096, 1024] + - [2, 23045.0] + - - [4096, 1024, 1, 7104, 4096, 4096, 4096, 1024] + - [2, 23104.0] + - - [4096, 1024, 1, 7200, 4096, 4096, 4096, 1024] + - [15, 22558.0] + - - [4096, 1024, 1, 8064, 4096, 4096, 4096, 1024] + - [2, 23107.0] + - - [4096, 1024, 1, 8160, 4096, 4096, 4096, 1024] + - [2, 23105.0] + - - [4096, 1024, 1, 9216, 4096, 4096, 4096, 1024] + - [13, 23121.0] + - - [4096, 1024, 1, 9520, 4096, 4096, 4096, 1024] + - [19, 23105.0] + - - [4096, 1024, 1, 10064, 4096, 4096, 4096, 1024] + - [13, 23110.0] + - - [4096, 1024, 1, 10080, 4096, 4096, 4096, 1024] + - [2, 23102.0] + - - [4096, 1024, 1, 10200, 4096, 4096, 4096, 1024] + - [2, 23098.0] + - - [1024, 4096, 1, 3240, 1024, 1024, 1024, 4096] + - [13, 22945.0] + - - [1024, 4096, 1, 3960, 1024, 1024, 1024, 4096] + - [19, 22735.0] + - - [1024, 42720, 1, 3960, 1024, 1024, 1024, 42720] + - [30, 24060.0] + - - [4096, 1024, 1, 3240, 4096, 4096, 4096, 1024] + - [8, 22939.0] + - - [4096, 1024, 1, 3960, 4096, 4096, 4096, 1024] + - [19, 22870.0] + - - [1225, 192, 64, 32, 1225, 1225, 1225, 192] + - [29, 9144.0] + - - [1225, 192, 64, 48, 1225, 1225, 1225, 192] + - [29, 15327.0] + - - [1225, 192, 64, 64, 1225, 1225, 1225, 192] + - [18, 15971.0] + - - [1225, 256, 64, 48, 1225, 1225, 1225, 256] + - [7, 15819.0] + - - [1225, 256, 64, 64, 1225, 1225, 1225, 256] + - [7, 16924.0] + - - [1225, 288, 64, 48, 1225, 1225, 1225, 288] + - [7, 12608.0] + - - [1225, 288, 64, 64, 1225, 1225, 1225, 288] + - [22, 14387.0] + - - [289, 768, 64, 128, 289, 289, 289, 768] + - [19, 15354.0] + - - [289, 768, 64, 160, 289, 289, 289, 768] + - [19, 15907.0] + - - [289, 768, 64, 192, 289, 289, 289, 768] + - [19, 16221.0] + - - [1225, 192, 32, 32, 1225, 1225, 1225, 192] + - [25, 8967.0] + - - [1225, 192, 32, 48, 1225, 1225, 1225, 192] + - [7, 13156.0] + - - [1225, 192, 32, 64, 1225, 1225, 1225, 192] + - [1, 15579.0] + - - [1225, 256, 32, 48, 1225, 1225, 1225, 256] + - [7, 12663.0] + - - [1225, 256, 32, 64, 1225, 1225, 1225, 256] + - [22, 15858.0] + - - [1225, 288, 32, 48, 1225, 1225, 1225, 288] + - [29, 12053.0] + - - [1225, 288, 32, 64, 1225, 1225, 1225, 288] + - [1, 14497.0] + - - [289, 768, 32, 128, 289, 289, 289, 768] + - [2, 14677.0] + - - [289, 768, 32, 160, 289, 289, 289, 768] + - [2, 15432.0] + - - [289, 768, 32, 192, 289, 289, 289, 768] + - [2, 15596.0] + - - [3136, 256, 32, 64, 3136, 3136, 3136, 256] + - [8, 19339.0] + - - [784, 128, 32, 512, 784, 784, 784, 128] + - [19, 18120.0] + - - [784, 512, 32, 128, 784, 784, 784, 512] + - [13, 18285.0] + - - [196, 1024, 32, 256, 196, 196, 196, 1024] + - [30, 15794.0] + - - [3136, 128, 64, 256, 3136, 3136, 3136, 128] + - [19, 22160.0] + - - [784, 256, 64, 512, 784, 784, 784, 256] + - [2, 20063.0] + - - [3136, 256, 64, 128, 3136, 3136, 3136, 256] + - [2, 21516.0] + - - [3136, 256, 64, 256, 3136, 3136, 3136, 256] + - [2, 22506.0] + - - [196, 512, 64, 1024, 196, 196, 196, 512] + - [23, 17469.0] + - - [784, 512, 64, 256, 784, 784, 784, 512] + - [2, 19774.0] + - - [784, 512, 64, 512, 784, 784, 784, 512] + - [13, 20322.0] + - - [196, 1024, 64, 512, 196, 196, 196, 1024] + - [23, 17258.0] + - - [196, 1024, 64, 1024, 196, 196, 196, 1024] + - [2, 17719.0] + - - [3136, 128, 32, 256, 3136, 3136, 3136, 128] + - [2, 21867.0] + - - [784, 256, 32, 512, 784, 784, 784, 256] + - [23, 19460.0] + - - [3136, 256, 32, 128, 3136, 3136, 3136, 256] + - [2, 21109.0] + - - [3136, 256, 32, 256, 3136, 3136, 3136, 256] + - [8, 22309.0] + - - [196, 512, 32, 1024, 196, 196, 196, 512] + - [13, 17042.0] + - - [784, 512, 32, 256, 784, 784, 784, 512] + - [2, 19490.0] + - - [784, 512, 32, 512, 784, 784, 784, 512] + - [13, 20115.0] + - - [196, 1024, 32, 512, 196, 196, 196, 1024] + - [8, 16973.0] + - - [196, 1024, 32, 1024, 196, 196, 196, 1024] + - [19, 17401.0] + - - [7680, 8192, 1, 8192, 7680, 7680, 7680, 8192] + - [13, 23875.0] + - - [3840, 4096, 1, 4096, 3840, 3840, 3840, 4096] + - [2, 23913.0] + - - [1920, 2048, 1, 2048, 1920, 1920, 1920, 2048] + - [23, 22808.0] + - - [8192, 7680, 1, 8192, 8192, 8192, 8192, 7680] + - [19, 23892.0] + - - [4096, 3840, 1, 4096, 4096, 4096, 4096, 3840] + - [2, 23894.0] + - - [2048, 1920, 1, 2048, 2048, 2048, 2048, 1920] + - [2, 22737.0] + - - [8192, 8192, 1, 8192, 8192, 8192, 8192, 8192] + - [13, 23880.0] + - - [4096, 4096, 1, 4096, 4096, 4096, 4096, 4096] + - [2, 23758.0] + - - [2048, 2048, 1, 2048, 2048, 2048, 2048, 2048] + - [13, 22421.0] + - - [1024, 4096, 1, 512, 1024, 1024, 1024, 4096] + - [2, 21424.0] + - - [1024, 30522, 1, 77, 1024, 1024, 1024, 30522] + - [12, 17346.0] + - - [4096, 1024, 1, 512, 4096, 4096, 4096, 1024] + - [19, 22217.0] + - - [1024, 4096, 1, 1280, 1024, 1024, 1024, 4096] + - [8, 22842.0] + - - [1024, 30522, 1, 200, 1024, 1024, 1024, 30522] + - [17, 22416.0] + - - [4096, 1024, 1, 1280, 4096, 4096, 4096, 1024] + - [2, 22861.0] + - - [1024, 4096, 1, 4992, 1024, 1024, 1024, 4096] + - [2, 23104.0] + - - [1024, 30522, 1, 780, 1024, 1024, 1024, 30522] + - [8, 23458.0] + - - [4096, 1024, 1, 4992, 4096, 4096, 4096, 1024] + - [8, 23159.0] + - - [1024, 30522, 1, 308, 1024, 1024, 1024, 30522] + - [23, 22737.0] + - - [1024, 4096, 1, 5120, 1024, 1024, 1024, 4096] + - [2, 23169.0] + - - [1024, 30522, 1, 800, 1024, 1024, 1024, 30522] + - [19, 23658.0] + - - [4096, 1024, 1, 5120, 4096, 4096, 4096, 1024] + - [8, 23185.0] + - - [1024, 4096, 1, 5248, 1024, 1024, 1024, 4096] + - [2, 23158.0] + - - [1024, 30522, 1, 820, 1024, 1024, 1024, 30522] + - [8, 23502.0] + - - [4096, 1024, 1, 5248, 4096, 4096, 4096, 1024] + - [8, 23186.0] + - - [1024, 4096, 1, 2560, 1024, 1024, 1024, 4096] + - [19, 23111.0] + - - [1024, 30522, 1, 385, 1024, 1024, 1024, 30522] + - [13, 22852.0] + - - [4096, 1024, 1, 2560, 4096, 4096, 4096, 1024] + - [13, 23059.0] + - - [1024, 4096, 1, 3072, 1024, 1024, 1024, 4096] + - [13, 22851.0] + - - [1024, 30522, 1, 462, 1024, 1024, 1024, 30522] + - [30, 23093.0] + - - [4096, 1024, 1, 3072, 4096, 4096, 4096, 1024] + - [13, 23103.0] + - - [1024, 4096, 1, 1024, 1024, 1024, 1024, 4096] + - [2, 22192.0] + - - [1024, 30522, 1, 160, 1024, 1024, 1024, 30522] + - [19, 21897.0] + - - [4096, 1024, 1, 1024, 4096, 4096, 4096, 1024] + - [8, 22689.0] + - - [1024, 4096, 1, 1152, 1024, 1024, 1024, 4096] + - [19, 22790.0] + - - [1024, 30522, 1, 180, 1024, 1024, 1024, 30522] + - [19, 21853.0] + - - [4096, 1024, 1, 1152, 4096, 4096, 4096, 1024] + - [2, 22854.0] + - - [1024, 4096, 1, 8192, 1024, 1024, 1024, 4096] + - [13, 23217.0] + - - [1024, 4096, 1, 9600, 1024, 1024, 1024, 4096] + - [30, 23233.0] + - - [1024, 33712, 1, 8192, 1024, 1024, 1024, 33712] + - [13, 24032.0] + - - [1024, 33712, 1, 9600, 1024, 1024, 1024, 33712] + - [8, 24041.0] + - - [4096, 1024, 1, 8192, 4096, 4096, 4096, 1024] + - [2, 23163.0] + - - [4096, 1024, 1, 9600, 4096, 4096, 4096, 1024] + - [2, 23197.0] + - - [1024, 1600, 1, 1, 1024, 1024, 1024, 1600] + - [12, 251.0] + - - [2560, 1920, 1, 2048, 2560, 2560, 2560, 1920] + - [23, 23410.0] + - - [1024, 3072, 1, 4096, 1024, 1024, 1024, 3072] + - [2, 22384.0] + - - [2560, 2560, 1, 2048, 2560, 2560, 2560, 2560] + - [19, 23601.0] + - - [2048, 2048, 1, 2, 2048, 2048, 2048, 2048] + - [18, 625.0] + - - [1024, 30592, 1, 2048, 1024, 1024, 1024, 30592] + - [13, 24003.0] + - - [1024, 3072, 1, 16384, 1024, 1024, 1024, 3072] + - [13, 22474.0] + - - [6144, 1536, 1, 4096, 6144, 6144, 6144, 1536] + - [2, 23708.0] + - - [1536, 4608, 1, 8192, 1536, 1536, 1536, 4608] + - [13, 23472.0] + - - [640, 2560, 1, 2048, 640, 640, 640, 2560] + - [19, 21520.0] + - - [1024, 4096, 1, 16384, 1024, 1024, 1024, 4096] + - [30, 23280.0] + - - [1536, 6144, 1, 4096, 1536, 1536, 1536, 6144] + - [2, 23799.0] + - - [1024, 30592, 1, 4096, 1024, 1024, 1024, 30592] + - [13, 24064.0] + - - [2560, 2560, 1, 4, 2560, 2560, 2560, 2560] + - [14, 1270.0] + - - [1536, 1536, 1, 4096, 1536, 1536, 1536, 1536] + - [2, 20871.0] + - - [2560, 7680, 1, 2048, 2560, 2560, 2560, 7680] + - [2, 23922.0] + - - [1536, 50304, 1, 4096, 1536, 1536, 1536, 50304] + - [19, 24158.0] + - - [2048, 8192, 1, 1024, 2048, 2048, 2048, 8192] + - [19, 23500.0] + - - [1024, 30592, 1, 8192, 1024, 1024, 1024, 30592] + - [2, 24071.0] + - - [4096, 1024, 1, 16384, 4096, 4096, 4096, 1024] + - [19, 23172.0] + - - [8192, 2048, 1, 1024, 8192, 8192, 8192, 2048] + - [19, 23580.0] + - - [1024, 50304, 1, 4096, 1024, 1024, 1024, 50304] + - [2, 24110.0] + - - [1536, 4608, 1, 4096, 1536, 1536, 1536, 4608] + - [2, 23280.0] + - - [6144, 1536, 1, 8192, 6144, 6144, 6144, 1536] + - [2, 23796.0] + - - [1024, 3072, 1, 8192, 1024, 1024, 1024, 3072] + - [2, 22468.0] + - - [1536, 1536, 1, 8192, 1536, 1536, 1536, 1536] + - [13, 20901.0] + - - [1536, 50304, 1, 8192, 1536, 1536, 1536, 50304] + - [23, 23848.0] + - - [2048, 6144, 1, 1024, 2048, 2048, 2048, 6144] + - [13, 23351.0] + - - [2048, 30592, 1, 1024, 2048, 2048, 2048, 30592] + - [13, 23959.0] + - - [1536, 6144, 1, 8192, 1536, 1536, 1536, 6144] + - [2, 23742.0] + - - [1024, 50304, 1, 2048, 1024, 1024, 1024, 50304] + - [2, 24076.0] + - - [1024, 50304, 1, 8192, 1024, 1024, 1024, 50304] + - [21, 23613.0] + - - [1024, 3072, 1, 2048, 1024, 1024, 1024, 3072] + - [8, 21985.0] + - - [1024, 50304, 1, 16384, 1024, 1024, 1024, 50304] + - [2, 23839.0] + - - [1024, 30528, 1, 8192, 1024, 1024, 1024, 30528] + - [19, 23979.0] + - - [256, 6912, 1, 1, 256, 256, 256, 6912] + - [1, 251.0] + - - [30528, 1024, 1, 640, 30528, 30528, 30528, 1024] + - [2, 23545.0] + - - [30528, 1024, 1, 1280, 30528, 30528, 30528, 1024] + - [13, 23822.0] + - - [4096, 1024, 1, 10240, 4096, 4096, 4096, 1024] + - [8, 23147.0] + - - [1024, 4096, 1, 10240, 1024, 1024, 1024, 4096] + - [2, 23107.0] + - - [30528, 1024, 1, 1600, 30528, 30528, 30528, 1024] + - [30, 23874.0] + - - [1024, 4096, 1, 10496, 1024, 1024, 1024, 4096] + - [19, 23177.0] + - - [30528, 1024, 1, 1640, 30528, 30528, 30528, 1024] + - [23, 23798.0] + - - [4096, 1024, 1, 10496, 4096, 4096, 4096, 1024] + - [19, 23170.0] + - - [30528, 1024, 1, 160, 30528, 30528, 30528, 1024] + - [19, 22083.0] + - - [1024, 4096, 1, 6144, 1024, 1024, 1024, 4096] + - [2, 23121.0] + - - [30528, 1024, 1, 240, 30528, 30528, 30528, 1024] + - [13, 22785.0] + - - [4096, 1024, 1, 6144, 4096, 4096, 4096, 1024] + - [2, 23125.0] + - - [1024, 4096, 1, 10224, 1024, 1024, 1024, 4096] + - [19, 23183.0] + - - [4096, 1024, 1, 10224, 4096, 4096, 4096, 1024] + - [19, 23093.0] + - - [1024, 3072, 1, 10224, 1024, 1024, 1024, 3072] + - [2, 22305.0] + - - [1024, 3072, 1, 10240, 1024, 1024, 1024, 3072] + - [23, 21999.0] + - - [4096, 1024, 1, 10192, 4096, 4096, 4096, 1024] + - [2, 23080.0] + - - [1024, 3072, 1, 10192, 1024, 1024, 1024, 3072] + - [2, 22272.0] + - - [1024, 4096, 1, 10192, 1024, 1024, 1024, 4096] + - [30, 23212.0] + - - [1024, 3072, 1, 10200, 1024, 1024, 1024, 3072] + - [2, 22456.0] + - - [4096, 1024, 1, 10208, 4096, 4096, 4096, 1024] + - [23, 23199.0] + - - [1024, 3072, 1, 10208, 1024, 1024, 1024, 3072] + - [2, 22437.0] + - - [1024, 4096, 1, 10208, 1024, 1024, 1024, 4096] + - [13, 23171.0] + - - [1024, 2048, 1, 10224, 1024, 1024, 1024, 2048] + - [2, 21109.0] + - - [1024, 2048, 1, 10240, 1024, 1024, 1024, 2048] + - [8, 21004.0] + - - [1024, 2048, 1, 10192, 1024, 1024, 1024, 2048] + - [8, 20992.0] + - - [1024, 3072, 1, 10080, 1024, 1024, 1024, 3072] + - [23, 22073.0] + - - [100352, 256, 1, 512, 100352, 100352, 100352, 256] + - [13, 23289.0] + - - [12544, 1024, 1, 2048, 12544, 12544, 12544, 1024] + - [2, 23445.0] + - - [12544, 147, 1, 64, 12544, 12544, 12544, 147] + - [12, 10156.0] + - - [200704, 256, 1, 512, 200704, 200704, 200704, 256] + - [23, 23597.0] + - - [25088, 512, 1, 1024, 25088, 25088, 25088, 512] + - [8, 23270.0] + - - [3136, 576, 1, 64, 3136, 3136, 3136, 576] + - [1, 13107.0] + - - [50176, 512, 1, 1024, 50176, 50176, 50176, 512] + - [13, 23678.0] + - - [6272, 1024, 1, 2048, 6272, 6272, 6272, 1024] + - [8, 23087.0] + - - [3136, 256, 128, 128, 3136, 3136, 3136, 256] + - [2, 21710.0] + - - [3136, 256, 256, 128, 3136, 3136, 3136, 256] + - [8, 21915.0] + - - [784, 512, 128, 256, 784, 784, 784, 512] + - [8, 20019.0] + - - [784, 512, 256, 256, 784, 784, 784, 512] + - [2, 20172.0] + - - [30528, 1024, 1, 2560, 30528, 30528, 30528, 1024] + - [13, 23848.0] + - - [1024, 4096, 1, 12288, 1024, 1024, 1024, 4096] + - [19, 23233.0] + - - [30528, 1024, 1, 1920, 30528, 30528, 30528, 1024] + - [23, 23809.0] + - - [4096, 1024, 1, 12288, 4096, 4096, 4096, 1024] + - [30, 23215.0] + - - [25600, 128, 25, 128, 25600, 25600, 25600, 128] + - [0, 20662.0] + - - [12544, 128, 36, 128, 12544, 12544, 12544, 128] + - [26, 21307.0] + - - [9216, 128, 49, 128, 9216, 9216, 9216, 128] + - [4, 20813.0] + - - [6400, 128, 64, 128, 6400, 6400, 6400, 128] + - [33, 21449.0] + - - [6400, 256, 25, 256, 6400, 6400, 6400, 256] + - [23, 23106.0] + - - [4096, 256, 36, 256, 4096, 4096, 4096, 256] + - [2, 22932.0] + - - [2304, 256, 49, 256, 2304, 2304, 2304, 256] + - [19, 22830.0] + - - [2304, 256, 64, 256, 2304, 2304, 2304, 256] + - [8, 23007.0] + - - [2304, 512, 25, 512, 2304, 2304, 2304, 512] + - [23, 23553.0] + - - [1024, 512, 36, 512, 1024, 1024, 1024, 512] + - [13, 23273.0] + - - [1024, 512, 49, 512, 1024, 1024, 1024, 512] + - [13, 23382.0] + - - [1024, 512, 64, 512, 1024, 1024, 1024, 512] + - [2, 23488.0] + - - [3072, 768, 1, 2048, 3072, 3072, 3072, 768] + - [8, 20622.0] + - - [768, 3072, 1, 2048, 768, 768, 768, 3072] + - [13, 20679.0] + - - [3072, 768, 1, 4608, 3072, 3072, 3072, 768] + - [8, 20940.0] + - - [768, 3072, 1, 4608, 768, 768, 768, 3072] + - [8, 20924.0] + - - [4096, 1024, 1, 4608, 4096, 4096, 4096, 1024] + - [2, 23105.0] + - - [1024, 4096, 1, 4608, 1024, 1024, 1024, 4096] + - [30, 22902.0] + - - [196, 1024, 128, 512, 196, 196, 196, 1024] + - [13, 17563.0] + - - [196, 1024, 256, 512, 196, 196, 196, 1024] + - [19, 17714.0] + - - [4880, 256, 49, 256, 4880, 4880, 4880, 256] + - [23, 22548.0] + - - [3128, 256, 64, 256, 3128, 3128, 3128, 256] + - [13, 22495.0] + - - [4680, 256, 49, 256, 4680, 4680, 4680, 256] + - [19, 22763.0] + - - [5280, 256, 36, 256, 5280, 5280, 5280, 256] + - [23, 22525.0] + - - [2640, 256, 64, 256, 2640, 2640, 2640, 256] + - [23, 22427.0] + - - [5304, 256, 49, 256, 5304, 5304, 5304, 256] + - [2, 22754.0] + - - [2760, 256, 64, 256, 2760, 2760, 2760, 256] + - [2, 22445.0] + - - [6440, 256, 36, 256, 6440, 6440, 6440, 256] + - [30, 22715.0] + - - [5704, 256, 36, 256, 5704, 5704, 5704, 256] + - [19, 22768.0] + - - [2128, 256, 64, 256, 2128, 2128, 2128, 256] + - [2, 22273.0] + - - [1160, 256, 49, 256, 1160, 1160, 1160, 256] + - [8, 20203.0] + - - [4056, 256, 49, 256, 4056, 4056, 4056, 256] + - [8, 22753.0] + - - [6144, 256, 36, 256, 6144, 6144, 6144, 256] + - [2, 23123.0] + - - [6336, 256, 36, 256, 6336, 6336, 6336, 256] + - [2, 22819.0] + - - [13600, 512, 2, 128, 13600, 13600, 13600, 512] + - [8, 20570.0] + - - [15200, 512, 2, 128, 15200, 15200, 15200, 512] + - [8, 21132.0] + - - [15200, 128, 2, 512, 15200, 15200, 15200, 128] + - [2, 21367.0] + - - [13600, 128, 2, 512, 13600, 13600, 13600, 128] + - [2, 20723.0] + - - [5632, 256, 36, 256, 5632, 5632, 5632, 256] + - [30, 23101.0] + - - [12288, 128, 2, 512, 12288, 12288, 12288, 128] + - [8, 21137.0] + - - [12880, 128, 2, 512, 12880, 12880, 12880, 128] + - [23, 20538.0] + - - [11408, 128, 2, 512, 11408, 11408, 11408, 128] + - [8, 20585.0] + - - [13824, 512, 2, 128, 13824, 13824, 13824, 512] + - [13, 21479.0] + - - [13824, 128, 2, 512, 13824, 13824, 13824, 128] + - [2, 22086.0] + - - [10560, 128, 2, 512, 10560, 10560, 10560, 128] + - [2, 19245.0] + - - [10752, 128, 2, 512, 10752, 10752, 10752, 128] + - [8, 20658.0] + - - [13600, 512, 2, 256, 13600, 13600, 13600, 512] + - [13, 21451.0] + - - [15200, 512, 2, 256, 15200, 15200, 15200, 512] + - [23, 21601.0] + - - [768, 2048, 2, 512, 768, 768, 768, 2048] + - [30, 20559.0] + - - [12880, 512, 2, 128, 12880, 12880, 12880, 512] + - [8, 19585.0] + - - [11616, 128, 2, 512, 11616, 11616, 11616, 128] + - [2, 18649.0] + - - [14208, 512, 2, 128, 14208, 14208, 14208, 512] + - [30, 20784.0] + - - [11408, 512, 2, 128, 11408, 11408, 11408, 512] + - [3, 19170.0] + - - [6912, 256, 36, 256, 6912, 6912, 6912, 256] + - [2, 23191.0] + - - [13824, 512, 2, 256, 13824, 13824, 13824, 512] + - [2, 22306.0] + - - [11616, 512, 2, 128, 11616, 11616, 11616, 512] + - [2, 20920.0] + - - [12288, 512, 2, 128, 12288, 12288, 12288, 512] + - [2, 21521.0] + - - [14208, 128, 2, 512, 14208, 14208, 14208, 128] + - [30, 20948.0] + - - [11968, 128, 2, 512, 11968, 11968, 11968, 128] + - [30, 20034.0] + - - [864, 2048, 2, 512, 864, 864, 864, 2048] + - [23, 19890.0] + - - [10560, 512, 2, 128, 10560, 10560, 10560, 512] + - [8, 19870.0] + - - [672, 2048, 2, 512, 672, 672, 672, 2048] + - [23, 18322.0] + - - [9408, 128, 2, 512, 9408, 9408, 9408, 128] + - [2, 19377.0] + - - [10752, 512, 2, 128, 10752, 10752, 10752, 512] + - [10, 20484.0] + - - [11968, 512, 2, 128, 11968, 11968, 11968, 512] + - [14, 19419.0] + - - [1240, 256, 49, 256, 1240, 1240, 1240, 256] + - [8, 21499.0] + - - [4032, 256, 2, 1024, 4032, 4032, 4032, 256] + - [8, 19480.0] + - - [888, 2048, 2, 512, 888, 888, 888, 2048] + - [2, 20747.0] + - - [12880, 512, 2, 256, 12880, 12880, 12880, 512] + - [13, 21800.0] + - - [12288, 512, 2, 256, 12288, 12288, 12288, 512] + - [19, 22382.0] + - - [13440, 128, 2, 512, 13440, 13440, 13440, 128] + - [8, 21494.0] + - - [864, 2048, 2, 256, 864, 864, 864, 2048] + - [23, 18851.0] + - - [12672, 128, 2, 512, 12672, 12672, 12672, 128] + - [30, 22105.0] + - - [11264, 128, 2, 512, 11264, 11264, 11264, 128] + - [8, 21667.0] + - - [11776, 128, 2, 512, 11776, 11776, 11776, 128] + - [19, 20449.0] + - - [16128, 128, 2, 512, 16128, 16128, 16128, 128] + - [30, 21593.0] + - - [4032, 1024, 2, 256, 4032, 4032, 4032, 1024] + - [8, 20636.0] + - - [14000, 128, 2, 512, 14000, 14000, 14000, 128] + - [2, 20303.0] + - - [13440, 512, 2, 128, 13440, 13440, 13440, 512] + - [19, 20803.0] + - - [768, 2048, 2, 256, 768, 768, 768, 2048] + - [2, 19083.0] + - - [3264, 1024, 2, 256, 3264, 3264, 3264, 1024] + - [2, 20368.0] + - - [4200, 256, 2, 1024, 4200, 4200, 4200, 256] + - [13, 19580.0] + - - [2352, 1024, 2, 256, 2352, 2352, 2352, 1024] + - [30, 18735.0] + - - [2400, 1024, 2, 256, 2400, 2400, 2400, 1024] + - [2, 19187.0] + - - [15200, 256, 2, 12, 15200, 15200, 15200, 256] + - [31, 4268.0] + - - [12880, 256, 2, 12, 12880, 12880, 12880, 256] + - [1, 4441.0] + - - [2520, 1024, 2, 256, 2520, 2520, 2520, 1024] + - [30, 19940.0] + - - [13600, 256, 2, 12, 13600, 13600, 13600, 256] + - [7, 4571.0] + - - [15200, 256, 2, 3, 15200, 15200, 15200, 256] + - [7, 1207.0] + - - [12880, 256, 2, 3, 12880, 12880, 12880, 256] + - [5, 1113.0] + - - [4200, 1024, 2, 256, 4200, 4200, 4200, 1024] + - [8, 21379.0] + - - [12288, 256, 2, 12, 12288, 12288, 12288, 256] + - [1, 3544.0] + - - [13824, 256, 2, 12, 13824, 13824, 13824, 256] + - [34, 4217.0] + - - [13600, 256, 2, 3, 13600, 13600, 13600, 256] + - [1, 1070.0] + - - [7600, 512, 1, 256, 7600, 7600, 7600, 512] + - [2, 20430.0] + - - [6144, 512, 1, 256, 6144, 6144, 6144, 512] + - [19, 20295.0] + - - [12544, 1024, 1, 1024, 12544, 12544, 12544, 1024] + - [19, 23327.0] + - - [3800, 256, 2, 3, 3800, 3800, 3800, 256] + - [7, 817.0] + - - [13824, 256, 2, 3, 13824, 13824, 13824, 256] + - [27, 988.0] + - - [12288, 256, 2, 3, 12288, 12288, 12288, 256] + - [1, 887.0] + - - [2688, 256, 2, 1024, 2688, 2688, 2688, 256] + - [13, 18317.0] + - - [3072, 256, 2, 12, 3072, 3072, 3072, 256] + - [20, 3457.0] + - - [3800, 256, 2, 12, 3800, 3800, 3800, 256] + - [32, 2527.0] + - - [3072, 256, 2, 3, 3072, 3072, 3072, 256] + - [14, 600.0] + - - [2520, 256, 2, 1024, 2520, 2520, 2520, 256] + - [8, 19602.0] + - - [16128, 512, 2, 128, 16128, 16128, 16128, 512] + - [19, 21739.0] + - - [2400, 256, 2, 1024, 2400, 2400, 2400, 256] + - [2, 19193.0] + - - [2352, 256, 2, 1024, 2352, 2352, 2352, 256] + - [2, 18925.0] + - - [2944, 256, 2, 1024, 2944, 2944, 2944, 256] + - [23, 19983.0] + - - [2992, 1024, 2, 256, 2992, 2992, 2992, 1024] + - [2, 20668.0] + - - [2816, 256, 2, 1024, 2816, 2816, 2816, 256] + - [13, 19184.0] + - - [2904, 1024, 2, 256, 2904, 2904, 2904, 1024] + - [2, 20743.0] + - - [3456, 256, 2, 3, 3456, 3456, 3456, 256] + - [3, 882.0] + - - [3400, 256, 2, 3, 3400, 3400, 3400, 256] + - [1, 811.0] + - - [2816, 1024, 2, 256, 2816, 2816, 2816, 1024] + - [23, 21623.0] + - - [3456, 256, 2, 12, 3456, 3456, 3456, 256] + - [1, 3349.0] + - - [2944, 1024, 2, 256, 2944, 2944, 2944, 1024] + - [30, 21503.0] + - - [3168, 256, 2, 1024, 3168, 3168, 3168, 256] + - [2, 20526.0] + - - [2992, 256, 2, 1024, 2992, 2992, 2992, 256] + - [2, 19747.0] + - - [51520, 256, 2, 12, 51520, 51520, 51520, 256] + - [16, 7430.0] + - - [3072, 256, 2, 1024, 3072, 3072, 3072, 256] + - [23, 19499.0] + - - [2640, 1024, 2, 256, 2640, 2640, 2640, 1024] + - [8, 20391.0] + - - [2688, 1024, 2, 256, 2688, 2688, 2688, 1024] + - [33, 20947.0] + - - [2904, 256, 2, 1024, 2904, 2904, 2904, 256] + - [8, 18876.0] + - - [3264, 256, 2, 1024, 3264, 3264, 3264, 256] + - [19, 17886.0] + - - [54400, 256, 2, 12, 54400, 54400, 54400, 256] + - [6, 6774.0] + - - [55296, 256, 2, 3, 55296, 55296, 55296, 256] + - [29, 2109.0] + - - [60800, 256, 2, 12, 60800, 60800, 60800, 256] + - [24, 8881.0] + - - [51520, 256, 2, 3, 51520, 51520, 51520, 256] + - [18, 1644.0] + - - [55296, 256, 2, 12, 55296, 55296, 55296, 256] + - [22, 6195.0] + - - [3600, 1024, 2, 256, 3600, 3600, 3600, 1024] + - [13, 20551.0] + - - [60800, 256, 2, 3, 60800, 60800, 60800, 256] + - [22, 2796.0] + - - [952, 256, 64, 256, 952, 952, 952, 256] + - [8, 20644.0] + - - [49152, 256, 2, 12, 49152, 49152, 49152, 256] + - [12, 5653.0] + - - [3360, 256, 2, 1024, 3360, 3360, 3360, 256] + - [8, 19082.0] + - - [736, 256, 64, 256, 736, 736, 736, 256] + - [8, 21009.0] + - - [600, 256, 64, 256, 600, 600, 600, 256] + - [13, 20302.0] + - - [1440, 256, 49, 256, 1440, 1440, 1440, 256] + - [8, 21102.0] + - - [3168, 1024, 2, 256, 3168, 3168, 3168, 1024] + - [2, 21432.0] + - - [1368, 256, 49, 256, 1368, 1368, 1368, 256] + - [19, 21647.0] + - - [49152, 256, 2, 3, 49152, 49152, 49152, 256] + - [12, 1480.0] + - - [3600, 256, 2, 1024, 3600, 3600, 3600, 256] + - [2, 20502.0] + - - [3360, 1024, 2, 256, 3360, 3360, 3360, 1024] + - [2, 20867.0] + - - [54400, 256, 2, 3, 54400, 54400, 54400, 256] + - [14, 2110.0] + - - [3072, 1024, 2, 256, 3072, 3072, 3072, 1024] + - [2, 21344.0] + - - [2640, 256, 2, 1024, 2640, 2640, 2640, 256] + - [2, 17578.0] + - - [616, 256, 64, 256, 616, 616, 616, 256] + - [23, 20643.0] + - - [3008, 256, 64, 256, 3008, 3008, 3008, 256] + - [19, 22452.0] + - - [896, 256, 64, 256, 896, 896, 896, 256] + - [23, 22560.0] + - - [768, 256, 64, 256, 768, 768, 768, 256] + - [2, 22451.0] + - - [3552, 256, 2, 1024, 3552, 3552, 3552, 256] + - [2, 20124.0] + - - [3552, 1024, 2, 256, 3552, 3552, 3552, 1024] + - [0, 21014.0] + - - [800, 256, 64, 256, 800, 800, 800, 256] + - [30, 19815.0] + - - [1120, 256, 49, 256, 1120, 1120, 1120, 256] + - [19, 21460.0] + - - [2408, 256, 64, 256, 2408, 2408, 2408, 256] + - [2, 22587.0] + - - [3456, 256, 2, 1024, 3456, 3456, 3456, 256] + - [19, 19734.0] + - - [672, 256, 64, 256, 672, 672, 672, 256] + - [2, 19216.0] + - - [3456, 1024, 2, 256, 3456, 3456, 3456, 1024] + - [2, 22016.0] + - - [1064, 256, 49, 256, 1064, 1064, 1064, 256] + - [8, 20514.0] + - - [3400, 256, 2, 1024, 3400, 3400, 3400, 256] + - [2, 19351.0] + - - [704, 256, 64, 256, 704, 704, 704, 256] + - [8, 20156.0] + - - [3400, 1024, 2, 256, 3400, 3400, 3400, 1024] + - [2, 21166.0] + - - [3264, 256, 64, 256, 3264, 3264, 3264, 256] + - [19, 22528.0] + - - [3800, 1024, 2, 256, 3800, 3800, 3800, 1024] + - [28, 21235.0] + - - [3800, 256, 2, 1024, 3800, 3800, 3800, 256] + - [30, 21427.0] + - - [6440, 512, 1, 256, 6440, 6440, 6440, 512] + - [2, 18326.0] + - - [6912, 512, 1, 256, 6912, 6912, 6912, 512] + - [19, 21001.0] + - - [6800, 512, 1, 256, 6800, 6800, 6800, 512] + - [2, 19675.0] + - - [6800, 512, 1, 1024, 6800, 6800, 6800, 512] + - [13, 21578.0] + - - [6440, 512, 1, 1024, 6440, 6440, 6440, 512] + - [13, 20167.0] + - - [6912, 512, 1, 1024, 6912, 6912, 6912, 512] + - [13, 21944.0] + - - [1728, 1024, 1, 512, 1728, 1728, 1728, 1024] + - [2, 17463.0] + - - [1536, 1024, 1, 512, 1536, 1536, 1536, 1024] + - [8, 18711.0] + - - [7600, 512, 1, 1024, 7600, 7600, 7600, 512] + - [13, 21738.0] + - - [6144, 512, 1, 1024, 6144, 6144, 6144, 512] + - [30, 21395.0] + - - [1728, 1024, 1, 2048, 1728, 1728, 1728, 1024] + - [8, 19661.0] + - - [1536, 1024, 1, 2048, 1536, 1536, 1536, 1024] + - [13, 20774.0] + - - [4524, 256, 49, 256, 4524, 4524, 4524, 256] + - [2, 22566.0] + - - [2666, 256, 64, 256, 2666, 2666, 2666, 256] + - [2, 22659.0] + - - [950, 2048, 2, 512, 950, 950, 950, 2048] + - [8, 20003.0] + - - [3220, 1024, 2, 256, 3220, 3220, 3220, 1024] + - [0, 20889.0] + - - [782, 128, 64, 128, 782, 782, 782, 128] + - [23, 17097.0] + - - [850, 2048, 2, 512, 850, 850, 850, 2048] + - [2, 19615.0] + - - [805, 2048, 2, 512, 805, 805, 805, 2048] + - [2, 18658.0] + - - [713, 2048, 2, 512, 713, 713, 713, 2048] + - [2, 19419.0] + - - [660, 2048, 2, 512, 660, 660, 660, 2048] + - [2, 17276.0] + - - [726, 2048, 2, 512, 726, 726, 726, 2048] + - [2, 18965.0] + - - [805, 2048, 2, 256, 805, 805, 805, 2048] + - [2, 16429.0] + - - [1251, 256, 49, 256, 1251, 1251, 1251, 256] + - [8, 21636.0] + - - [1900, 1024, 1, 2048, 1900, 1900, 1900, 1024] + - [2, 21762.0] + - - [1610, 1024, 1, 2048, 1610, 1610, 1610, 1024] + - [2, 18743.0] + - - [1900, 1024, 1, 512, 1900, 1900, 1900, 1024] + - [8, 18746.0] + - - [3220, 256, 2, 12, 3220, 3220, 3220, 256] + - [31, 1940.0] + - - [3220, 256, 2, 3, 3220, 3220, 3220, 256] + - [31, 574.0] + - - [3036, 1024, 2, 256, 3036, 3036, 3036, 1024] + - [2, 20246.0] + - - [3036, 256, 2, 1024, 3036, 3036, 3036, 256] + - [8, 18859.0] + - - [850, 2048, 2, 256, 850, 850, 850, 2048] + - [6, 17367.0] + - - [2852, 1024, 2, 256, 2852, 2852, 2852, 1024] + - [2, 19837.0] + - - [950, 2048, 2, 256, 950, 950, 950, 2048] + - [2, 18225.0] + - - [3700, 1024, 2, 256, 3700, 3700, 3700, 1024] + - [23, 20275.0] + - - [2852, 256, 2, 1024, 2852, 2852, 2852, 256] + - [2, 17998.0] + - - [3700, 256, 2, 1024, 3700, 3700, 3700, 256] + - [2, 20028.0] + - - [1269, 256, 49, 256, 1269, 1269, 1269, 256] + - [13, 21867.0] + - - [1467, 256, 49, 256, 1467, 1467, 1467, 256] + - [13, 21332.0] + - - [3500, 256, 2, 1024, 3500, 3500, 3500, 256] + - [8, 19782.0] + - - [1449, 256, 49, 256, 1449, 1449, 1449, 256] + - [13, 21082.0] + - - [1278, 256, 49, 256, 1278, 1278, 1278, 256] + - [2, 22205.0] + - - [1413, 256, 49, 256, 1413, 1413, 1413, 256] + - [8, 20655.0] + - - [1341, 256, 49, 256, 1341, 1341, 1341, 256] + - [30, 21118.0] + - - [1287, 256, 49, 256, 1287, 1287, 1287, 256] + - [13, 20323.0] + - - [1332, 256, 49, 256, 1332, 1332, 1332, 256] + - [8, 21038.0] + - - [1359, 256, 49, 256, 1359, 1359, 1359, 256] + - [2, 21675.0] + - - [1395, 256, 49, 256, 1395, 1395, 1395, 256] + - [2, 22129.0] + - - [1323, 256, 49, 256, 1323, 1323, 1323, 256] + - [23, 20830.0] + - - [1404, 256, 49, 256, 1404, 1404, 1404, 256] + - [19, 22151.0] + - - [1386, 256, 49, 256, 1386, 1386, 1386, 256] + - [8, 21962.0] + - - [1350, 256, 49, 256, 1350, 1350, 1350, 256] + - [30, 21354.0] + - - [3500, 1024, 2, 256, 3500, 3500, 3500, 1024] + - [2, 20786.0] + - - [3220, 256, 2, 1024, 3220, 3220, 3220, 256] + - [13, 17865.0] + - - [690, 256, 64, 256, 690, 690, 690, 256] + - [2, 19682.0] + - - [660, 256, 64, 256, 660, 660, 660, 256] + - [2, 18950.0] + - - [782, 256, 64, 256, 782, 782, 782, 256] + - [2, 19500.0] + - - [884, 256, 64, 256, 884, 884, 884, 256] + - [2, 21960.0] + - - [1610, 1024, 1, 512, 1610, 1610, 1610, 1024] + - [2, 17361.0] + - - [1700, 1024, 1, 512, 1700, 1700, 1700, 1024] + - [2, 18461.0] + - - [1700, 1024, 1, 2048, 1700, 1700, 1700, 1024] + - [8, 19372.0] + - - [1444, 128, 120, 256, 1444, 1444, 1444, 128] + - [23, 20908.0] + - - [1444, 128, 18, 256, 1444, 1444, 1444, 128] + - [23, 18532.0] + - - [1444, 128, 19, 256, 1444, 1444, 1444, 128] + - [8, 18871.0] + - - [1444, 256, 120, 256, 1444, 1444, 1444, 256] + - [19, 21365.0] + - - [1444, 256, 18, 256, 1444, 1444, 1444, 256] + - [2, 19830.0] + - - [1444, 256, 19, 256, 1444, 1444, 1444, 256] + - [8, 20377.0] + - - [361, 512, 120, 256, 361, 361, 361, 512] + - [30, 20650.0] + - - [361, 512, 18, 256, 361, 361, 361, 512] + - [19, 18459.0] + - - [361, 512, 19, 256, 361, 361, 361, 512] + - [19, 18544.0] + - - [1920, 25216, 1, 16384, 1920, 1920, 1920, 25216] + - [23, 23879.0] + - - [3840, 1920, 1, 16384, 3840, 3840, 3840, 1920] + - [8, 23448.0] + - - [1920, 3840, 1, 16384, 1920, 1920, 1920, 3840] + - [19, 23419.0] + - - [960, 1920, 1, 16384, 960, 960, 960, 1920] + - [2, 21274.0] + - - [1920, 2880, 1, 16384, 1920, 1920, 1920, 2880] + - [30, 22434.0] + - - [1920, 25216, 1, 4096, 1920, 1920, 1920, 25216] + - [13, 24132.0] + - - [3840, 1920, 1, 4096, 3840, 3840, 3840, 1920] + - [2, 23346.0] + - - [1920, 3840, 1, 4096, 1920, 1920, 1920, 3840] + - [2, 23346.0] + - - [960, 1920, 1, 4096, 960, 960, 960, 1920] + - [2, 20767.0] + - - [1920, 2880, 1, 4096, 1920, 1920, 1920, 2880] + - [13, 22103.0] + - - [1920, 25216, 1, 8192, 1920, 1920, 1920, 25216] + - [13, 23847.0] + - - [3840, 1920, 1, 8192, 3840, 3840, 3840, 1920] + - [23, 23407.0] + - - [1920, 3840, 1, 8192, 1920, 1920, 1920, 3840] + - [19, 23419.0] + - - [960, 1920, 1, 8192, 960, 960, 960, 1920] + - [2, 21116.0] + - - [1920, 2880, 1, 8192, 1920, 1920, 1920, 2880] + - [13, 22069.0] + - - [2304, 12672, 1, 16384, 2304, 2304, 2304, 12672] + - [8, 23619.0] + - - [2304, 2304, 1, 16384, 2304, 2304, 2304, 2304] + - [13, 22762.0] + - - [576, 2304, 1, 16384, 576, 576, 576, 2304] + - [2, 18401.0] + - - [2304, 1728, 1, 16384, 2304, 2304, 2304, 1728] + - [13, 22080.0] + - - [2304, 12672, 1, 4096, 2304, 2304, 2304, 12672] + - [2, 23982.0] + - - [2304, 2304, 1, 4096, 2304, 2304, 2304, 2304] + - [8, 22603.0] + - - [576, 2304, 1, 4096, 576, 576, 576, 2304] + - [30, 17928.0] + - - [2304, 1728, 1, 4096, 2304, 2304, 2304, 1728] + - [23, 21787.0] + - - [2304, 12672, 1, 8192, 2304, 2304, 2304, 12672] + - [2, 23568.0] + - - [2304, 2304, 1, 8192, 2304, 2304, 2304, 2304] + - [2, 22653.0] + - - [576, 2304, 1, 8192, 576, 576, 576, 2304] + - [2, 18175.0] + - - [2304, 1728, 1, 8192, 2304, 2304, 2304, 1728] + - [13, 21895.0] + - - [3072, 6400, 1, 4096, 3072, 3072, 3072, 6400] + - [8, 23970.0] + - - [1536, 3072, 1, 4096, 1536, 1536, 1536, 3072] + - [2, 22651.0] + - - [3072, 1536, 1, 4096, 3072, 3072, 3072, 1536] + - [23, 22536.0] + - - [384, 3072, 1, 4096, 384, 384, 384, 3072] + - [19, 19667.0] + - - [3072, 1152, 1, 4096, 3072, 3072, 3072, 1152] + - [30, 22712.0] + - - [3072, 6400, 1, 8192, 3072, 3072, 3072, 6400] + - [2, 24000.0] + - - [1536, 3072, 1, 8192, 1536, 1536, 1536, 3072] + - [2, 22736.0] + - - [3072, 1536, 1, 8192, 3072, 3072, 3072, 1536] + - [2, 22752.0] + - - [384, 3072, 1, 8192, 384, 384, 384, 3072] + - [35, 19765.0] + - - [3072, 1152, 1, 8192, 3072, 3072, 3072, 1152] + - [2, 23005.0] + - - [2048, 2048, 1, 4096, 2048, 2048, 2048, 2048] + - [19, 22954.0] + - - [2048, 2048, 1, 8, 2048, 2048, 2048, 2048] + - [6, 2561.0] + - - [2048, 29000, 1, 199, 2048, 2048, 2048, 29000] + - [19, 22132.0] + - - [2048, 29000, 1, 221, 2048, 2048, 2048, 29000] + - [19, 22282.0] + - - [2048, 29000, 1, 224, 2048, 2048, 2048, 29000] + - [13, 23018.0] + - - [2048, 29000, 1, 229, 2048, 2048, 2048, 29000] + - [13, 22420.0] + - - [2048, 29000, 1, 234, 2048, 2048, 2048, 29000] + - [8, 22488.0] + - - [2048, 29000, 1, 242, 2048, 2048, 2048, 29000] + - [8, 22583.0] + - - [2048, 29000, 1, 246, 2048, 2048, 2048, 29000] + - [13, 22646.0] + - - [2048, 29000, 1, 247, 2048, 2048, 2048, 29000] + - [8, 22516.0] + - - [2048, 29000, 1, 256, 2048, 2048, 2048, 29000] + - [19, 23189.0] + - - [2048, 29000, 1, 262, 2048, 2048, 2048, 29000] + - [8, 22671.0] + - - [2048, 29000, 1, 264, 2048, 2048, 2048, 29000] + - [6, 22925.0] + - - [2048, 29000, 1, 265, 2048, 2048, 2048, 29000] + - [13, 22595.0] + - - [2048, 29000, 1, 274, 2048, 2048, 2048, 29000] + - [8, 22775.0] + - - [2048, 29000, 1, 277, 2048, 2048, 2048, 29000] + - [13, 22673.0] + - - [2048, 29000, 1, 279, 2048, 2048, 2048, 29000] + - [19, 22711.0] + - - [2048, 29000, 1, 288, 2048, 2048, 2048, 29000] + - [19, 23330.0] + - - [2048, 29000, 1, 296, 2048, 2048, 2048, 29000] + - [6, 23041.0] + - - [2048, 29000, 1, 315, 2048, 2048, 2048, 29000] + - [8, 22862.0] + - - [2048, 29000, 1, 335, 2048, 2048, 2048, 29000] + - [13, 22935.0] + - - [2048, 4096, 1, 4096, 2048, 2048, 2048, 4096] + - [2, 23558.0] + - - [4096, 2048, 1, 4096, 4096, 4096, 4096, 2048] + - [2, 23536.0] + - - [1024, 29000, 1, 2283, 1024, 1024, 1024, 29000] + - [13, 23850.0] + - - [1024, 29000, 1, 2296, 1024, 1024, 1024, 29000] + - [23, 23885.0] + - - [1024, 29000, 1, 2306, 1024, 1024, 1024, 29000] + - [2, 23897.0] + - - [1024, 29000, 1, 2309, 1024, 1024, 1024, 29000] + - [2, 23890.0] + - - [1024, 29000, 1, 2318, 1024, 1024, 1024, 29000] + - [2, 23893.0] + - - [1024, 29000, 1, 2320, 1024, 1024, 1024, 29000] + - [2, 23961.0] + - - [1024, 29000, 1, 2324, 1024, 1024, 1024, 29000] + - [2, 23899.0] + - - [1024, 29000, 1, 2325, 1024, 1024, 1024, 29000] + - [23, 23890.0] + - - [1024, 29000, 1, 2329, 1024, 1024, 1024, 29000] + - [13, 23889.0] + - - [1024, 29000, 1, 2338, 1024, 1024, 1024, 29000] + - [19, 23905.0] + - - [1024, 29000, 1, 2345, 1024, 1024, 1024, 29000] + - [23, 23890.0] + - - [1024, 29000, 1, 2350, 1024, 1024, 1024, 29000] + - [2, 23904.0] + - - [1024, 29000, 1, 2362, 1024, 1024, 1024, 29000] + - [2, 23912.0] + - - [1024, 29000, 1, 2366, 1024, 1024, 1024, 29000] + - [2, 23900.0] + - - [1024, 29000, 1, 2368, 1024, 1024, 1024, 29000] + - [13, 23969.0] + - - [1024, 29000, 1, 2374, 1024, 1024, 1024, 29000] + - [8, 23905.0] + - - [1024, 29000, 1, 2390, 1024, 1024, 1024, 29000] + - [13, 23912.0] + - - [1024, 29000, 1, 561, 1024, 1024, 1024, 29000] + - [2, 23261.0] + - - [1024, 29000, 1, 574, 1024, 1024, 1024, 29000] + - [8, 23332.0] + - - [1024, 29000, 1, 600, 1024, 1024, 1024, 29000] + - [13, 23393.0] + - - [1024, 29000, 1, 608, 1024, 1024, 1024, 29000] + - [13, 23641.0] + - - [1024, 29000, 1, 615, 1024, 1024, 1024, 29000] + - [19, 23370.0] + - - [1024, 29000, 1, 622, 1024, 1024, 1024, 29000] + - [13, 23418.0] + - - [1024, 29000, 1, 625, 1024, 1024, 1024, 29000] + - [19, 23388.0] + - - [1024, 29000, 1, 626, 1024, 1024, 1024, 29000] + - [23, 23423.0] + - - [1024, 29000, 1, 628, 1024, 1024, 1024, 29000] + - [19, 23432.0] + - - [1024, 29000, 1, 636, 1024, 1024, 1024, 29000] + - [8, 23422.0] + - - [1024, 29000, 1, 651, 1024, 1024, 1024, 29000] + - [23, 23416.0] + - - [1024, 29000, 1, 658, 1024, 1024, 1024, 29000] + - [23, 23459.0] + - - [1024, 29000, 1, 669, 1024, 1024, 1024, 29000] + - [8, 23414.0] + - - [1024, 29000, 1, 670, 1024, 1024, 1024, 29000] + - [8, 23440.0] + - - [1024, 29000, 1, 672, 1024, 1024, 1024, 29000] + - [23, 23677.0] + - - [1024, 29000, 1, 684, 1024, 1024, 1024, 29000] + - [23, 23461.0] + - - [1024, 29000, 1, 716, 1024, 1024, 1024, 29000] + - [19, 23471.0] + - - [1024, 29000, 1, 730, 1024, 1024, 1024, 29000] + - [13, 23495.0] + - - [2560, 2560, 1, 1024, 2560, 2560, 2560, 2560] + - [13, 23329.0] + - - [2560, 2560, 1, 2, 2560, 2560, 2560, 2560] + - [12, 611.0] + - - [2560, 29000, 1, 109, 2560, 2560, 2560, 29000] + - [13, 20438.0] + - - [2560, 29000, 1, 121, 2560, 2560, 2560, 29000] + - [19, 20854.0] + - - [2560, 29000, 1, 27, 2560, 2560, 2560, 29000] + - [0, 9017.0] + - - [2560, 29000, 1, 35, 2560, 2560, 2560, 29000] + - [27, 11189.0] + - - [2560, 29000, 1, 36, 2560, 2560, 2560, 29000] + - [34, 11395.0] + - - [2560, 29000, 1, 39, 2560, 2560, 2560, 29000] + - [27, 12186.0] + - - [2560, 29000, 1, 40, 2560, 2560, 2560, 29000] + - [34, 12471.0] + - - [2560, 29000, 1, 42, 2560, 2560, 2560, 29000] + - [27, 12939.0] + - - [2560, 29000, 1, 43, 2560, 2560, 2560, 29000] + - [16, 13074.0] + - - [2560, 29000, 1, 44, 2560, 2560, 2560, 29000] + - [16, 13454.0] + - - [2560, 29000, 1, 46, 2560, 2560, 2560, 29000] + - [12, 13710.0] + - - [2560, 29000, 1, 48, 2560, 2560, 2560, 29000] + - [24, 14354.0] + - - [2560, 29000, 1, 49, 2560, 2560, 2560, 29000] + - [12, 14530.0] + - - [2560, 29000, 1, 50, 2560, 2560, 2560, 29000] + - [12, 14782.0] + - - [2560, 29000, 1, 51, 2560, 2560, 2560, 29000] + - [12, 14892.0] + - - [2560, 29000, 1, 53, 2560, 2560, 2560, 29000] + - [5, 15282.0] + - - [2560, 29000, 1, 54, 2560, 2560, 2560, 29000] + - [11, 15539.0] + - - [2560, 29000, 1, 55, 2560, 2560, 2560, 29000] + - [11, 15593.0] + - - [2560, 29000, 1, 56, 2560, 2560, 2560, 29000] + - [9, 15829.0] + - - [2560, 29000, 1, 57, 2560, 2560, 2560, 29000] + - [9, 15875.0] + - - [2560, 29000, 1, 58, 2560, 2560, 2560, 29000] + - [18, 16275.0] + - - [2560, 29000, 1, 59, 2560, 2560, 2560, 29000] + - [3, 16192.0] + - - [2560, 29000, 1, 61, 2560, 2560, 2560, 29000] + - [5, 16139.0] + - - [2560, 29000, 1, 63, 2560, 2560, 2560, 29000] + - [9, 16423.0] + - - [2560, 29000, 1, 65, 2560, 2560, 2560, 29000] + - [12, 17754.0] + - - [2560, 29000, 1, 66, 2560, 2560, 2560, 29000] + - [18, 17944.0] + - - [2560, 29000, 1, 67, 2560, 2560, 2560, 29000] + - [12, 17808.0] + - - [2560, 29000, 1, 69, 2560, 2560, 2560, 29000] + - [12, 17890.0] + - - [2560, 29000, 1, 70, 2560, 2560, 2560, 29000] + - [18, 18121.0] + - - [2560, 29000, 1, 71, 2560, 2560, 2560, 29000] + - [12, 17989.0] + - - [2560, 29000, 1, 73, 2560, 2560, 2560, 29000] + - [5, 17974.0] + - - [2560, 29000, 1, 74, 2560, 2560, 2560, 29000] + - [12, 18320.0] + - - [2560, 29000, 1, 75, 2560, 2560, 2560, 29000] + - [5, 17967.0] + - - [2560, 29000, 1, 77, 2560, 2560, 2560, 29000] + - [18, 18007.0] + - - [2560, 29000, 1, 78, 2560, 2560, 2560, 29000] + - [4, 18158.0] + - - [2560, 29000, 1, 80, 2560, 2560, 2560, 29000] + - [8, 20217.0] + - - [2560, 29000, 1, 81, 2560, 2560, 2560, 29000] + - [19, 19019.0] + - - [2560, 29000, 1, 82, 2560, 2560, 2560, 29000] + - [13, 19217.0] + - - [2560, 29000, 1, 83, 2560, 2560, 2560, 29000] + - [13, 19153.0] + - - [2560, 29000, 1, 84, 2560, 2560, 2560, 29000] + - [13, 19375.0] + - - [2560, 29000, 1, 88, 2560, 2560, 2560, 29000] + - [13, 19600.0] + - - [2560, 29000, 1, 89, 2560, 2560, 2560, 29000] + - [13, 19411.0] + - - [2560, 29000, 1, 90, 2560, 2560, 2560, 29000] + - [13, 19689.0] + - - [2560, 29000, 1, 92, 2560, 2560, 2560, 29000] + - [19, 19763.0] + - - [2560, 29000, 1, 95, 2560, 2560, 2560, 29000] + - [19, 19755.0] + - - [2560, 29000, 1, 98, 2560, 2560, 2560, 29000] + - [13, 20163.0] + - - [2560, 4096, 1, 1024, 2560, 2560, 2560, 4096] + - [2, 23583.0] + - - [4096, 2560, 1, 1024, 4096, 4096, 4096, 2560] + - [13, 23563.0] + - - [1024, 3072, 1, 32768, 1024, 1024, 1024, 3072] + - [13, 22588.0] + - - [1024, 4096, 1, 32768, 1024, 1024, 1024, 4096] + - [2, 23311.0] + - - [1024, 50304, 1, 32768, 1024, 1024, 1024, 50304] + - [2, 23979.0] + - - [4096, 1024, 1, 32768, 4096, 4096, 4096, 1024] + - [30, 23266.0] + - - [1024, 128, 24, 1024, 1024, 1024, 1024, 128] + - [23, 21821.0] + - - [128, 1024, 24, 1024, 128, 128, 128, 1024] + - [30, 21800.0] + - - [256, 2560, 1, 8976, 256, 256, 256, 2560] + - [36, 21154.0] + - - [256, 2816, 1, 8976, 256, 256, 256, 2816] + - [36, 21046.0] + - - [256, 3328, 1, 8976, 256, 256, 256, 3328] + - [38, 20597.0] + - - [256, 3584, 1, 8976, 256, 256, 256, 3584] + - [36, 20401.0] + - - [256, 3840, 1, 8976, 256, 256, 256, 3840] + - [38, 21243.0] + - - [256, 4096, 1, 8976, 256, 256, 256, 4096] + - [40, 20989.0] + - - [256, 4352, 1, 8976, 256, 256, 256, 4352] + - [39, 21113.0] + - - [480, 1024, 1, 32768, 480, 480, 480, 1024] + - [36, 19271.0] + - - [1024, 256, 1, 21248, 1024, 1024, 1024, 256] + - [36, 17128.0] + - - [1024, 256, 1, 21504, 1024, 1024, 1024, 256] + - [39, 17064.0] + - - [1024, 256, 1, 22016, 1024, 1024, 1024, 256] + - [38, 17024.0] + - - [1024, 256, 1, 28672, 1024, 1024, 1024, 256] + - [36, 17308.0] + - - [1024, 256, 1, 33536, 1024, 1024, 1024, 256] + - [36, 17398.0] + - - [1024, 512, 1, 32768, 1024, 1024, 1024, 512] + - [41, 20388.0] + - - [1024, 1024, 1, 32768, 1024, 1024, 1024, 1024] + - [39, 22645.0] + - - [1024, 1024, 1, 9216, 1024, 1024, 1024, 1024] + - [36, 21829.0] + - - [1024, 1024, 1, 9520, 1024, 1024, 1024, 1024] + - [36, 21811.0] + - - [1024, 1024, 1, 10064, 1024, 1024, 1024, 1024] + - [36, 21844.0] + - - [1024, 1024, 1, 10080, 1024, 1024, 1024, 1024] + - [36, 21853.0] + - - [1024, 1024, 1, 10200, 1024, 1024, 1024, 1024] + - [36, 21869.0] + - - [479, 1024, 1, 32768, 479, 479, 479, 1024] + - [41, 19038.0] + - - [1024, 1024, 1, 8192, 1024, 1024, 1024, 1024] + - [36, 21198.0] + - - [1024, 1024, 1, 9600, 1024, 1024, 1024, 1024] + - [36, 21751.0] + - - [1024, 1024, 1, 16384, 1024, 1024, 1024, 1024] + - [36, 22196.0] + - - [512, 256, 1, 55296, 512, 512, 512, 256] + - [37, 15457.0] + - - [1024, 1024, 1, 10240, 1024, 1024, 1024, 1024] + - [38, 21613.0] + - - [1024, 1024, 1, 10496, 1024, 1024, 1024, 1024] + - [36, 21914.0] + - - [1024, 1024, 1, 10224, 1024, 1024, 1024, 1024] + - [36, 21864.0] + - - [1024, 1024, 1, 10192, 1024, 1024, 1024, 1024] + - [36, 21852.0] + - - [1024, 1024, 1, 10208, 1024, 1024, 1024, 1024] + - [36, 21837.0] + - - [1024, 1024, 1, 10184, 1024, 1024, 1024, 1024] + - [36, 21827.0] + - - [1024, 1024, 1, 10120, 1024, 1024, 1024, 1024] + - [36, 21834.0] + - - [1024, 1024, 1, 10152, 1024, 1024, 1024, 1024] + - [36, 21853.0] + - - [1024, 1024, 1, 12288, 1024, 1024, 1024, 1024] + - [36, 22026.0] + - - [64, 5888, 1, 1280, 64, 64, 64, 5888] + - [63, 12496.0] + - - [64, 5056, 1, 256, 64, 64, 64, 5056] + - [65, 10620.0] + - - [5888, 64, 1, 1280, 5888, 5888, 5888, 64] + - [44, 13172.0] + - - [5888, 64, 1, 3328, 5888, 5888, 5888, 64] + - [44, 13724.0] + - - [6784, 64, 1, 256, 6784, 6784, 6784, 64] + - [44, 12055.0] + - - [64, 6784, 1, 3328, 64, 64, 64, 6784] + - [64, 14409.0] + - - [64, 5056, 1, 3328, 64, 64, 64, 5056] + - [43, 15301.0] + - - [5056, 64, 1, 1280, 5056, 5056, 5056, 64] + - [65, 14372.0] + - - [64, 6784, 1, 1280, 64, 64, 64, 6784] + - [46, 14013.0] + - - [64, 6784, 1, 256, 64, 64, 64, 6784] + - [53, 9372.0] + - - [64, 5056, 1, 1280, 64, 64, 64, 5056] + - [43, 13233.0] + - - [5888, 64, 1, 256, 5888, 5888, 5888, 64] + - [43, 8537.0] + - - [64, 5888, 1, 3328, 64, 64, 64, 5888] + - [43, 14203.0] + - - [5056, 64, 1, 3328, 5056, 5056, 5056, 64] + - [51, 15735.0] + - - [6784, 64, 1, 3328, 6784, 6784, 6784, 64] + - [66, 15611.0] + - - [64, 5888, 1, 256, 64, 64, 64, 5888] + - [59, 10285.0] + - - [6784, 64, 1, 1280, 6784, 6784, 6784, 64] + - [44, 15053.0] + - - [5056, 64, 1, 256, 5056, 5056, 5056, 64] + - [65, 8058.0] + - - [1024, 1024, 1, 1024, 1024, 1024, 1024, 1024] + - [68, 16746.0] + - - [3136, 64, 128, 64, 3136, 3136, 3136, 64] + - [70, 15043.0] + - - [3136, 64, 64, 256, 3136, 3136, 3136, 64] + - [53, 18578.0] + - - [3136, 64, 256, 256, 3136, 3136, 3136, 64] + - [53, 16909.0] + - - [3136, 64, 128, 256, 3136, 3136, 3136, 64] + - [64, 17622.0] + - - [3136, 64, 64, 64, 3136, 3136, 3136, 64] + - [53, 17314.0] + - - [3136, 64, 256, 64, 3136, 3136, 3136, 64] + - [42, 14820.0] + - - [64, 128, 512, 128, 64, 64, 64, 128] + - [43, 17065.0] + - - [64, 512, 64, 512, 64, 64, 64, 512] + - [54, 17539.0] + - - [128, 64, 512, 128, 128, 128, 128, 64] + - [72, 18101.0] + - - [512, 64, 64, 512, 512, 512, 512, 64] + - [61, 18674.0] + - - [1024, 1024, 1, 4, 1024, 1024, 1024, 1024] + - [67, 1241.0] + - - [1024, 1024, 1, 32, 1024, 1024, 1024, 1024] + - [59, 8560.0] + - - [1024, 1024, 1, 2048, 1024, 1024, 1024, 1024] + - [65, 17533.0] + - - [1024, 1024, 1, 4096, 1024, 1024, 1024, 1024] + - [66, 17595.0] + - - [256, 1280, 1, 8976, 256, 256, 256, 1280] + - [56, 17819.0] + - - [257, 4096, 1, 1024, 257, 257, 257, 4096] + - [50, 12802.0] + - - [512, 2048, 1, 256, 512, 512, 512, 2048] + - [44, 13654.0] + - - [560, 1024, 1, 200, 560, 560, 560, 1024] + - [42, 9324.0] + - - [560, 1024, 1, 1600, 560, 560, 560, 1024] + - [58, 15945.0] + - - [1024, 1024, 1, 200, 1024, 1024, 1024, 1024] + - [44, 12819.0] + - - [1024, 1024, 1, 512, 1024, 1024, 1024, 1024] + - [44, 15534.0] + - - [1024, 1024, 1, 960, 1024, 1024, 1024, 1024] + - [44, 16384.0] + - - [1024, 1024, 1, 1600, 1024, 1024, 1024, 1024] + - [55, 16851.0] + - - [2048, 256, 1, 1024, 2048, 2048, 2048, 256] + - [43, 14589.0] + - - [1024, 1024, 1, 3840, 1024, 1024, 1024, 1024] + - [55, 17308.0] + - - [1024, 1024, 1, 3968, 1024, 1024, 1024, 1024] + - [55, 17325.0] + - - [1024, 1024, 1, 6528, 1024, 1024, 1024, 1024] + - [55, 17478.0] + - - [1024, 1024, 1, 7104, 1024, 1024, 1024, 1024] + - [55, 17482.0] + - - [1024, 1024, 1, 7200, 1024, 1024, 1024, 1024] + - [66, 17507.0] + - - [1024, 1024, 1, 8064, 1024, 1024, 1024, 1024] + - [44, 17524.0] + - - [1024, 1024, 1, 8160, 1024, 1024, 1024, 1024] + - [72, 17516.0] + - - [1024, 1024, 1, 3240, 1024, 1024, 1024, 1024] + - [61, 17272.0] + - - [1024, 1024, 1, 3960, 1024, 1024, 1024, 1024] + - [65, 17662.0] + - - [64, 1280, 64, 192, 64, 64, 64, 1280] + - [65, 19351.0] + - - [64, 1280, 64, 320, 64, 64, 64, 1280] + - [64, 20064.0] + - - [64, 1280, 64, 384, 64, 64, 64, 1280] + - [60, 21108.0] + - - [64, 1280, 64, 448, 64, 64, 64, 1280] + - [43, 21066.0] + - - [64, 2048, 64, 192, 64, 64, 64, 2048] + - [49, 18646.0] + - - [64, 2048, 64, 320, 64, 64, 64, 2048] + - [71, 19628.0] + - - [64, 2048, 64, 384, 64, 64, 64, 2048] + - [53, 19501.0] + - - [64, 2048, 64, 448, 64, 64, 64, 2048] + - [42, 20073.0] + - - [5329, 64, 64, 80, 5329, 5329, 5329, 64] + - [51, 18103.0] + - - [64, 1280, 32, 192, 64, 64, 64, 1280] + - [54, 16981.0] + - - [64, 1280, 32, 320, 64, 64, 64, 1280] + - [71, 18316.0] + - - [64, 1280, 32, 384, 64, 64, 64, 1280] + - [49, 18229.0] + - - [64, 1280, 32, 448, 64, 64, 64, 1280] + - [50, 18809.0] + - - [64, 2048, 32, 192, 64, 64, 64, 2048] + - [71, 18763.0] + - - [64, 2048, 32, 320, 64, 64, 64, 2048] + - [64, 19773.0] + - - [64, 2048, 32, 384, 64, 64, 64, 2048] + - [43, 19243.0] + - - [64, 2048, 32, 448, 64, 64, 64, 2048] + - [43, 18513.0] + - - [5329, 64, 32, 80, 5329, 5329, 5329, 64] + - [51, 18039.0] + - - [3136, 64, 32, 256, 3136, 3136, 3136, 64] + - [54, 20179.0] + - - [3136, 64, 32, 64, 3136, 3136, 3136, 64] + - [64, 14504.0] + - - [196, 256, 32, 1024, 196, 196, 196, 256] + - [54, 14597.0] + - - [3136, 64, 64, 128, 3136, 3136, 3136, 64] + - [65, 19508.0] + - - [3136, 64, 32, 128, 3136, 3136, 3136, 64] + - [71, 19235.0] + - - [960, 1024, 1, 1024, 960, 960, 960, 1024] + - [43, 16347.0] + - - [1024, 960, 1, 1024, 1024, 1024, 1024, 960] + - [44, 18518.0] + - - [64, 512, 16, 512, 64, 64, 64, 512] + - [43, 14479.0] + - - [1024, 1024, 1, 1, 1024, 1024, 1024, 1024] + - [67, 157.0] + - - [1024, 1024, 1, 77, 1024, 1024, 1024, 1024] + - [49, 8393.0] + - - [64, 128, 160, 128, 64, 64, 64, 128] + - [54, 11230.0] + - - [1024, 1024, 1, 10, 1024, 1024, 1024, 1024] + - [73, 2351.0] + - - [1024, 1024, 1, 1280, 1024, 1024, 1024, 1024] + - [44, 17190.0] + - - [64, 128, 624, 128, 64, 64, 64, 128] + - [54, 19267.0] + - - [1024, 1024, 1, 39, 1024, 1024, 1024, 1024] + - [70, 8082.0] + - - [1024, 1024, 1, 780, 1024, 1024, 1024, 1024] + - [44, 17047.0] + - - [1024, 1024, 1, 4992, 1024, 1024, 1024, 1024] + - [44, 17645.0] + - - [1024, 1024, 1, 308, 1024, 1024, 1024, 1024] + - [44, 16020.0] + - - [64, 128, 640, 128, 64, 64, 64, 128] + - [65, 18579.0] + - - [1024, 1024, 1, 40, 1024, 1024, 1024, 1024] + - [53, 9118.0] + - - [1024, 1024, 1, 800, 1024, 1024, 1024, 1024] + - [44, 17134.0] + - - [1024, 1024, 1, 5120, 1024, 1024, 1024, 1024] + - [44, 17598.0] + - - [64, 128, 656, 128, 64, 64, 64, 128] + - [54, 16185.0] + - - [1024, 1024, 1, 41, 1024, 1024, 1024, 1024] + - [42, 8598.0] + - - [1024, 1024, 1, 820, 1024, 1024, 1024, 1024] + - [66, 16217.0] + - - [1024, 1024, 1, 5248, 1024, 1024, 1024, 1024] + - [55, 17413.0] + - - [64, 512, 80, 512, 64, 64, 64, 512] + - [60, 18809.0] + - - [1024, 1024, 1, 5, 1024, 1024, 1024, 1024] + - [64, 1507.0] + - - [1024, 1024, 1, 385, 1024, 1024, 1024, 1024] + - [44, 16252.0] + - - [1024, 1024, 1, 2560, 1024, 1024, 1024, 1024] + - [66, 17527.0] + - - [64, 512, 96, 512, 64, 64, 64, 512] + - [60, 18109.0] + - - [1024, 1024, 1, 6, 1024, 1024, 1024, 1024] + - [67, 1005.0] + - - [1024, 1024, 1, 462, 1024, 1024, 1024, 1024] + - [44, 15177.0] + - - [1024, 1024, 1, 3072, 1024, 1024, 1024, 1024] + - [55, 17189.0] + - - [64, 128, 128, 128, 64, 64, 64, 128] + - [54, 10153.0] + - - [1024, 1024, 1, 8, 1024, 1024, 1024, 1024] + - [59, 1531.0] + - - [1024, 1024, 1, 160, 1024, 1024, 1024, 1024] + - [56, 13640.0] + - - [64, 128, 144, 128, 64, 64, 64, 128] + - [65, 10258.0] + - - [1024, 1024, 1, 9, 1024, 1024, 1024, 1024] + - [54, 1470.0] + - - [1024, 1024, 1, 180, 1024, 1024, 1024, 1024] + - [55, 12288.0] + - - [1024, 1024, 1, 1152, 1024, 1024, 1024, 1024] + - [55, 16570.0] + - - [2048, 512, 1, 1, 2048, 2048, 2048, 512] + - [53, 188.0] + - - [64, 1024, 32, 1024, 64, 64, 64, 1024] + - [64, 18497.0] + - - [1024, 64, 128, 1024, 1024, 1024, 1024, 64] + - [53, 19536.0] + - - [1024, 64, 32, 1024, 1024, 1024, 1024, 64] + - [60, 21484.0] + - - [1024, 96, 64, 1024, 1024, 1024, 1024, 96] + - [42, 15756.0] + - - [1024, 1024, 1, 16, 1024, 1024, 1024, 1024] + - [54, 4033.0] + - - [64, 512, 40, 512, 64, 64, 64, 512] + - [65, 16210.0] + - - [64, 1024, 256, 1024, 64, 64, 64, 1024] + - [59, 19290.0] + - - [96, 1024, 64, 1024, 96, 96, 96, 1024] + - [45, 15753.0] + - - [512, 64, 256, 512, 512, 512, 512, 64] + - [66, 17959.0] + - - [1024, 96, 128, 1024, 1024, 1024, 1024, 96] + - [65, 16200.0] + - - [64, 512, 128, 512, 64, 64, 64, 512] + - [54, 20214.0] + - - [64, 1024, 64, 1024, 64, 64, 64, 1024] + - [53, 18261.0] + - - [512, 64, 128, 512, 512, 512, 512, 64] + - [71, 20021.0] + - - [64, 1024, 128, 1024, 64, 64, 64, 1024] + - [54, 18909.0] + - - [1024, 64, 64, 1024, 1024, 1024, 1024, 64] + - [55, 18999.0] + - - [96, 1024, 128, 1024, 96, 96, 96, 1024] + - [58, 15926.0] + - - [64, 512, 256, 512, 64, 64, 64, 512] + - [64, 17605.0] + - - [1024, 64, 256, 1024, 1024, 1024, 1024, 64] + - [44, 19740.0] + - - [512, 64, 40, 512, 512, 512, 512, 64] + - [44, 17972.0] + - - [1024, 1024, 1, 64, 1024, 1024, 1024, 1024] + - [61, 12070.0] + - - [64, 128, 1024, 128, 64, 64, 64, 128] + - [64, 18911.0] + - - [128, 64, 1024, 128, 128, 128, 128, 64] + - [65, 19473.0] + - - [1024, 1024, 1, 3456, 1024, 1024, 1024, 1024] + - [51, 17597.0] + - - [1024, 1024, 1, 6912, 1024, 1024, 1024, 1024] + - [44, 17696.0] + - - [1024, 1024, 1, 864, 1024, 1024, 1024, 1024] + - [74, 16550.0] + - - [1024, 512, 1, 3456, 1024, 1024, 1024, 512] + - [62, 15897.0] + - - [1024, 512, 1, 4096, 1024, 1024, 1024, 512] + - [48, 16120.0] + - - [1024, 512, 1, 6912, 1024, 1024, 1024, 512] + - [69, 16128.0] + - - [1024, 512, 1, 864, 1024, 1024, 1024, 512] + - [52, 14930.0] + - - [256, 3456, 1, 1, 256, 256, 256, 3456] + - [52, 178.0] + - - [256, 4096, 1, 1, 256, 256, 256, 4096] + - [59, 273.0] + - - [480, 1024, 1, 3456, 480, 480, 480, 1024] + - [48, 14748.0] + - - [480, 1024, 1, 4096, 480, 480, 480, 1024] + - [48, 14841.0] + - - [480, 1024, 1, 6912, 480, 480, 480, 1024] + - [48, 15086.0] + - - [480, 1024, 1, 864, 480, 480, 480, 1024] + - [52, 13664.0] + - - [1024, 1024, 1, 80, 1024, 1024, 1024, 1024] + - [53, 9598.0] + - - [64, 128, 1280, 128, 64, 64, 64, 128] + - [54, 17905.0] + - - [128, 64, 1280, 128, 128, 128, 128, 64] + - [66, 18579.0] + - - [1024, 1024, 1, 82, 1024, 1024, 1024, 1024] + - [64, 11975.0] + - - [128, 64, 1312, 128, 128, 128, 128, 64] + - [65, 19139.0] + - - [64, 128, 1312, 128, 64, 64, 64, 128] + - [54, 18825.0] + - - [1024, 1024, 1, 12, 1024, 1024, 1024, 1024] + - [55, 3226.0] + - - [1024, 1024, 1, 6144, 1024, 1024, 1024, 1024] + - [60, 17840.0] + - - [64, 512, 192, 512, 64, 64, 64, 512] + - [53, 18722.0] + - - [512, 64, 192, 512, 512, 512, 512, 64] + - [55, 18977.0] + - - [784, 1152, 1, 128, 784, 784, 784, 1152] + - [42, 11312.0] + - - [64, 128, 2048, 128, 64, 64, 64, 128] + - [42, 13874.0] + - - [128, 64, 2048, 128, 128, 128, 128, 64] + - [42, 12743.0] + - - [1024, 1024, 1, 128, 1024, 1024, 1024, 1024] + - [55, 13289.0] + - - [128, 64, 1536, 128, 128, 128, 128, 64] + - [61, 18407.0] + - - [64, 128, 1536, 128, 64, 64, 64, 128] + - [43, 18378.0] + - - [1024, 1024, 1, 96, 1024, 1024, 1024, 1024] + - [55, 13006.0] + - - [92416, 64, 25, 64, 92416, 92416, 92416, 64] + - [51, 9887.0] + - - [50176, 64, 36, 64, 50176, 50176, 50176, 64] + - [51, 9959.0] + - - [36864, 64, 49, 64, 36864, 36864, 36864, 64] + - [44, 10077.0] + - - [25600, 64, 64, 64, 25600, 25600, 25600, 64] + - [66, 10101.0] + - - [64, 128, 192, 128, 64, 64, 64, 128] + - [53, 13771.0] + - - [128, 64, 192, 128, 128, 128, 128, 64] + - [55, 14717.0] + - - [768, 768, 1, 2048, 768, 768, 768, 768] + - [48, 17896.0] + - - [64, 384, 144, 384, 64, 64, 64, 384] + - [54, 17820.0] + - - [384, 64, 144, 384, 384, 384, 384, 64] + - [71, 21240.0] + - - [768, 768, 1, 4608, 768, 768, 768, 768] + - [69, 18185.0] + - - [64, 512, 48, 512, 64, 64, 64, 512] + - [49, 17683.0] + - - [512, 64, 48, 512, 512, 512, 512, 64] + - [61, 17723.0] + - - [64, 128, 256, 128, 64, 64, 64, 128] + - [65, 13223.0] + - - [128, 64, 256, 128, 128, 128, 128, 64] + - [44, 13503.0] + - - [64, 384, 192, 384, 64, 64, 64, 384] + - [70, 19317.0] + - - [384, 64, 192, 384, 384, 384, 384, 64] + - [70, 19964.0] + - - [1024, 1024, 1, 4608, 1024, 1024, 1024, 1024] + - [55, 17642.0] + - - [196, 2304, 1, 256, 196, 196, 196, 2304] + - [52, 8706.0] + - - [768, 512, 2, 2048, 768, 768, 768, 512] + - [61, 17610.0] + - - [672, 512, 2, 2048, 672, 672, 672, 512] + - [64, 15332.0] + - - [1008, 512, 2, 2048, 1008, 1008, 1008, 512] + - [57, 16690.0] + - - [864, 512, 2, 2048, 864, 864, 864, 512] + - [58, 17799.0] + - - [888, 512, 2, 2048, 888, 888, 888, 512] + - [58, 18091.0] + - - [840, 512, 2, 2048, 840, 840, 840, 512] + - [69, 17223.0] + - - [768, 256, 2, 12, 768, 768, 768, 256] + - [67, 774.0] + - - [864, 256, 2, 3, 864, 864, 864, 256] + - [43, 219.0] + - - [864, 256, 2, 12, 864, 864, 864, 256] + - [42, 1301.0] + - - [768, 256, 2, 3, 768, 768, 768, 256] + - [42, 230.0] + - - [1024, 320, 1, 1024, 1024, 1024, 1024, 320] + - [44, 13741.0] + - - [173280, 64, 1, 128, 173280, 173280, 173280, 64] + - [51, 9593.0] + - - [25992, 64, 1, 128, 25992, 25992, 25992, 64] + - [61, 11472.0] + - - [713, 512, 2, 2048, 713, 713, 713, 512] + - [44, 15945.0] + - - [660, 512, 2, 2048, 660, 660, 660, 512] + - [44, 14826.0] + - - [726, 512, 2, 2048, 726, 726, 726, 512] + - [44, 16228.0] + - - [748, 512, 2, 2048, 748, 748, 748, 512] + - [44, 16763.0] + - - [805, 512, 2, 2048, 805, 805, 805, 512] + - [58, 16532.0] + - - [850, 512, 2, 2048, 850, 850, 850, 512] + - [69, 17411.0] + - - [850, 256, 2, 3, 850, 850, 850, 256] + - [71, 326.0] + - - [805, 256, 2, 12, 805, 805, 805, 256] + - [47, 790.0] + - - [805, 256, 2, 3, 805, 805, 805, 256] + - [55, 203.0] + - - [850, 256, 2, 12, 850, 850, 850, 256] + - [43, 876.0] + - - [950, 256, 2, 12, 950, 950, 950, 256] + - [45, 1474.0] + - - [950, 256, 2, 3, 950, 950, 950, 256] + - [53, 245.0] + - - [100, 512, 120, 128, 100, 100, 100, 512] + - [65, 13934.0] + - - [100, 512, 18, 128, 100, 100, 100, 512] + - [64, 9560.0] + - - [100, 512, 19, 128, 100, 100, 100, 512] + - [64, 9491.0] + - - [1444, 576, 1, 128, 1444, 1444, 1444, 576] + - [44, 12237.0] + - - [27436, 64, 1, 128, 27436, 27436, 27436, 64] + - [53, 14557.0] + - - [361, 2304, 1, 512, 361, 361, 361, 2304] + - [45, 15588.0] + - - [96, 1024, 160, 1024, 96, 96, 96, 1024] + - [56, 16145.0] + - - [1024, 96, 160, 1024, 1024, 1024, 1024, 96] + - [75, 16032.0] + - - [96, 1024, 40, 1024, 96, 96, 96, 1024] + - [45, 15773.0] + - - [1024, 96, 40, 1024, 1024, 1024, 1024, 96] + - [58, 15575.0] + - - [96, 1024, 80, 1024, 96, 96, 96, 1024] + - [48, 16030.0] + - - [1024, 96, 80, 1024, 1024, 1024, 1024, 96] + - [58, 15852.0] + - - [96, 1024, 96, 1024, 96, 96, 96, 1024] + - [48, 15863.0] + - - [1024, 96, 96, 1024, 1024, 1024, 1024, 96] + - [65, 16455.0] + - - [96, 1024, 24, 1024, 96, 96, 96, 1024] + - [67, 15019.0] + - - [1024, 96, 24, 1024, 1024, 1024, 1024, 96] + - [54, 14937.0] + - - [96, 1024, 48, 1024, 96, 96, 96, 1024] + - [49, 15918.0] + - - [1024, 96, 48, 1024, 1024, 1024, 1024, 96] + - [70, 15469.0] + - - [96, 1024, 16, 1024, 96, 96, 96, 1024] + - [60, 14291.0] + - - [1024, 96, 16, 1024, 1024, 1024, 1024, 96] + - [58, 14146.0] + - - [96, 1024, 32, 1024, 96, 96, 96, 1024] + - [58, 15511.0] + - - [1024, 96, 32, 1024, 1024, 1024, 1024, 96] + - [58, 15212.0] + - - [512, 64, 320, 512, 512, 512, 512, 64] + - [57, 17986.0] + - - [64, 512, 320, 512, 64, 64, 64, 512] + - [42, 17575.0] + - - [1024, 1024, 1, 20, 1024, 1024, 1024, 1024] + - [58, 4211.0] + - - [512, 64, 80, 512, 512, 512, 512, 64] + - [44, 18920.0] + - - [1024, 64, 512, 1024, 1024, 1024, 1024, 64] + - [66, 19907.0] + - - [64, 1024, 512, 1024, 64, 64, 64, 1024] + - [59, 19836.0] + - - [512, 256, 1, 32768, 512, 512, 512, 256] + - [78, 15720.0] + - - [1024, 256, 1, 8192, 1024, 1024, 1024, 256] + - [82, 15465.0] + - - [1024, 256, 1, 8448, 1024, 1024, 1024, 256] + - [82, 15556.0] + - - [1024, 256, 1, 9728, 1024, 1024, 1024, 256] + - [80, 15744.0] + - - [1024, 256, 1, 9984, 1024, 1024, 1024, 256] + - [82, 15831.0] + - - [1024, 256, 1, 10496, 1024, 1024, 1024, 256] + - [80, 15906.0] + - - [1024, 256, 1, 11520, 1024, 1024, 1024, 256] + - [79, 16391.0] + - - [1024, 256, 1, 12032, 1024, 1024, 1024, 256] + - [83, 16135.0] + - - [1024, 256, 1, 13568, 1024, 1024, 1024, 256] + - [80, 16284.0] + - - [1024, 256, 1, 14336, 1024, 1024, 1024, 256] + - [82, 16320.0] + - - [1024, 256, 1, 14848, 1024, 1024, 1024, 256] + - [80, 16349.0] + - - [1024, 256, 1, 15104, 1024, 1024, 1024, 256] + - [82, 16382.0] + - - [1024, 256, 1, 15872, 1024, 1024, 1024, 256] + - [82, 16457.0] + - - [1024, 256, 1, 16128, 1024, 1024, 1024, 256] + - [76, 18424.0] + - - [1024, 256, 1, 17152, 1024, 1024, 1024, 256] + - [82, 16601.0] + - - [1024, 256, 1, 17408, 1024, 1024, 1024, 256] + - [77, 16743.0] + - - [1024, 256, 1, 18944, 1024, 1024, 1024, 256] + - [77, 16684.0] + - - [1024, 256, 1, 19712, 1024, 1024, 1024, 256] + - [77, 16708.0] + - - [1024, 256, 1, 19968, 1024, 1024, 1024, 256] + - [77, 16905.0] + - - [256, 128, 1, 55296, 256, 256, 256, 128] + - [81, 10215.0] + - - [3584, 64, 1, 1280, 3584, 3584, 3584, 64] + - [126, 10209.0] + - - [64, 4288, 1, 1280, 64, 64, 64, 4288] + - [98, 10756.0] + - - [3584, 64, 1, 3328, 3584, 3584, 3584, 64] + - [126, 11329.0] + - - [64, 4288, 1, 3328, 64, 64, 64, 4288] + - [98, 10752.0] + - - [1856, 64, 1, 3328, 1856, 1856, 1856, 64] + - [131, 10252.0] + - - [2944, 64, 1, 256, 2944, 2944, 2944, 64] + - [92, 5531.0] + - - [64, 1856, 1, 256, 64, 64, 64, 1856] + - [143, 4283.0] + - - [2368, 64, 1, 256, 2368, 2368, 2368, 64] + - [120, 5013.0] + - - [2368, 64, 1, 3328, 2368, 2368, 2368, 64] + - [158, 10464.0] + - - [64, 1408, 1, 3328, 64, 64, 64, 1408] + - [152, 7627.0] + - - [1856, 64, 1, 1280, 1856, 1856, 1856, 64] + - [104, 8523.0] + - - [64, 3584, 1, 1280, 64, 64, 64, 3584] + - [100, 9953.0] + - - [4288, 64, 1, 3328, 4288, 4288, 4288, 64] + - [147, 10775.0] + - - [64, 2944, 1, 256, 64, 64, 64, 2944] + - [120, 5481.0] + - - [64, 2368, 1, 1280, 64, 64, 64, 2368] + - [126, 9014.0] + - - [64, 3584, 1, 3328, 64, 64, 64, 3584] + - [135, 10893.0] + - - [3584, 64, 1, 256, 3584, 3584, 3584, 64] + - [126, 6397.0] + - - [64, 1856, 1, 3328, 64, 64, 64, 1856] + - [98, 9670.0] + - - [4288, 64, 1, 1280, 4288, 4288, 4288, 64] + - [98, 10788.0] + - - [1408, 64, 1, 256, 1408, 1408, 1408, 64] + - [106, 3240.0] + - - [64, 1408, 1, 256, 64, 64, 64, 1408] + - [92, 3204.0] + - - [64, 2368, 1, 3328, 64, 64, 64, 2368] + - [149, 10331.0] + - - [64, 1856, 1, 1280, 64, 64, 64, 1856] + - [130, 8768.0] + - - [64, 4288, 1, 256, 64, 64, 64, 4288] + - [98, 6901.0] + - - [64, 1408, 1, 1280, 64, 64, 64, 1408] + - [141, 6380.0] + - - [64, 2944, 1, 3328, 64, 64, 64, 2944] + - [124, 9975.0] + - - [1856, 64, 1, 256, 1856, 1856, 1856, 64] + - [118, 4283.0] + - - [2944, 64, 1, 1280, 2944, 2944, 2944, 64] + - [92, 9458.0] + - - [4288, 64, 1, 256, 4288, 4288, 4288, 64] + - [149, 8505.0] + - - [64, 2944, 1, 1280, 64, 64, 64, 2944] + - [98, 8919.0] + - - [1408, 64, 1, 1280, 1408, 1408, 1408, 64] + - [96, 6324.0] + - - [64, 2368, 1, 256, 64, 64, 64, 2368] + - [121, 5026.0] + - - [64, 3584, 1, 256, 64, 64, 64, 3584] + - [106, 8156.0] + - - [2944, 64, 1, 3328, 2944, 2944, 2944, 64] + - [147, 10068.0] + - - [2368, 64, 1, 1280, 2368, 2368, 2368, 64] + - [100, 8898.0] + - - [1408, 64, 1, 3328, 1408, 1408, 1408, 64] + - [110, 7520.0] + - - [33, 32, 200, 33, 33, 33, 33, 32] + - [136, 924.0] + - - [33, 32, 1600, 33, 33, 33, 33, 32] + - [130, 3104.0] + - - [67, 2048, 1, 512, 67, 67, 67, 2048] + - [101, 5388.0] + - - [74, 2048, 1, 512, 74, 74, 74, 2048] + - [101, 5960.0] + - - [74, 2048, 1, 960, 74, 74, 74, 2048] + - [90, 7476.0] + - - [100, 2048, 1, 512, 100, 100, 100, 2048] + - [143, 6264.0] + - - [512, 512, 1, 200, 512, 512, 512, 512] + - [92, 5787.0] + - - [512, 512, 1, 1600, 512, 512, 512, 512] + - [92, 9864.0] + - - [1024, 256, 1, 1024, 1024, 1024, 1024, 256] + - [106, 9406.0] + - - [1024, 256, 1, 1280, 1024, 1024, 1024, 256] + - [92, 9592.0] + - - [1024, 256, 1, 2304, 1024, 1024, 1024, 256] + - [147, 10100.0] + - - [1024, 256, 1, 2816, 1024, 1024, 1024, 256] + - [98, 10276.0] + - - [1024, 256, 1, 3072, 1024, 1024, 1024, 256] + - [124, 10269.0] + - - [1024, 256, 1, 3328, 1024, 1024, 1024, 256] + - [98, 10557.0] + - - [1024, 256, 1, 3584, 1024, 1024, 1024, 256] + - [124, 10398.0] + - - [1024, 256, 1, 4096, 1024, 1024, 1024, 256] + - [124, 10457.0] + - - [1024, 256, 1, 4352, 1024, 1024, 1024, 256] + - [147, 10472.0] + - - [1024, 256, 1, 4608, 1024, 1024, 1024, 256] + - [124, 10513.0] + - - [1024, 256, 1, 5120, 1024, 1024, 1024, 256] + - [98, 10537.0] + - - [1024, 256, 1, 5376, 1024, 1024, 1024, 256] + - [147, 10536.0] + - - [1024, 256, 1, 5632, 1024, 1024, 1024, 256] + - [152, 11868.0] + - - [1024, 256, 1, 6144, 1024, 1024, 1024, 256] + - [147, 10553.0] + - - [1024, 256, 1, 6400, 1024, 1024, 1024, 256] + - [147, 10570.0] + - - [1024, 256, 1, 7680, 1024, 1024, 1024, 256] + - [98, 10648.0] + - - [1024, 256, 1, 7936, 1024, 1024, 1024, 256] + - [98, 10650.0] + - - [32, 64, 4608, 32, 32, 32, 32, 64] + - [152, 9190.0] + - - [32, 64, 4608, 35, 32, 32, 32, 64] + - [151, 7761.0] + - - [34, 64, 4736, 24, 34, 34, 34, 64] + - [113, 4332.0] + - - [34, 64, 4736, 34, 34, 34, 34, 64] + - [104, 5639.0] + - - [35, 64, 4608, 35, 35, 35, 35, 64] + - [141, 5509.0] + - - [64, 32, 4608, 32, 64, 64, 64, 32] + - [154, 10958.0] + - - [64, 32, 4608, 35, 64, 64, 64, 32] + - [139, 10717.0] + - - [64, 34, 4736, 24, 64, 64, 64, 34] + - [87, 5760.0] + - - [64, 34, 4736, 34, 64, 64, 64, 34] + - [87, 6050.0] + - - [64, 35, 4608, 35, 64, 64, 64, 35] + - [87, 6100.0] + - - [33, 64, 1920, 33, 33, 33, 33, 64] + - [152, 4986.0] + - - [64, 33, 1920, 33, 64, 64, 64, 33] + - [87, 5239.0] + - - [49, 512, 64, 2048, 49, 49, 49, 512] + - [149, 9862.0] + - - [49, 2048, 64, 512, 49, 49, 49, 2048] + - [100, 9983.0] + - - [49, 512, 32, 2048, 49, 49, 49, 512] + - [100, 9714.0] + - - [49, 2048, 32, 512, 49, 49, 49, 2048] + - [90, 10290.0] + - - [49, 1024, 64, 2048, 49, 49, 49, 1024] + - [113, 9975.0] + - - [49, 2048, 64, 1024, 49, 49, 49, 2048] + - [135, 10077.0] + - - [49, 1024, 32, 2048, 49, 49, 49, 1024] + - [135, 9855.0] + - - [49, 2048, 32, 1024, 49, 49, 49, 2048] + - [160, 9896.0] + - - [480, 512, 1, 512, 480, 480, 480, 512] + - [141, 8631.0] + - - [512, 480, 1, 512, 512, 512, 512, 480] + - [143, 10180.0] + - - [512, 512, 1, 512, 512, 512, 512, 512] + - [120, 9684.0] + - - [256, 864, 1, 1, 256, 256, 256, 864] + - [115, 103.0] + - - [512, 256, 1, 3456, 512, 512, 512, 256] + - [126, 9031.0] + - - [512, 256, 1, 4096, 512, 512, 512, 256] + - [149, 9152.0] + - - [512, 256, 1, 6912, 512, 512, 512, 256] + - [149, 9445.0] + - - [512, 256, 1, 864, 512, 512, 512, 256] + - [107, 7149.0] + - - [49, 4608, 1, 512, 49, 49, 49, 4608] + - [100, 6554.0] + - - [49, 2048, 128, 512, 49, 49, 49, 2048] + - [149, 10091.0] + - - [49, 2048, 256, 512, 49, 49, 49, 2048] + - [100, 10122.0] + - - [49, 512, 128, 2048, 49, 49, 49, 512] + - [135, 9970.0] + - - [49, 512, 256, 2048, 49, 49, 49, 512] + - [112, 10116.0] + - - [56, 512, 64, 512, 56, 56, 56, 512] + - [100, 10973.0] + - - [176, 256, 2, 3, 176, 176, 176, 256] + - [116, 165.0] + - - [176, 256, 2, 12, 176, 176, 176, 256] + - [88, 575.0] + - - [216, 256, 2, 3, 216, 216, 216, 256] + - [93, 81.0] + - - [192, 256, 2, 12, 192, 192, 192, 256] + - [109, 527.0] + - - [192, 256, 2, 3, 192, 192, 192, 256] + - [142, 155.0] + - - [216, 256, 2, 12, 216, 216, 216, 256] + - [140, 644.0] + - - [228, 256, 2, 12, 228, 228, 228, 256] + - [116, 707.0] + - - [228, 256, 2, 3, 228, 228, 228, 256] + - [119, 181.0] + - - [187, 256, 2, 12, 187, 187, 187, 256] + - [114, 580.0] + - - [247, 256, 2, 12, 247, 247, 247, 256] + - [86, 723.0] + - - [187, 256, 2, 3, 187, 187, 187, 256] + - [119, 151.0] + - - [221, 256, 2, 3, 221, 221, 221, 256] + - [128, 84.0] + - - [221, 256, 2, 12, 221, 221, 221, 256] + - [116, 314.0] + - - [247, 256, 2, 3, 247, 247, 247, 256] + - [103, 162.0] + - - [100, 2304, 1, 512, 100, 100, 100, 2304] + - [126, 6703.0] + - - [256, 128, 1, 32768, 256, 256, 256, 128] + - [165, 9285.0] + - - [2048, 2, 1, 2, 2048, 2048, 2048, 2] + - [169, 4.0] + - - [2560, 2, 1, 4, 2560, 2560, 2560, 2] + - [169, 5.0] + - - [2048, 2, 1, 8, 2048, 2048, 2048, 2] + - [171, 19.0] + - - [2560, 2, 1, 2, 2560, 2560, 2560, 2] + - [170, 3.0] + - - [25, 1152, 1, 256, 25, 25, 25, 1152] + - [155, 1241.0] + - - [9, 1152, 1, 256, 9, 9, 9, 1152] + - [155, 453.0] + - - [13, 512, 1, 32768, 13, 13, 13, 512] + - [162, 3107.0] + - - [1024, 2, 1, 4992, 1024, 1024, 1024, 2] + - [163, 413.0] + - - [1024, 2, 1, 5120, 1024, 1024, 1024, 2] + - [166, 418.0] + - - [1024, 2, 1, 5248, 1024, 1024, 1024, 2] + - [167, 421.0] + - - [256, 128, 1, 6912, 256, 256, 256, 128] + - [161, 6905.0] + - - [13, 512, 1, 55296, 13, 13, 13, 512] + - [164, 3369.0] + - - [13, 512, 1, 6912, 13, 13, 13, 512] + - [168, 1935.0] + - - [768, 2, 1, 4608, 768, 768, 768, 2] + - [166, 404.0] + - - [1024, 2, 1, 4608, 1024, 1024, 1024, 2] + - [166, 393.0] + - - [64, 448, 1, 3328, 64, 64, 64, 448] + - [146, 4484.0] + - - [1, 64, 1, 256, 1, 1, 1, 64] + - [88, 3.0] + - - [64, 128, 1, 256, 64, 64, 64, 128] + - [153, 718.0] + - - [64, 1024, 1, 3328, 64, 64, 64, 1024] + - [100, 7669.0] + - - [1, 64, 1, 1280, 1, 1, 1, 64] + - [88, 4.0] + - - [704, 64, 1, 3328, 704, 704, 704, 64] + - [112, 5295.0] + - - [64, 448, 1, 1280, 64, 64, 64, 448] + - [97, 4078.0] + - - [64, 704, 1, 3328, 64, 64, 64, 704] + - [149, 5299.0] + - - [64, 64, 1, 1280, 64, 64, 64, 64] + - [111, 679.0] + - - [1, 64, 1, 1, 1, 1, 1, 64] + - [140, 0.04] + - - [448, 64, 1, 1280, 448, 448, 448, 64] + - [95, 3760.0] + - - [64, 1024, 1, 1280, 64, 64, 64, 1024] + - [126, 6483.0] + - - [64, 256, 1, 1280, 64, 64, 64, 256] + - [125, 2615.0] + - - [704, 64, 1, 1280, 704, 704, 704, 64] + - [146, 4751.0] + - - [64, 128, 1, 1280, 64, 64, 64, 128] + - [99, 1413.0] + - - [448, 64, 1, 3328, 448, 448, 448, 64] + - [97, 4728.0] + - - [128, 64, 1, 256, 128, 128, 128, 64] + - [129, 690.0] + - - [64, 128, 1, 3328, 64, 64, 64, 128] + - [111, 1572.0] + - - [64, 256, 1, 3328, 64, 64, 64, 256] + - [111, 3098.0] + - - [1024, 64, 1, 1280, 1024, 1024, 1024, 64] + - [100, 5555.0] + - - [448, 64, 1, 256, 448, 448, 448, 64] + - [148, 1325.0] + - - [256, 64, 1, 256, 256, 256, 256, 64] + - [111, 752.0] + - - [1, 1, 1, 1, 1, 1, 1, 1] + - [88, 0.00019011406824717656] + - - [1024, 64, 1, 3328, 1024, 1024, 1024, 64] + - [112, 7100.0] + - - [64, 448, 1, 256, 64, 64, 64, 448] + - [111, 1301.0] + - - [128, 64, 1, 1280, 128, 128, 128, 64] + - [109, 1024.0] + - - [64, 1024, 1, 256, 64, 64, 64, 1024] + - [123, 2460.0] + - - [256, 64, 1, 1280, 256, 256, 256, 64] + - [125, 2048.0] + - - [64, 256, 1, 256, 64, 64, 64, 256] + - [153, 768.0] + - - [704, 64, 1, 256, 704, 704, 704, 64] + - [105, 1982.0] + - - [1, 1, 1, 256, 1, 1, 1, 1] + - [88, 0.04] + - - [64, 704, 1, 256, 64, 64, 64, 704] + - [146, 1929.0] + - - [64, 64, 1, 256, 64, 64, 64, 64] + - [84, 203.0] + - - [128, 64, 1, 3328, 128, 128, 128, 64] + - [148, 1380.0] + - - [1, 1, 1, 1280, 1, 1, 1, 1] + - [88, 0.1] + - - [1024, 64, 1, 256, 1024, 1024, 1024, 64] + - [91, 2482.0] + - - [256, 64, 1, 3328, 256, 256, 256, 64] + - [159, 2759.0] + - - [64, 64, 1, 3328, 64, 64, 64, 64] + - [125, 691.0] + - - [1, 1, 1, 3328, 1, 1, 1, 1] + - [88, 0.12] + - - [64, 704, 1, 1280, 64, 64, 64, 704] + - [146, 3923.0] + - - [512, 16, 1, 512, 512, 512, 512, 16] + - [157, 601.0] + - - [1024, 32, 1, 512, 1024, 1024, 1024, 32] + - [99, 2292.0] + - - [1024, 16, 1, 512, 1024, 1024, 1024, 16] + - [109, 1216.0] + - - [512, 32, 1, 512, 512, 512, 512, 32] + - [117, 1384.0] + - - [14, 64, 1, 14, 14, 14, 14, 64] + - [88, 3.0] + - - [15, 64, 1, 14, 15, 15, 15, 64] + - [84, 3.0] + - - [15, 64, 1, 15, 15, 15, 15, 64] + - [84, 7.0] + - - [17, 64, 1, 15, 17, 17, 17, 64] + - [126, 6.0] + - - [17, 64, 1, 17, 17, 17, 17, 64] + - [84, 4.0] + - - [21, 64, 1, 17, 21, 21, 21, 64] + - [84, 5.0] + - - [21, 64, 1, 21, 21, 21, 21, 64] + - [84, 6.0] + - - [24, 64, 1, 24, 24, 24, 24, 64] + - [84, 8.0] + - - [30, 64, 1, 30, 30, 30, 30, 64] + - [88, 27.0] + - - [30, 64, 1, 31, 30, 30, 30, 64] + - [116, 27.0] + - - [31, 64, 1, 31, 31, 31, 31, 64] + - [108, 25.0] + - - [32, 64, 1, 32, 32, 32, 32, 64] + - [142, 17.0] + - - [32, 64, 1, 35, 32, 32, 32, 64] + - [88, 16.0] + - - [34, 64, 1, 24, 34, 34, 34, 64] + - [84, 12.0] + - - [34, 64, 1, 34, 34, 34, 34, 64] + - [117, 17.0] + - - [35, 64, 1, 35, 35, 35, 35, 64] + - [155, 18.0] + - - [64, 14, 1, 14, 64, 64, 64, 14] + - [84, 3.0] + - - [64, 15, 1, 14, 64, 64, 64, 15] + - [84, 3.0] + - - [64, 15, 1, 15, 64, 64, 64, 15] + - [103, 7.0] + - - [64, 17, 1, 15, 64, 64, 64, 17] + - [105, 8.0] + - - [64, 17, 1, 17, 64, 64, 64, 17] + - [95, 8.0] + - - [64, 21, 1, 17, 64, 64, 64, 21] + - [144, 11.0] + - - [64, 21, 1, 21, 64, 64, 64, 21] + - [150, 13.0] + - - [64, 24, 1, 24, 64, 64, 64, 24] + - [89, 17.0] + - - [64, 30, 1, 30, 64, 64, 64, 30] + - [138, 26.0] + - - [64, 30, 1, 31, 64, 64, 64, 30] + - [142, 24.0] + - - [64, 31, 1, 31, 64, 64, 64, 31] + - [128, 27.0] + - - [64, 32, 1, 32, 64, 64, 64, 32] + - [125, 36.0] + - - [64, 32, 1, 35, 64, 64, 64, 32] + - [144, 16.0] + - - [64, 34, 1, 24, 64, 64, 64, 34] + - [85, 18.0] + - - [64, 34, 1, 34, 64, 64, 64, 34] + - [86, 16.0] + - - [64, 35, 1, 35, 64, 64, 64, 35] + - [88, 17.0] + - - [64, 512, 1, 512, 64, 64, 64, 512] + - [134, 2273.0] + - - [512, 64, 1, 512, 512, 512, 512, 64] + - [132, 2873.0] + - - [1024, 2, 1, 4, 1024, 1024, 1024, 2] + - [138, 5.0] + - - [1024, 2, 1, 32, 1024, 1024, 1024, 2] + - [157, 29.0] + - - [1024, 2, 1, 2048, 1024, 1024, 1024, 2] + - [123, 319.0] + - - [3, 64, 512, 3, 3, 3, 3, 64] + - [96, 54.0] + - - [5, 64, 512, 5, 5, 5, 5, 64] + - [89, 130.0] + - - [5, 64, 960, 5, 5, 5, 5, 64] + - [85, 195.0] + - - [9, 64, 512, 9, 9, 9, 9, 64] + - [116, 442.0] + - - [27, 128, 32768, 27, 27, 27, 27, 128] + - [156, 3963.0] + - - [512, 32, 1, 200, 512, 512, 512, 32] + - [105, 876.0] + - - [512, 32, 1, 1600, 512, 512, 512, 32] + - [99, 2893.0] + - - [1024, 64, 1, 512, 1024, 1024, 1024, 64] + - [110, 4821.0] + - - [1024, 64, 1, 960, 1024, 1024, 1024, 64] + - [112, 5992.0] + - - [14, 64, 10880, 14, 14, 14, 14, 64] + - [141, 2327.0] + - - [15, 64, 10880, 14, 15, 15, 15, 64] + - [90, 2485.0] + - - [15, 64, 7680, 15, 15, 15, 15, 64] + - [90, 2479.0] + - - [15, 64, 10880, 15, 15, 15, 15, 64] + - [90, 2543.0] + - - [17, 64, 7680, 15, 17, 17, 17, 64] + - [90, 2305.0] + - - [17, 64, 6144, 17, 17, 17, 17, 64] + - [90, 2549.0] + - - [17, 64, 7680, 17, 17, 17, 17, 64] + - [141, 2605.0] + - - [21, 64, 6144, 17, 21, 21, 21, 64] + - [94, 2356.0] + - - [21, 64, 6144, 21, 21, 21, 21, 64] + - [137, 2753.0] + - - [24, 64, 4736, 24, 24, 24, 24, 64] + - [85, 3776.0] + - - [30, 64, 2048, 30, 30, 30, 30, 64] + - [122, 4216.0] + - - [30, 64, 2048, 31, 30, 30, 30, 64] + - [145, 4320.0] + - - [31, 64, 2048, 31, 31, 31, 31, 64] + - [122, 4460.0] + - - [64, 14, 10880, 14, 64, 64, 64, 14] + - [115, 2754.0] + - - [64, 15, 10880, 14, 64, 64, 64, 15] + - [139, 2900.0] + - - [64, 15, 7680, 15, 64, 64, 64, 15] + - [102, 2709.0] + - - [64, 15, 10880, 15, 64, 64, 64, 15] + - [139, 2975.0] + - - [64, 17, 7680, 15, 64, 64, 64, 17] + - [87, 3273.0] + - - [64, 17, 6144, 17, 64, 64, 64, 17] + - [120, 3529.0] + - - [64, 17, 7680, 17, 64, 64, 64, 17] + - [143, 3076.0] + - - [64, 21, 6144, 17, 64, 64, 64, 21] + - [131, 3447.0] + - - [64, 21, 6144, 21, 64, 64, 64, 21] + - [90, 4088.0] + - - [64, 24, 4736, 24, 64, 64, 64, 24] + - [87, 5709.0] + - - [64, 30, 2048, 30, 64, 64, 64, 30] + - [115, 6056.0] + - - [64, 30, 2048, 31, 64, 64, 64, 30] + - [87, 6058.0] + - - [64, 31, 2048, 31, 64, 64, 64, 31] + - [87, 6079.0] + - - [27, 64, 1920, 27, 27, 27, 27, 64] + - [104, 3575.0] + - - [27, 64, 1920, 33, 27, 27, 27, 64] + - [94, 4182.0] + - - [64, 27, 1920, 27, 64, 64, 64, 27] + - [102, 5208.0] + - - [64, 27, 1920, 33, 64, 64, 64, 27] + - [139, 5756.0] + - - [1024, 2, 1, 1, 1024, 1024, 1024, 2] + - [84, 1.0] + - - [1024, 2, 1, 512, 1024, 1024, 1024, 2] + - [109, 212.0] + - - [1024, 2, 1, 10, 1024, 1024, 1024, 2] + - [84, 5.0] + - - [1024, 2, 1, 1280, 1024, 1024, 1024, 2] + - [109, 232.0] + - - [1024, 2, 1, 39, 1024, 1024, 1024, 2] + - [84, 17.0] + - - [1024, 2, 1, 40, 1024, 1024, 1024, 2] + - [88, 19.0] + - - [1024, 2, 1, 41, 1024, 1024, 1024, 2] + - [130, 25.0] + - - [1024, 2, 1, 5, 1024, 1024, 1024, 2] + - [126, 4.0] + - - [1024, 2, 1, 2560, 1024, 1024, 1024, 2] + - [109, 288.0] + - - [1024, 2, 1, 6, 1024, 1024, 1024, 2] + - [84, 3.0] + - - [1024, 2, 1, 3072, 1024, 1024, 1024, 2] + - [157, 303.0] + - - [1024, 2, 1, 8, 1024, 1024, 1024, 2] + - [84, 4.0] + - - [1024, 2, 1, 1024, 1024, 1024, 1024, 2] + - [157, 219.0] + - - [1024, 2, 1, 9, 1024, 1024, 1024, 2] + - [119, 10.0] + - - [1024, 2, 1, 1152, 1024, 1024, 1024, 2] + - [123, 229.0] + - - [4, 64, 32768, 4, 4, 4, 4, 64] + - [85, 302.0] + - - [4, 64, 38400, 4, 4, 4, 4, 64] + - [85, 301.0] + - - [64, 4, 32768, 4, 64, 64, 64, 4] + - [87, 308.0] + - - [64, 4, 38400, 4, 64, 64, 64, 4] + - [87, 309.0] + - - [64, 128, 1, 128, 64, 64, 64, 128] + - [153, 448.0] + - - [128, 64, 1, 128, 128, 128, 128, 64] + - [105, 437.0] + - - [5, 64, 1, 5, 5, 5, 5, 64] + - [116, 0.39] + - - [33, 32, 1, 33, 33, 33, 33, 32] + - [89, 8.0] + - - [1024, 2, 1, 16, 1024, 1024, 1024, 2] + - [86, 8.0] + - - [1024, 2, 1, 64, 1024, 1024, 1024, 2] + - [108, 49.0] + - - [256, 128, 1, 3456, 256, 256, 256, 128] + - [148, 4303.0] + - - [256, 128, 1, 4096, 256, 256, 256, 128] + - [99, 4433.0] + - - [256, 128, 1, 864, 256, 256, 256, 128] + - [99, 2993.0] + - - [1024, 2, 1, 80, 1024, 1024, 1024, 2] + - [144, 35.0] + - - [1024, 2, 1, 82, 1024, 1024, 1024, 2] + - [89, 35.0] + - - [1024, 2, 1, 12, 1024, 1024, 1024, 2] + - [84, 6.0] + - - [13, 512, 1, 3456, 13, 13, 13, 512] + - [123, 1113.0] + - - [13, 512, 1, 4096, 13, 13, 13, 512] + - [125, 1142.0] + - - [13, 512, 1, 864, 13, 13, 13, 512] + - [159, 928.0] + - - [64, 24, 6816, 24, 64, 64, 64, 24] + - [127, 5408.0] + - - [64, 26, 6272, 26, 64, 64, 64, 26] + - [143, 6379.0] + - - [1024, 2, 1, 128, 1024, 1024, 1024, 2] + - [155, 99.0] + - - [1024, 2, 1, 96, 1024, 1024, 1024, 2] + - [91, 77.0] + - - [768, 2, 1, 2048, 768, 768, 768, 2] + - [109, 208.0] + - - [1024, 81, 1, 1024, 1024, 1024, 1024, 81] + - [92, 5355.0] + - - [25, 256, 120, 128, 25, 25, 25, 256] + - [85, 4990.0] + - - [25, 256, 18, 128, 25, 25, 25, 256] + - [133, 2150.0] + - - [25, 256, 19, 128, 25, 25, 25, 256] + - [99, 2243.0] + - - [9, 256, 120, 128, 9, 9, 9, 256] + - [84, 2198.0] + - - [9, 256, 18, 128, 9, 9, 9, 256] + - [117, 938.0] + - - [9, 256, 19, 128, 9, 9, 9, 256] + - [89, 950.0] + - - [1024, 2, 1, 20, 1024, 1024, 1024, 2] + - [84, 9.0] +- null +- null +- DeviceEfficiency +... diff --git a/library/src/blas3/Tensile/Logic/asm_full/navi22_Cijk_Ailk_Bjlk_HB_GB.yaml b/library/src/blas3/Tensile/Logic/asm_full/navi22_Cijk_Ailk_Bjlk_HB_GB.yaml new file mode 100644 index 000000000..6a7c379d8 --- /dev/null +++ b/library/src/blas3/Tensile/Logic/asm_full/navi22_Cijk_Ailk_Bjlk_HB_GB.yaml @@ -0,0 +1,27342 @@ +--- +- {MinimumRequiredVersion: 4.28.0} +- navi22 +- gfx1031 +- [Device 73df] +- AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] +- - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 0 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_GB_MT128x128x8_SN_SU0_SUM0_TT8_16_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 64 + LSPA: 8 + LSPB: 16 + LVCA: 16 + LVCB: 8 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 1 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_GB_MT128x64x16_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 2 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_GB_MT128x128x16_SN_SU0_SUM0_TT8_16_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 3 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_GB_MT128x128x16_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 256 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 256 + MacroTileA: 128 + MacroTileB: 256 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 4 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_GB_MT128x256x16_SN_SU0_SUM0_TT8_16_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 64 + LSPA: 8 + LSPB: 16 + LVCA: 16 + LVCB: 8 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 5 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_GB_MT128x64x32_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 6 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_GB_MT128x128x32_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 7 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_GB_MT128x128x8_SN_SU32_SUM3_TT8_16_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 64 + LSPA: 8 + LSPB: 16 + LVCA: 16 + LVCB: 8 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 8 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_GB_MT128x64x16_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 9 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_GB_MT128x128x16_SN_SU32_SUM3_TT8_16_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 10 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_GB_MT128x128x16_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 256 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 256 + MacroTileA: 128 + MacroTileB: 256 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 11 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_GB_MT128x256x16_SN_SU32_SUM3_TT8_16_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 12 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_GB_MT128x128x8_SN_SU0_SUM0_TT8_16_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 256 + LSCB: 128 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 256 + MacroTile1: 128 + MacroTileA: 256 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 256 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 13 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_GB_MT256x128x8_SN_SU0_SUM0_TT16_16_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [16, 16] + ThreadTile0: 16 + ThreadTile1: 16 + ThreadTileA: 16 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 64 + LSPA: 8 + LSPB: 16 + LVCA: 16 + LVCB: 8 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 14 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_GB_MT128x64x16_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 15 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_GB_MT128x128x16_SN_SU0_SUM0_TT8_16_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 16 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_GB_MT128x128x16_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 256 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 256 + MacroTileA: 128 + MacroTileB: 256 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 17 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_GB_MT128x256x16_SN_SU0_SUM0_TT8_16_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 18 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_GB_MT128x128x8_SN_SU32_SUM3_TT8_16_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 256 + LSCB: 128 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 256 + MacroTile1: 128 + MacroTileA: 256 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 256 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 19 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_GB_MT256x128x8_SN_SU32_SUM3_TT16_16_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [16, 16] + ThreadTile0: 16 + ThreadTile1: 16 + ThreadTileA: 16 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 64 + LSPA: 8 + LSPB: 16 + LVCA: 16 + LVCB: 8 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 20 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_GB_MT128x64x16_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 21 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_GB_MT128x128x16_SN_SU32_SUM3_TT8_16_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 22 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_GB_MT128x128x16_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 256 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 256 + MacroTileA: 128 + MacroTileB: 256 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 23 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_GB_MT128x256x16_SN_SU32_SUM3_TT8_16_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 24 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_GB_MT128x128x8_SN_SU0_SUM0_TT8_16_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 256 + LSCB: 128 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 256 + MacroTile1: 128 + MacroTileA: 256 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 256 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 25 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_GB_MT256x128x8_SN_SU0_SUM0_TT16_16_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [16, 16] + ThreadTile0: 16 + ThreadTile1: 16 + ThreadTileA: 16 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 64 + LSPA: 8 + LSPB: 16 + LVCA: 16 + LVCB: 8 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 26 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_GB_MT128x64x16_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 27 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_GB_MT128x128x16_SN_SU0_SUM0_TT8_16_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 28 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_GB_MT128x128x16_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 256 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 256 + MacroTileA: 128 + MacroTileB: 256 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 29 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_GB_MT128x256x16_SN_SU0_SUM0_TT8_16_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 64 + LSPA: 8 + LSPB: 16 + LVCA: 16 + LVCB: 8 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 30 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_GB_MT128x64x32_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 31 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_GB_MT128x128x32_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 32 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_GB_MT128x128x8_SN_SU32_SUM3_TT8_16_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 64 + LSPA: 8 + LSPB: 16 + LVCA: 16 + LVCB: 8 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 33 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_GB_MT128x64x16_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 34 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_GB_MT64x64x8_SN_SU0_SUM0_TT8_8_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 35 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_GB_MT64x64x16_SN_SU0_SUM0_TT8_8_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 64 + LSPA: 8 + LSPB: 16 + LVCA: 16 + LVCB: 8 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 36 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_GB_MT128x64x16_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 37 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_GB_MT128x128x16_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 38 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_GB_MT64x64x32_SN_SU0_SUM0_TT8_8_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 39 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_GB_MT64x64x8_SN_SU32_SUM3_TT8_8_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 40 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_GB_MT64x64x16_SN_SU32_SUM3_TT8_8_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 64 + LSPA: 8 + LSPB: 16 + LVCA: 16 + LVCB: 8 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 41 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_GB_MT128x64x16_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 42 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_GB_MT128x128x16_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 43 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_GB_MT64x64x8_SN_SU0_SUM0_TT8_8_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 44 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_GB_MT64x64x16_SN_SU0_SUM0_TT8_8_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 64 + LSPA: 8 + LSPB: 16 + LVCA: 16 + LVCB: 8 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 45 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_GB_MT128x64x16_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 46 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_GB_MT128x128x16_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 47 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_GB_MT64x64x8_SN_SU32_SUM3_TT8_8_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 48 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_GB_MT64x64x16_SN_SU32_SUM3_TT8_8_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 64 + LSPA: 8 + LSPB: 16 + LVCA: 16 + LVCB: 8 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 49 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_GB_MT128x64x16_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 50 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_GB_MT128x128x16_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 51 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_GB_MT64x64x8_SN_SU0_SUM0_TT8_8_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 52 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_GB_MT64x64x16_SN_SU0_SUM0_TT8_8_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 64 + LSPA: 8 + LSPB: 16 + LVCA: 16 + LVCB: 8 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 53 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_GB_MT128x64x16_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 54 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_GB_MT128x128x16_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 64 + LSPA: 8 + LSPB: 16 + LVCA: 16 + LVCB: 8 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 55 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_GB_MT128x64x32_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 56 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_GB_MT64x64x8_SN_SU32_SUM3_TT8_8_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 57 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_GB_MT64x64x16_SN_SU32_SUM3_TT8_8_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 64 + LSPA: 8 + LSPB: 16 + LVCA: 16 + LVCB: 8 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 58 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_GB_MT128x64x16_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 819 + LdsNumElementsAlignedA: 128 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 256 + LdsOffsetB: 128 + LdsOffsetB_Blk: 384 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 59 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_GB_MT16x16x8_SN_SU0_SUM0_TT2_2_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 60 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_GB_MT64x32x8_SN_SU0_SUM0_TT4_4_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 61 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_GB_MT32x32x8_SN_SU0_SUM0_TT2_2_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 62 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_GB_MT64x64x8_SN_SU0_SUM0_TT4_4_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 63 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_GB_MT16x16x16_SN_SU0_SUM0_TT2_2_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 64 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_GB_MT32x32x16_SN_SU0_SUM0_TT4_4_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 8 + LSPB: 16 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 65 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_GB_MT32x16x16_SN_SU0_SUM0_TT2_2_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 66 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_GB_MT64x32x16_SN_SU0_SUM0_TT4_4_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 67 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_GB_MT32x32x16_SN_SU0_SUM0_TT2_2_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 68 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_GB_MT64x64x16_SN_SU0_SUM0_TT4_4_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 8 + LSPB: 16 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 69 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_GB_MT32x16x32_SN_SU0_SUM0_TT2_2_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 70 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_GB_MT64x32x32_SN_SU0_SUM0_TT4_4_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 71 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_GB_MT32x32x32_SN_SU0_SUM0_TT2_2_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 72 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_GB_MT64x64x32_SN_SU0_SUM0_TT4_4_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 73 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_GB_MT32x32x8_SN_SU32_SUM3_TT4_4_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 74 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_GB_MT64x32x8_SN_SU32_SUM3_TT4_4_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 75 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_GB_MT64x64x8_SN_SU32_SUM3_TT4_4_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 76 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_GB_MT16x16x16_SN_SU32_SUM3_TT2_2_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 77 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_GB_MT32x32x16_SN_SU32_SUM3_TT4_4_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 78 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_GB_MT64x32x16_SN_SU32_SUM3_TT4_4_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 79 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_GB_MT32x32x16_SN_SU32_SUM3_TT2_2_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 8 + LSPB: 16 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 80 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_GB_MT32x16x32_SN_SU32_SUM3_TT2_2_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 81 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_GB_MT64x32x32_SN_SU32_SUM3_TT4_4_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 82 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_GB_MT32x32x32_SN_SU32_SUM3_TT2_2_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 83 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_GB_MT64x32x8_SN_SU0_SUM0_TT4_4_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 84 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_GB_MT32x32x16_SN_SU0_SUM0_TT4_4_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 8 + LSPB: 16 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 85 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_GB_MT32x16x16_SN_SU0_SUM0_TT2_2_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 86 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_GB_MT64x32x16_SN_SU0_SUM0_TT4_4_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 87 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_GB_MT16x16x32_SN_SU0_SUM0_TT2_2_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 88 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_GB_MT64x32x32_SN_SU0_SUM0_TT4_4_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 89 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_GB_MT32x32x32_SN_SU0_SUM0_TT2_2_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 819 + LdsNumElementsAlignedA: 128 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 256 + LdsOffsetB: 128 + LdsOffsetB_Blk: 384 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 90 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_GB_MT16x16x8_SN_SU32_SUM3_TT2_2_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 91 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_GB_MT64x32x8_SN_SU32_SUM3_TT4_4_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 92 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_GB_MT16x16x16_SN_SU32_SUM3_TT2_2_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 93 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_GB_MT32x32x16_SN_SU32_SUM3_TT4_4_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 8 + LSPB: 16 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 94 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_GB_MT32x16x16_SN_SU32_SUM3_TT2_2_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 95 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_GB_MT64x32x16_SN_SU32_SUM3_TT4_4_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 96 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_GB_MT64x32x32_SN_SU32_SUM3_TT4_4_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 97 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_GB_MT32x32x32_SN_SU32_SUM3_TT2_2_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 98 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_GB_MT64x32x8_SN_SU0_SUM0_TT4_4_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 99 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_GB_MT64x64x8_SN_SU0_SUM0_TT4_4_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 100 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_GB_MT32x32x16_SN_SU0_SUM0_TT4_4_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 8 + LSPB: 16 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 101 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_GB_MT32x16x16_SN_SU0_SUM0_TT2_2_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 102 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_GB_MT64x32x16_SN_SU0_SUM0_TT4_4_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 103 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_GB_MT64x64x16_SN_SU0_SUM0_TT4_4_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 8 + LSPB: 16 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 104 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_GB_MT32x16x32_SN_SU0_SUM0_TT2_2_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 105 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_GB_MT64x32x32_SN_SU0_SUM0_TT4_4_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 106 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_GB_MT32x32x32_SN_SU0_SUM0_TT2_2_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 107 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_GB_MT64x64x32_SN_SU0_SUM0_TT4_4_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 108 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_GB_MT64x32x8_SN_SU32_SUM3_TT4_4_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 109 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_GB_MT32x32x16_SN_SU32_SUM3_TT4_4_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 8 + LSPB: 16 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 110 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_GB_MT32x16x16_SN_SU32_SUM3_TT2_2_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 111 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_GB_MT64x32x16_SN_SU32_SUM3_TT4_4_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 112 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_GB_MT32x32x32_SN_SU32_SUM3_TT4_4_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 8 + LSPB: 16 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 113 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_GB_MT32x16x32_SN_SU32_SUM3_TT2_2_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 114 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_GB_MT64x32x32_SN_SU32_SUM3_TT4_4_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 115 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_GB_MT64x64x32_SN_SU32_SUM3_TT4_4_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 +- [2, 3, 0, 1] +- - - [2944, 4288, 1, 1280, 2944, 2944, 2944, 4288] + - [0, 24307.0] + - - [2368, 5888, 1, 256, 2368, 2368, 2368, 5888] + - [12, 23701.0] + - - [1024, 5056, 1, 3328, 1024, 1024, 1024, 5056] + - [23, 24704.0] + - - [5888, 1024, 1, 1280, 5888, 5888, 5888, 1024] + - [15, 24164.0] + - - [5888, 1856, 1, 3328, 5888, 5888, 5888, 1856] + - [27, 24224.0] + - - [5056, 704, 1, 256, 5056, 5056, 5056, 704] + - [8, 21280.0] + - - [5888, 2944, 1, 3328, 5888, 5888, 5888, 2944] + - [27, 25528.0] + - - [1856, 4288, 1, 256, 1856, 1856, 1856, 4288] + - [8, 22117.0] + - - [5056, 5056, 1, 3328, 5056, 5056, 5056, 5056] + - [17, 25021.0] + - - [1408, 5888, 1, 1280, 1408, 1408, 1408, 5888] + - [17, 24438.0] + - - [448, 3584, 1, 3328, 448, 448, 448, 3584] + - [4, 19732.0] + - - [5888, 1408, 1, 1280, 5888, 5888, 5888, 1408] + - [27, 24528.0] + - - [3584, 1856, 1, 3328, 3584, 3584, 3584, 1856] + - [15, 24354.0] + - - [5056, 6784, 1, 1280, 5056, 5056, 5056, 6784] + - [15, 25259.0] + - - [5056, 5056, 1, 1280, 5056, 5056, 5056, 5056] + - [19, 24844.0] + - - [448, 5056, 1, 256, 448, 448, 448, 5056] + - [18, 18076.0] + - - [6784, 448, 1, 256, 6784, 6784, 6784, 448] + - [26, 21697.0] + - - [5888, 704, 1, 1280, 5888, 5888, 5888, 704] + - [26, 22981.0] + - - [3584, 1024, 1, 256, 3584, 3584, 3584, 1024] + - [1, 21993.0] + - - [6784, 4288, 1, 3328, 6784, 6784, 6784, 4288] + - [15, 25042.0] + - - [1856, 2368, 1, 3328, 1856, 1856, 1856, 2368] + - [26, 22757.0] + - - [5888, 2944, 1, 1280, 5888, 5888, 5888, 2944] + - [2, 25429.0] + - - [5888, 1024, 1, 256, 5888, 5888, 5888, 1024] + - [12, 23155.0] + - - [1408, 2944, 1, 256, 1408, 1408, 1408, 2944] + - [12, 22772.0] + - - [6784, 5056, 1, 3328, 6784, 6784, 6784, 5056] + - [29, 25361.0] + - - [5056, 5056, 1, 256, 5056, 5056, 5056, 5056] + - [12, 24026.0] + - - [1024, 3584, 1, 1280, 1024, 1024, 1024, 3584] + - [17, 23050.0] + - - [2368, 2944, 1, 1280, 2368, 2368, 2368, 2944] + - [12, 24116.0] + - - [1408, 4288, 1, 1280, 1408, 1408, 1408, 4288] + - [12, 24054.0] + - - [3584, 4288, 1, 1280, 3584, 3584, 3584, 4288] + - [27, 24773.0] + - - [2368, 704, 1, 1280, 2368, 2368, 2368, 704] + - [1, 20957.0] + - - [5056, 4288, 1, 3328, 5056, 5056, 5056, 4288] + - [13, 24914.0] + - - [3584, 2368, 1, 3328, 3584, 3584, 3584, 2368] + - [15, 24315.0] + - - [6784, 448, 1, 1280, 6784, 6784, 6784, 448] + - [14, 22734.0] + - - [4288, 2944, 1, 256, 4288, 4288, 4288, 2944] + - [10, 23334.0] + - - [5056, 2368, 1, 1280, 5056, 5056, 5056, 2368] + - [27, 24312.0] + - - [448, 3584, 1, 1280, 448, 448, 448, 3584] + - [10, 19276.0] + - - [6784, 5888, 1, 256, 6784, 6784, 6784, 5888] + - [27, 25180.0] + - - [1024, 1408, 1, 256, 1024, 1024, 1024, 1408] + - [1, 20460.0] + - - [2368, 2368, 1, 3328, 2368, 2368, 2368, 2368] + - [14, 22909.0] + - - [5056, 704, 1, 3328, 5056, 5056, 5056, 704] + - [1, 23428.0] + - - [1408, 1856, 1, 256, 1408, 1408, 1408, 1856] + - [26, 20150.0] + - - [5888, 1856, 1, 256, 5888, 5888, 5888, 1856] + - [24, 22916.0] + - - [704, 5888, 1, 256, 704, 704, 704, 5888] + - [3, 19688.0] + - - [4288, 6784, 1, 3328, 4288, 4288, 4288, 6784] + - [27, 25040.0] + - - [3584, 704, 1, 3328, 3584, 3584, 3584, 704] + - [1, 22666.0] + - - [1408, 1408, 1, 256, 1408, 1408, 1408, 1408] + - [14, 18362.0] + - - [448, 4288, 1, 256, 448, 448, 448, 4288] + - [22, 16145.0] + - - [704, 2368, 1, 1280, 704, 704, 704, 2368] + - [16, 19537.0] + - - [1856, 2368, 1, 1280, 1856, 1856, 1856, 2368] + - [26, 22721.0] + - - [1408, 1024, 1, 1280, 1408, 1408, 1408, 1024] + - [1, 22224.0] + - - [6784, 704, 1, 256, 6784, 6784, 6784, 704] + - [33, 22246.0] + - - [1408, 3584, 1, 256, 1408, 1408, 1408, 3584] + - [7, 22873.0] + - - [3584, 4288, 1, 3328, 3584, 3584, 3584, 4288] + - [2, 24932.0] + - - [5888, 1856, 1, 1280, 5888, 5888, 5888, 1856] + - [1, 24000.0] + - - [5056, 1024, 1, 3328, 5056, 5056, 5056, 1024] + - [4, 24804.0] + - - [2368, 3584, 1, 1280, 2368, 2368, 2368, 3584] + - [18, 24091.0] + - - [2944, 3584, 1, 3328, 2944, 2944, 2944, 3584] + - [2, 24860.0] + - - [6784, 2944, 1, 256, 6784, 6784, 6784, 2944] + - [0, 24788.0] + - - [1024, 2368, 1, 256, 1024, 1024, 1024, 2368] + - [26, 21525.0] + - - [4288, 2368, 1, 3328, 4288, 4288, 4288, 2368] + - [9, 23887.0] + - - [1856, 2368, 1, 256, 1856, 1856, 1856, 2368] + - [14, 21687.0] + - - [3584, 6784, 1, 3328, 3584, 3584, 3584, 6784] + - [2, 25386.0] + - - [6784, 1856, 1, 3328, 6784, 6784, 6784, 1856] + - [15, 24466.0] + - - [5056, 4288, 1, 1280, 5056, 5056, 5056, 4288] + - [15, 24777.0] + - - [1408, 5056, 1, 1280, 1408, 1408, 1408, 5056] + - [24, 24557.0] + - - [6784, 5888, 1, 3328, 6784, 6784, 6784, 5888] + - [17, 25710.0] + - - [2368, 5056, 1, 1280, 2368, 2368, 2368, 5056] + - [24, 24238.0] + - - [1024, 5056, 1, 1280, 1024, 1024, 1024, 5056] + - [15, 24285.0] + - - [4288, 1024, 1, 256, 4288, 4288, 4288, 1024] + - [0, 21835.0] + - - [2368, 1408, 1, 256, 2368, 2368, 2368, 1408] + - [3, 21212.0] + - - [5888, 448, 1, 1280, 5888, 5888, 5888, 448] + - [14, 22269.0] + - - [704, 5888, 1, 3328, 704, 704, 704, 5888] + - [11, 22605.0] + - - [1024, 6784, 1, 1280, 1024, 1024, 1024, 6784] + - [15, 24209.0] + - - [6784, 2368, 1, 1280, 6784, 6784, 6784, 2368] + - [15, 24488.0] + - - [3584, 2944, 1, 1280, 3584, 3584, 3584, 2944] + - [9, 24734.0] + - - [2368, 1024, 1, 3328, 2368, 2368, 2368, 1024] + - [4, 22682.0] + - - [1408, 5056, 1, 3328, 1408, 1408, 1408, 5056] + - [23, 24960.0] + - - [1856, 1856, 1, 3328, 1856, 1856, 1856, 1856] + - [14, 22471.0] + - - [2368, 2368, 1, 256, 2368, 2368, 2368, 2368] + - [1, 21073.0] + - - [4288, 4288, 1, 1280, 4288, 4288, 4288, 4288] + - [21, 24635.0] + - - [704, 6784, 1, 3328, 704, 704, 704, 6784] + - [2, 22673.0] + - - [5888, 5888, 1, 3328, 5888, 5888, 5888, 5888] + - [13, 25730.0] + - - [5056, 1024, 1, 1280, 5056, 5056, 5056, 1024] + - [7, 24186.0] + - - [448, 5888, 1, 3328, 448, 448, 448, 5888] + - [1, 20065.0] + - - [5056, 5888, 1, 1280, 5056, 5056, 5056, 5888] + - [29, 25232.0] + - - [448, 6784, 1, 256, 448, 448, 448, 6784] + - [16, 18596.0] + - - [3584, 5888, 1, 256, 3584, 3584, 3584, 5888] + - [24, 24709.0] + - - [2944, 3584, 1, 256, 2944, 2944, 2944, 3584] + - [32, 23938.0] + - - [6784, 2944, 1, 3328, 6784, 6784, 6784, 2944] + - [15, 25594.0] + - - [2944, 5056, 1, 3328, 2944, 2944, 2944, 5056] + - [4, 25215.0] + - - [2048, 7133, 1, 2048, 2048, 2048, 2048, 7133] + - [15, 25187.0] + - - [4288, 5888, 1, 1280, 4288, 4288, 4288, 5888] + - [12, 24877.0] + - - [4288, 4288, 1, 256, 4288, 4288, 4288, 4288] + - [24, 23826.0] + - - [4288, 1856, 1, 1280, 4288, 4288, 4288, 1856] + - [1, 23384.0] + - - [1856, 2944, 1, 3328, 1856, 1856, 1856, 2944] + - [17, 23360.0] + - - [256, 6784, 1, 3328, 256, 256, 256, 6784] + - [1, 22172.0] + - - [5056, 1024, 1, 256, 5056, 5056, 5056, 1024] + - [26, 21433.0] + - - [5056, 1856, 1, 3328, 5056, 5056, 5056, 1856] + - [27, 24224.0] + - - [1856, 1408, 1, 256, 1856, 1856, 1856, 1408] + - [8, 20458.0] + - - [5056, 256, 1, 3328, 5056, 5056, 5056, 256] + - [3, 23400.0] + - - [5056, 3584, 1, 256, 5056, 5056, 5056, 3584] + - [32, 24351.0] + - - [1856, 1024, 1, 1280, 1856, 1856, 1856, 1024] + - [3, 22963.0] + - - [1856, 1856, 1, 1280, 1856, 1856, 1856, 1856] + - [1, 22038.0] + - - [6784, 6784, 1, 1280, 6784, 6784, 6784, 6784] + - [15, 25553.0] + - - [1856, 1024, 1, 3328, 1856, 1856, 1856, 1024] + - [3, 23174.0] + - - [6784, 1024, 1, 256, 6784, 6784, 6784, 1024] + - [24, 22466.0] + - - [5056, 5888, 1, 3328, 5056, 5056, 5056, 5888] + - [25, 25373.0] + - - [1856, 1024, 1, 256, 1856, 1856, 1856, 1024] + - [3, 19462.0] + - - [5056, 1408, 1, 3328, 5056, 5056, 5056, 1408] + - [27, 24888.0] + - - [4288, 1024, 1, 3328, 4288, 4288, 4288, 1024] + - [29, 23981.0] + - - [2944, 1408, 1, 3328, 2944, 2944, 2944, 1408] + - [2, 24311.0] + - - [2944, 4288, 1, 3328, 2944, 2944, 2944, 4288] + - [4, 24616.0] + - - [5056, 2944, 1, 256, 5056, 5056, 5056, 2944] + - [12, 24069.0] + - - [2368, 1856, 1, 256, 2368, 2368, 2368, 1856] + - [33, 21571.0] + - - [1408, 3584, 1, 3328, 1408, 1408, 1408, 3584] + - [4, 24382.0] + - - [2368, 6784, 1, 256, 2368, 2368, 2368, 6784] + - [12, 23742.0] + - - [5056, 1408, 1, 1280, 5056, 5056, 5056, 1408] + - [2, 24507.0] + - - [1408, 5888, 1, 3328, 1408, 1408, 1408, 5888] + - [17, 24828.0] + - - [1856, 5056, 1, 256, 1856, 1856, 1856, 5056] + - [32, 22923.0] + - - [6784, 6784, 1, 256, 6784, 6784, 6784, 6784] + - [12, 25183.0] + - - [2368, 4288, 1, 1280, 2368, 2368, 2368, 4288] + - [2, 23710.0] + - - [3584, 1856, 1, 1280, 3584, 3584, 3584, 1856] + - [12, 24013.0] + - - [5888, 5056, 1, 256, 5888, 5888, 5888, 5056] + - [0, 24529.0] + - - [3584, 448, 1, 256, 3584, 3584, 3584, 448] + - [1, 18890.0] + - - [3072, 7435, 1, 1024, 3072, 3072, 3072, 7435] + - [15, 24990.0] + - - [256, 6784, 1, 256, 256, 256, 256, 6784] + - [1, 20301.0] + - - [1856, 3584, 1, 3328, 1856, 1856, 1856, 3584] + - [27, 24300.0] + - - [5056, 256, 1, 1280, 5056, 5056, 5056, 256] + - [3, 22745.0] + - - [3584, 3584, 1, 256, 3584, 3584, 3584, 3584] + - [12, 24183.0] + - - [6784, 4288, 1, 1280, 6784, 6784, 6784, 4288] + - [2, 24987.0] + - - [704, 5056, 1, 256, 704, 704, 704, 5056] + - [14, 20000.0] + - - [2944, 2368, 1, 1280, 2944, 2944, 2944, 2368] + - [27, 24072.0] + - - [6784, 3584, 1, 256, 6784, 6784, 6784, 3584] + - [27, 24775.0] + - - [704, 6784, 1, 256, 704, 704, 704, 6784] + - [14, 20493.0] + - - [1024, 3584, 1, 3328, 1024, 1024, 1024, 3584] + - [4, 23355.0] + - - [2944, 2944, 1, 3328, 2944, 2944, 2944, 2944] + - [2, 24914.0] + - - [5056, 6784, 1, 256, 5056, 5056, 5056, 6784] + - [18, 24615.0] + - - [1408, 4288, 1, 3328, 1408, 1408, 1408, 4288] + - [27, 24377.0] + - - [6784, 256, 1, 1280, 6784, 6784, 6784, 256] + - [1, 21953.0] + - - [2368, 704, 1, 3328, 2368, 2368, 2368, 704] + - [26, 21102.0] + - - [3584, 6784, 1, 256, 3584, 3584, 3584, 6784] + - [2, 24767.0] + - - [5056, 1856, 1, 256, 5056, 5056, 5056, 1856] + - [8, 23099.0] + - - [704, 4288, 1, 256, 704, 704, 704, 4288] + - [8, 19664.0] + - - [1408, 6784, 1, 1280, 1408, 1408, 1408, 6784] + - [15, 24608.0] + - - [3584, 3584, 1, 1280, 3584, 3584, 3584, 3584] + - [17, 24965.0] + - - [5056, 2368, 1, 3328, 5056, 5056, 5056, 2368] + - [27, 24508.0] + - - [2944, 4288, 1, 256, 2944, 2944, 2944, 4288] + - [27, 23606.0] + - - [1408, 3584, 1, 1280, 1408, 1408, 1408, 3584] + - [29, 24085.0] + - - [1024, 1408, 1, 1280, 1024, 1024, 1024, 1408] + - [1, 22192.0] + - - [2368, 6784, 1, 3328, 2368, 2368, 2368, 6784] + - [27, 24571.0] + - - [5056, 704, 1, 1280, 5056, 5056, 5056, 704] + - [8, 22990.0] + - - [1856, 4288, 1, 3328, 1856, 1856, 1856, 4288] + - [29, 23734.0] + - - [1408, 5888, 1, 256, 1408, 1408, 1408, 5888] + - [0, 23713.0] + - - [4096, 7133, 1, 4096, 4096, 4096, 4096, 7133] + - [13, 25465.0] + - - [3584, 704, 1, 1280, 3584, 3584, 3584, 704] + - [1, 22160.0] + - - [3584, 448, 1, 3328, 3584, 3584, 3584, 448] + - [30, 21950.0] + - - [704, 2368, 1, 3328, 704, 704, 704, 2368] + - [3, 20454.0] + - - [448, 5056, 1, 3328, 448, 448, 448, 5056] + - [4, 21114.0] + - - [4288, 448, 1, 256, 4288, 4288, 4288, 448] + - [14, 17268.0] + - - [448, 5888, 1, 256, 448, 448, 448, 5888] + - [33, 17404.0] + - - [5888, 2368, 1, 256, 5888, 5888, 5888, 2368] + - [24, 23579.0] + - - [6784, 704, 1, 3328, 6784, 6784, 6784, 704] + - [1, 23252.0] + - - [1408, 2944, 1, 3328, 1408, 1408, 1408, 2944] + - [2, 24276.0] + - - [2368, 704, 1, 256, 2368, 2368, 2368, 704] + - [28, 17462.0] + - - [3584, 2368, 1, 256, 3584, 3584, 3584, 2368] + - [26, 23352.0] + - - [5888, 5056, 1, 1280, 5888, 5888, 5888, 5056] + - [13, 25249.0] + - - [3584, 3584, 1, 3328, 3584, 3584, 3584, 3584] + - [17, 25092.0] + - - [5888, 6784, 1, 256, 5888, 5888, 5888, 6784] + - [24, 25201.0] + - - [4288, 2944, 1, 3328, 4288, 4288, 4288, 2944] + - [2, 24574.0] + - - [4288, 704, 1, 1280, 4288, 4288, 4288, 704] + - [1, 22194.0] + - - [256, 5056, 1, 1280, 256, 256, 256, 5056] + - [26, 21394.0] + - - [6784, 5888, 1, 1280, 6784, 6784, 6784, 5888] + - [17, 25690.0] + - - [5888, 4288, 1, 1280, 5888, 5888, 5888, 4288] + - [27, 24952.0] + - - [1408, 1856, 1, 1280, 1408, 1408, 1408, 1856] + - [1, 23235.0] + - - [5888, 448, 1, 3328, 5888, 5888, 5888, 448] + - [1, 22392.0] + - - [704, 5888, 1, 1280, 704, 704, 704, 5888] + - [4, 21898.0] + - - [1024, 6784, 1, 3328, 1024, 1024, 1024, 6784] + - [2, 24371.0] + - - [704, 2944, 1, 1280, 704, 704, 704, 2944] + - [3, 20882.0] + - - [5056, 2944, 1, 3328, 5056, 5056, 5056, 2944] + - [27, 25174.0] + - - [1408, 1408, 1, 3328, 1408, 1408, 1408, 1408] + - [1, 21789.0] + - - [448, 4288, 1, 1280, 448, 448, 448, 4288] + - [16, 19313.0] + - - [3584, 704, 1, 256, 3584, 3584, 3584, 704] + - [14, 19562.0] + - - [3584, 1408, 1, 3328, 3584, 3584, 3584, 1408] + - [27, 23941.0] + - - [2368, 1024, 1, 1280, 2368, 2368, 2368, 1024] + - [6, 21444.0] + - - [1856, 6784, 1, 256, 1856, 1856, 1856, 6784] + - [7, 23504.0] + - - [4288, 448, 1, 3328, 4288, 4288, 4288, 448] + - [8, 22612.0] + - - [4288, 3584, 1, 1280, 4288, 4288, 4288, 3584] + - [29, 24866.0] + - - [1760, 7133, 1, 1760, 1760, 1760, 1760, 7133] + - [29, 24370.0] + - - [5888, 1024, 1, 3328, 5888, 5888, 5888, 1024] + - [9, 24446.0] + - - [704, 6784, 1, 1280, 704, 704, 704, 6784] + - [18, 22015.0] + - - [1024, 2944, 1, 3328, 1024, 1024, 1024, 2944] + - [14, 22690.0] + - - [704, 5056, 1, 1280, 704, 704, 704, 5056] + - [27, 21564.0] + - - [1024, 5888, 1, 1280, 1024, 1024, 1024, 5888] + - [2, 23910.0] + - - [2944, 1856, 1, 256, 2944, 2944, 2944, 1856] + - [14, 22525.0] + - - [3584, 5056, 1, 256, 3584, 3584, 3584, 5056] + - [15, 24194.0] + - - [5888, 5056, 1, 3328, 5888, 5888, 5888, 5056] + - [13, 25387.0] + - - [3584, 6784, 1, 1280, 3584, 3584, 3584, 6784] + - [2, 25287.0] + - - [1856, 5888, 1, 256, 1856, 1856, 1856, 5888] + - [18, 23151.0] + - - [4288, 4288, 1, 3328, 4288, 4288, 4288, 4288] + - [17, 24756.0] + - - [4288, 1408, 1, 1280, 4288, 4288, 4288, 1408] + - [0, 23877.0] + - - [4288, 2368, 1, 256, 4288, 4288, 4288, 2368] + - [14, 23106.0] + - - [2944, 5056, 1, 1280, 2944, 2944, 2944, 5056] + - [17, 25049.0] + - - [6784, 2368, 1, 3328, 6784, 6784, 6784, 2368] + - [2, 24570.0] + - - [4288, 1856, 1, 3328, 4288, 4288, 4288, 1856] + - [15, 23623.0] + - - [1856, 2944, 1, 1280, 1856, 1856, 1856, 2944] + - [9, 22988.0] + - - [2944, 5888, 1, 1280, 2944, 2944, 2944, 5888] + - [2, 25460.0] + - - [3584, 1024, 1, 1280, 3584, 3584, 3584, 1024] + - [4, 23109.0] + - - [1024, 4288, 1, 256, 1024, 1024, 1024, 4288] + - [20, 22339.0] + - - [5888, 3584, 1, 3328, 5888, 5888, 5888, 3584] + - [27, 25404.0] + - - [5056, 3584, 1, 3328, 5056, 5056, 5056, 3584] + - [17, 25277.0] + - - [2368, 1408, 1, 1280, 2368, 2368, 2368, 1408] + - [1, 22592.0] + - - [5056, 2944, 1, 1280, 5056, 5056, 5056, 2944] + - [18, 25029.0] + - - [1024, 6784, 1, 256, 1024, 1024, 1024, 6784] + - [7, 23265.0] + - - [3584, 2944, 1, 256, 3584, 3584, 3584, 2944] + - [12, 24010.0] + - - [3584, 1408, 1, 1280, 3584, 3584, 3584, 1408] + - [12, 23950.0] + - - [5056, 6784, 1, 3328, 5056, 5056, 5056, 6784] + - [25, 25402.0] + - - [3584, 4288, 1, 256, 3584, 3584, 3584, 4288] + - [26, 23553.0] + - - [1856, 6784, 1, 3328, 1856, 1856, 1856, 6784] + - [27, 24476.0] + - - [5056, 1408, 1, 256, 5056, 5056, 5056, 1408] + - [7, 23169.0] + - - [5888, 5888, 1, 256, 5888, 5888, 5888, 5888] + - [25, 25186.0] + - - [4288, 1024, 1, 1280, 4288, 4288, 4288, 1024] + - [0, 23607.0] + - - [448, 6784, 1, 3328, 448, 448, 448, 6784] + - [2, 20683.0] + - - [2944, 1408, 1, 1280, 2944, 2944, 2944, 1408] + - [27, 23575.0] + - - [2944, 1856, 1, 3328, 2944, 2944, 2944, 1856] + - [1, 23561.0] + - - [3584, 5888, 1, 1280, 3584, 3584, 3584, 5888] + - [15, 25349.0] + - - [6784, 1856, 1, 1280, 6784, 6784, 6784, 1856] + - [12, 24336.0] + - - [5888, 256, 1, 3328, 5888, 5888, 5888, 256] + - [3, 22155.0] + - - [1856, 5888, 1, 3328, 1856, 1856, 1856, 5888] + - [15, 24259.0] + - - [3584, 1408, 1, 256, 3584, 3584, 3584, 1408] + - [3, 22800.0] + - - [704, 3584, 1, 3328, 704, 704, 704, 3584] + - [1, 21258.0] + - - [5056, 448, 1, 1280, 5056, 5056, 5056, 448] + - [14, 22210.0] + - - [4288, 704, 1, 256, 4288, 4288, 4288, 704] + - [20, 20510.0] + - - [2944, 1024, 1, 256, 2944, 2944, 2944, 1024] + - [1, 21449.0] + - - [2368, 4288, 1, 3328, 2368, 2368, 2368, 4288] + - [15, 23918.0] + - - [6784, 5056, 1, 256, 6784, 6784, 6784, 5056] + - [0, 24586.0] + - - [3584, 5056, 1, 3328, 3584, 3584, 3584, 5056] + - [17, 25249.0] + - - [4288, 5888, 1, 256, 4288, 4288, 4288, 5888] + - [0, 24262.0] + - - [2944, 6784, 1, 256, 2944, 2944, 2944, 6784] + - [0, 24873.0] + - - [2368, 2368, 1, 1280, 2368, 2368, 2368, 2368] + - [14, 22735.0] + - - [1856, 3584, 1, 1280, 1856, 1856, 1856, 3584] + - [32, 24014.0] + - - [5056, 3584, 1, 1280, 5056, 5056, 5056, 3584] + - [17, 25167.0] + - - [256, 5888, 1, 256, 256, 256, 256, 5888] + - [16, 19688.0] + - - [1856, 1408, 1, 3328, 1856, 1856, 1856, 1408] + - [1, 21918.0] + - - [1024, 4288, 1, 3328, 1024, 1024, 1024, 4288] + - [23, 23870.0] + - - [2944, 2368, 1, 3328, 2944, 2944, 2944, 2368] + - [27, 24408.0] + - - [1024, 1856, 1, 1280, 1024, 1024, 1024, 1856] + - [3, 22463.0] + - - [6784, 1856, 1, 256, 6784, 6784, 6784, 1856] + - [26, 23680.0] + - - [1024, 5888, 1, 256, 1024, 1024, 1024, 5888] + - [7, 23162.0] + - - [1408, 2368, 1, 256, 1408, 1408, 1408, 2368] + - [26, 21852.0] + - - [2944, 704, 1, 3328, 2944, 2944, 2944, 704] + - [1, 22754.0] + - - [2944, 2944, 1, 1280, 2944, 2944, 2944, 2944] + - [27, 24758.0] + - - [6784, 256, 1, 3328, 6784, 6784, 6784, 256] + - [1, 22244.0] + - - [1408, 5056, 1, 256, 1408, 1408, 1408, 5056] + - [26, 22263.0] + - - [5056, 256, 1, 256, 5056, 5056, 5056, 256] + - [10, 16501.0] + - - [1408, 4288, 1, 256, 1408, 1408, 1408, 4288] + - [24, 22407.0] + - - [5888, 2368, 1, 1280, 5888, 5888, 5888, 2368] + - [15, 24542.0] + - - [2368, 5888, 1, 1280, 2368, 2368, 2368, 5888] + - [17, 24577.0] + - - [5888, 256, 1, 1280, 5888, 5888, 5888, 256] + - [16, 21811.0] + - - [2368, 1856, 1, 3328, 2368, 2368, 2368, 1856] + - [26, 22738.0] + - - [2944, 704, 1, 256, 2944, 2944, 2944, 704] + - [1, 19141.0] + - - [2368, 6784, 1, 1280, 2368, 2368, 2368, 6784] + - [2, 24485.0] + - - [1856, 4288, 1, 1280, 1856, 1856, 1856, 4288] + - [15, 23430.0] + - - [704, 3584, 1, 256, 704, 704, 704, 3584] + - [1, 19693.0] + - - [704, 2944, 1, 3328, 704, 704, 704, 2944] + - [3, 21818.0] + - - [1856, 5056, 1, 3328, 1856, 1856, 1856, 5056] + - [17, 24284.0] + - - [3584, 5056, 1, 1280, 3584, 3584, 3584, 5056] + - [15, 25180.0] + - - [2944, 1024, 1, 3328, 2944, 2944, 2944, 1024] + - [4, 23000.0] + - - [1408, 6784, 1, 256, 1408, 1408, 1408, 6784] + - [32, 23796.0] + - - [6784, 1408, 1, 3328, 6784, 6784, 6784, 1408] + - [15, 24748.0] + - - [1024, 2368, 1, 1280, 1024, 1024, 1024, 2368] + - [14, 22950.0] + - - [6784, 2944, 1, 1280, 6784, 6784, 6784, 2944] + - [2, 25496.0] + - - [3584, 448, 1, 1280, 3584, 3584, 3584, 448] + - [5, 21666.0] + - - [2944, 6784, 1, 3328, 2944, 2944, 2944, 6784] + - [27, 25587.0] + - - [448, 5056, 1, 1280, 448, 448, 448, 5056] + - [3, 20309.0] + - - [5888, 704, 1, 256, 5888, 5888, 5888, 704] + - [14, 21097.0] + - - [256, 5888, 1, 3328, 256, 256, 256, 5888] + - [10, 22008.0] + - - [6784, 4288, 1, 256, 6784, 6784, 6784, 4288] + - [27, 24318.0] + - - [5888, 256, 1, 256, 5888, 5888, 5888, 256] + - [16, 19809.0] + - - [6784, 1024, 1, 1280, 6784, 6784, 6784, 1024] + - [17, 24314.0] + - - [2944, 704, 1, 1280, 2944, 2944, 2944, 704] + - [14, 22471.0] + - - [6784, 3584, 1, 1280, 6784, 6784, 6784, 3584] + - [15, 25326.0] + - - [1408, 2944, 1, 1280, 1408, 1408, 1408, 2944] + - [2, 23922.0] + - - [1408, 2368, 1, 3328, 1408, 1408, 1408, 2368] + - [26, 22845.0] + - - [1024, 3584, 1, 256, 1024, 1024, 1024, 3584] + - [1, 20869.0] + - - [2368, 2944, 1, 256, 2368, 2368, 2368, 2944] + - [0, 22236.0] + - - [2944, 5888, 1, 256, 2944, 2944, 2944, 5888] + - [2, 24799.0] + - - [3584, 1856, 1, 256, 3584, 3584, 3584, 1856] + - [26, 23093.0] + - - [704, 4288, 1, 3328, 704, 704, 704, 4288] + - [1, 20894.0] + - - [4288, 2944, 1, 1280, 4288, 4288, 4288, 2944] + - [27, 24407.0] + - - [4288, 5056, 1, 3328, 4288, 4288, 4288, 5056] + - [17, 24944.0] + - - [256, 5056, 1, 3328, 256, 256, 256, 5056] + - [3, 22598.0] + - - [5056, 2368, 1, 256, 5056, 5056, 5056, 2368] + - [32, 23326.0] + - - [4288, 704, 1, 3328, 4288, 4288, 4288, 704] + - [8, 22695.0] + - - [448, 3584, 1, 256, 448, 448, 448, 3584] + - [16, 15797.0] + - - [1024, 1408, 1, 3328, 1024, 1024, 1024, 1408] + - [14, 22460.0] + - - [2560, 7133, 1, 2560, 2560, 2560, 2560, 7133] + - [13, 25398.0] + - - [5888, 3584, 1, 256, 5888, 5888, 5888, 3584] + - [32, 24697.0] + - - [1408, 1856, 1, 3328, 1408, 1408, 1408, 1856] + - [1, 23509.0] + - - [6784, 1408, 1, 1280, 6784, 6784, 6784, 1408] + - [27, 24608.0] + - - [704, 2944, 1, 256, 704, 704, 704, 2944] + - [16, 19507.0] + - - [2944, 5888, 1, 3328, 2944, 2944, 2944, 5888] + - [27, 25544.0] + - - [1408, 6784, 1, 3328, 1408, 1408, 1408, 6784] + - [15, 24740.0] + - - [1408, 1408, 1, 1280, 1408, 1408, 1408, 1408] + - [1, 21589.0] + - - [448, 4288, 1, 3328, 448, 448, 448, 4288] + - [16, 19967.0] + - - [704, 2368, 1, 256, 704, 704, 704, 2368] + - [14, 16364.0] + - - [5888, 2368, 1, 3328, 5888, 5888, 5888, 2368] + - [27, 24673.0] + - - [4288, 5056, 1, 256, 4288, 4288, 4288, 5056] + - [18, 23973.0] + - - [4288, 448, 1, 1280, 4288, 4288, 4288, 448] + - [8, 22224.0] + - - [5888, 704, 1, 3328, 5888, 5888, 5888, 704] + - [1, 23326.0] + - - [4288, 3584, 1, 3328, 4288, 4288, 4288, 3584] + - [17, 25009.0] + - - [6784, 6784, 1, 3328, 6784, 6784, 6784, 6784] + - [15, 25630.0] + - - [704, 5056, 1, 3328, 704, 704, 704, 5056] + - [12, 22354.0] + - - [2368, 2944, 1, 3328, 2368, 2368, 2368, 2944] + - [7, 24366.0] + - - [2368, 3584, 1, 256, 2368, 2368, 2368, 3584] + - [0, 22933.0] + - - [3584, 2368, 1, 1280, 3584, 3584, 3584, 2368] + - [24, 24071.0] + - - [1856, 1856, 1, 256, 1856, 1856, 1856, 1856] + - [8, 21198.0] + - - [4288, 1408, 1, 3328, 4288, 4288, 4288, 1408] + - [32, 24348.0] + - - [4288, 5056, 1, 1280, 4288, 4288, 4288, 5056] + - [29, 24774.0] + - - [5888, 6784, 1, 1280, 5888, 5888, 5888, 6784] + - [25, 25692.0] + - - [5888, 1408, 1, 3328, 5888, 5888, 5888, 1408] + - [15, 24679.0] + - - [256, 5056, 1, 256, 256, 256, 256, 5056] + - [26, 19700.0] + - - [1408, 1024, 1, 256, 1408, 1408, 1408, 1024] + - [1, 20147.0] + - - [2368, 5056, 1, 256, 2368, 2368, 2368, 5056] + - [32, 23404.0] + - - [1024, 5056, 1, 256, 1024, 1024, 1024, 5056] + - [1, 22781.0] + - - [2368, 1408, 1, 3328, 2368, 2368, 2368, 1408] + - [1, 22981.0] + - - [5888, 448, 1, 256, 5888, 5888, 5888, 448] + - [26, 19472.0] + - - [6784, 5056, 1, 1280, 6784, 6784, 6784, 5056] + - [17, 25251.0] + - - [4288, 6784, 1, 1280, 4288, 4288, 4288, 6784] + - [15, 24922.0] + - - [6784, 1408, 1, 256, 6784, 6784, 6784, 1408] + - [0, 23644.0] + - - [5888, 4288, 1, 256, 5888, 5888, 5888, 4288] + - [18, 24224.0] + - - [5056, 5888, 1, 256, 5056, 5056, 5056, 5888] + - [18, 24651.0] + - - [2368, 1024, 1, 256, 2368, 2368, 2368, 1024] + - [10, 20487.0] + - - [1856, 6784, 1, 1280, 1856, 1856, 1856, 6784] + - [27, 24279.0] + - - [4288, 3584, 1, 256, 4288, 4288, 4288, 3584] + - [0, 24140.0] + - - [5056, 1856, 1, 1280, 5056, 5056, 5056, 1856] + - [15, 23978.0] + - - [1408, 1024, 1, 3328, 1408, 1408, 1408, 1024] + - [1, 22572.0] + - - [5888, 3584, 1, 1280, 5888, 5888, 5888, 3584] + - [15, 25329.0] + - - [1024, 2944, 1, 256, 1024, 1024, 1024, 2944] + - [14, 21545.0] + - - [448, 6784, 1, 1280, 448, 448, 448, 6784] + - [12, 20473.0] + - - [3584, 1024, 1, 3328, 3584, 3584, 3584, 1024] + - [17, 23208.0] + - - [2944, 1856, 1, 1280, 2944, 2944, 2944, 1856] + - [14, 23456.0] + - - [2368, 3584, 1, 3328, 2368, 2368, 2368, 3584] + - [2, 24337.0] + - - [3584, 5888, 1, 3328, 3584, 3584, 3584, 5888] + - [15, 25406.0] + - - [2944, 3584, 1, 1280, 2944, 2944, 2944, 3584] + - [12, 24625.0] + - - [1856, 5888, 1, 1280, 1856, 1856, 1856, 5888] + - [2, 24063.0] + - - [5056, 448, 1, 3328, 5056, 5056, 5056, 448] + - [1, 23119.0] + - - [4288, 1408, 1, 256, 4288, 4288, 4288, 1408] + - [31, 22017.0] + - - [4288, 2368, 1, 1280, 4288, 4288, 4288, 2368] + - [2, 23742.0] + - - [2944, 5056, 1, 256, 2944, 2944, 2944, 5056] + - [32, 24163.0] + - - [6784, 2368, 1, 256, 6784, 6784, 6784, 2368] + - [26, 23758.0] + - - [4288, 1856, 1, 256, 4288, 4288, 4288, 1856] + - [14, 22718.0] + - - [1856, 2944, 1, 256, 1856, 1856, 1856, 2944] + - [1, 21918.0] + - - [1856, 1408, 1, 1280, 1856, 1856, 1856, 1408] + - [1, 21814.0] + - - [1024, 4288, 1, 1280, 1024, 1024, 1024, 4288] + - [26, 23125.0] + - - [2368, 5056, 1, 3328, 2368, 2368, 2368, 5056] + - [29, 24526.0] + - - [1024, 1856, 1, 3328, 1024, 1024, 1024, 1856] + - [3, 23184.0] + - - [704, 3584, 1, 1280, 704, 704, 704, 3584] + - [1, 20689.0] + - - [4288, 6784, 1, 256, 4288, 4288, 4288, 6784] + - [12, 24410.0] + - - [3584, 2944, 1, 3328, 3584, 3584, 3584, 2944] + - [21, 24874.0] + - - [5888, 2944, 1, 256, 5888, 5888, 5888, 2944] + - [7, 24788.0] + - - [5056, 4288, 1, 256, 5056, 5056, 5056, 4288] + - [12, 24012.0] + - - [6784, 1024, 1, 3328, 6784, 6784, 6784, 1024] + - [4, 24498.0] + - - [5888, 5888, 1, 1280, 5888, 5888, 5888, 5888] + - [13, 25657.0] + - - [448, 5888, 1, 1280, 448, 448, 448, 5888] + - [1, 19651.0] + - - [2944, 1408, 1, 256, 2944, 2944, 2944, 1408] + - [3, 21542.0] + - - [1024, 2944, 1, 1280, 1024, 1024, 1024, 2944] + - [14, 22238.0] + - - [2368, 5888, 1, 3328, 2368, 2368, 2368, 5888] + - [4, 24744.0] + - - [2368, 1856, 1, 1280, 2368, 2368, 2368, 1856] + - [1, 22647.0] + - - [5888, 4288, 1, 3328, 5888, 5888, 5888, 4288] + - [2, 25021.0] + - - [6784, 704, 1, 1280, 6784, 6784, 6784, 704] + - [1, 22960.0] + - - [5056, 448, 1, 256, 5056, 5056, 5056, 448] + - [7, 16001.0] + - - [1856, 5056, 1, 1280, 1856, 1856, 1856, 5056] + - [27, 23988.0] + - - [2944, 1024, 1, 1280, 2944, 2944, 2944, 1024] + - [14, 22672.0] + - - [2368, 4288, 1, 256, 2368, 2368, 2368, 4288] + - [8, 22770.0] + - - [1024, 2368, 1, 3328, 1024, 1024, 1024, 2368] + - [1, 23185.0] + - - [4288, 5888, 1, 3328, 4288, 4288, 4288, 5888] + - [27, 25019.0] + - - [2944, 6784, 1, 1280, 2944, 2944, 2944, 6784] + - [2, 25455.0] + - - [256, 6784, 1, 1280, 256, 256, 256, 6784] + - [14, 21984.0] + - - [1856, 3584, 1, 256, 1856, 1856, 1856, 3584] + - [29, 21469.0] + - - [256, 5888, 1, 1280, 256, 256, 256, 5888] + - [28, 21132.0] + - - [7680, 5481, 1, 2560, 7680, 7680, 7680, 5481] + - [15, 25550.0] + - - [2944, 2368, 1, 256, 2944, 2944, 2944, 2368] + - [1, 22625.0] + - - [1024, 1856, 1, 256, 1024, 1024, 1024, 1856] + - [26, 20634.0] + - - [6784, 3584, 1, 3328, 6784, 6784, 6784, 3584] + - [15, 25393.0] + - - [1024, 5888, 1, 3328, 1024, 1024, 1024, 5888] + - [2, 24393.0] + - - [1408, 2368, 1, 1280, 1408, 1408, 1408, 2368] + - [1, 22681.0] + - - [2944, 2944, 1, 256, 2944, 2944, 2944, 2944] + - [7, 23132.0] + - - [6784, 256, 1, 256, 6784, 6784, 6784, 256] + - [1, 17985.0] + - - [5888, 1408, 1, 256, 5888, 5888, 5888, 1408] + - [0, 22875.0] + - - [5888, 6784, 1, 3328, 5888, 5888, 5888, 6784] + - [25, 25771.0] + - - [704, 4288, 1, 1280, 704, 704, 704, 4288] + - [1, 20449.0] + - - [6784, 448, 1, 3328, 6784, 6784, 6784, 448] + - [26, 22831.0] + - - [3136, 256, 64, 64, 3136, 3136, 3136, 256] + - [31, 15826.0] + - - [784, 512, 64, 128, 784, 784, 784, 512] + - [7, 20825.0] + - - [784, 128, 64, 512, 784, 784, 784, 128] + - [7, 20812.0] + - - [196, 256, 128, 1024, 196, 196, 196, 256] + - [7, 18525.0] + - - [196, 256, 64, 1024, 196, 196, 196, 256] + - [12, 17987.0] + - - [196, 1024, 128, 256, 196, 196, 196, 1024] + - [12, 18220.0] + - - [784, 128, 256, 512, 784, 784, 784, 128] + - [0, 21974.0] + - - [3136, 256, 256, 64, 3136, 3136, 3136, 256] + - [0, 14335.0] + - - [784, 128, 128, 512, 784, 784, 784, 128] + - [12, 21600.0] + - - [784, 512, 128, 128, 784, 784, 784, 512] + - [0, 21300.0] + - - [784, 512, 256, 128, 784, 784, 784, 512] + - [32, 21569.0] + - - [196, 1024, 64, 256, 196, 196, 196, 1024] + - [12, 17817.0] + - - [196, 1024, 256, 256, 196, 196, 196, 1024] + - [0, 18504.0] + - - [196, 256, 256, 1024, 196, 196, 196, 256] + - [2, 18937.0] + - - [3136, 256, 128, 64, 3136, 3136, 3136, 256] + - [0, 18684.0] + - - [1024, 1024, 1, 3328, 1024, 1024, 1024, 1024] + - [36, 20341.0] + - - [64, 6784, 1, 256, 64, 64, 64, 6784] + - [51, 12029.0] + - - [128, 6784, 1, 3328, 128, 128, 128, 6784] + - [46, 20522.0] + - - [256, 4288, 1, 3328, 256, 256, 256, 4288] + - [36, 21599.0] + - - [704, 1856, 1, 3328, 704, 704, 704, 1856] + - [52, 21375.0] + - - [448, 1024, 1, 1280, 448, 448, 448, 1024] + - [51, 16348.0] + - - [2368, 128, 1, 256, 2368, 2368, 2368, 128] + - [39, 7791.0] + - - [256, 1856, 1, 1280, 256, 256, 256, 1856] + - [35, 16699.0] + - - [448, 704, 1, 1280, 448, 448, 448, 704] + - [44, 14306.0] + - - [128, 3584, 1, 1280, 128, 128, 128, 3584] + - [36, 16248.0] + - - [4288, 256, 1, 256, 4288, 4288, 4288, 256] + - [51, 15717.0] + - - [5888, 64, 1, 3328, 5888, 5888, 5888, 64] + - [52, 15560.0] + - - [2944, 256, 1, 3328, 2944, 2944, 2944, 256] + - [53, 20053.0] + - - [256, 4288, 1, 1280, 256, 256, 256, 4288] + - [36, 20997.0] + - - [1408, 448, 1, 1280, 1408, 1408, 1408, 448] + - [36, 19215.0] + - - [6784, 128, 1, 1280, 6784, 6784, 6784, 128] + - [50, 19905.0] + - - [2368, 128, 1, 3328, 2368, 2368, 2368, 128] + - [42, 16418.0] + - - [2944, 128, 1, 256, 2944, 2944, 2944, 128] + - [34, 8691.0] + - - [448, 1408, 1, 256, 448, 448, 448, 1408] + - [51, 12655.0] + - - [64, 5056, 1, 3328, 64, 64, 64, 5056] + - [52, 16722.0] + - - [2368, 256, 1, 1280, 2368, 2368, 2368, 256] + - [52, 17076.0] + - - [256, 3584, 1, 3328, 256, 256, 256, 3584] + - [46, 21172.0] + - - [5056, 64, 1, 1280, 5056, 5056, 5056, 64] + - [35, 14594.0] + - - [1024, 704, 1, 256, 1024, 1024, 1024, 704] + - [39, 12710.0] + - - [4288, 128, 1, 1280, 4288, 4288, 4288, 128] + - [44, 15675.0] + - - [5888, 64, 1, 256, 5888, 5888, 5888, 64] + - [39, 8492.0] + - - [1856, 256, 1, 1280, 1856, 1856, 1856, 256] + - [52, 16572.0] + - - [64, 5888, 1, 3328, 64, 64, 64, 5888] + - [44, 15498.0] + - - [256, 1408, 1, 3328, 256, 256, 256, 1408] + - [44, 14322.0] + - - [6784, 128, 1, 3328, 6784, 6784, 6784, 128] + - [54, 20046.0] + - - [704, 704, 1, 3328, 704, 704, 704, 704] + - [44, 15733.0] + - - [3584, 256, 1, 3328, 3584, 3584, 3584, 256] + - [54, 21152.0] + - - [128, 3584, 1, 3328, 128, 128, 128, 3584] + - [58, 18042.0] + - - [128, 2944, 1, 1280, 128, 128, 128, 2944] + - [35, 13473.0] + - - [448, 1856, 1, 1280, 448, 448, 448, 1856] + - [44, 17732.0] + - - [3584, 128, 1, 256, 3584, 3584, 3584, 128] + - [39, 10107.0] + - - [448, 1408, 1, 3328, 448, 448, 448, 1408] + - [52, 19838.0] + - - [256, 3584, 1, 256, 256, 256, 256, 3584] + - [47, 14428.0] + - - [256, 2944, 1, 3328, 256, 256, 256, 2944] + - [36, 19546.0] + - - [1408, 704, 1, 256, 1408, 1408, 1408, 704] + - [34, 14753.0] + - - [448, 2944, 1, 3328, 448, 448, 448, 2944] + - [35, 20734.0] + - - [64, 5888, 1, 256, 64, 64, 64, 5888] + - [48, 9330.0] + - - [448, 2368, 1, 1280, 448, 448, 448, 2368] + - [34, 18981.0] + - - [128, 4288, 1, 3328, 128, 128, 128, 4288] + - [46, 17496.0] + - - [256, 2368, 1, 256, 256, 256, 256, 2368] + - [56, 12200.0] + - - [1024, 448, 1, 3328, 1024, 1024, 1024, 448] + - [35, 17949.0] + - - [1856, 704, 1, 1280, 1856, 1856, 1856, 704] + - [48, 20653.0] + - - [1024, 1024, 1, 1280, 1024, 1024, 1024, 1024] + - [36, 19279.0] + - - [256, 2944, 1, 256, 256, 256, 256, 2944] + - [39, 13036.0] + - - [128, 6784, 1, 1280, 128, 128, 128, 6784] + - [46, 18788.0] + - - [1408, 704, 1, 3328, 1408, 1408, 1408, 704] + - [35, 19603.0] + - - [128, 5888, 1, 1280, 128, 128, 128, 5888] + - [58, 18168.0] + - - [704, 1408, 1, 3328, 704, 704, 704, 1408] + - [44, 19620.0] + - - [6784, 128, 1, 256, 6784, 6784, 6784, 128] + - [43, 13473.0] + - - [704, 448, 1, 256, 704, 704, 704, 448] + - [39, 8239.0] + - - [256, 1856, 1, 3328, 256, 256, 256, 1856] + - [36, 18568.0] + - - [128, 4288, 1, 256, 128, 128, 128, 4288] + - [56, 10629.0] + - - [64, 6784, 1, 3328, 64, 64, 64, 6784] + - [48, 17100.0] + - - [2944, 256, 1, 1280, 2944, 2944, 2944, 256] + - [41, 18298.0] + - - [1856, 704, 1, 256, 1856, 1856, 1856, 704] + - [43, 16843.0] + - - [1408, 448, 1, 3328, 1408, 1408, 1408, 448] + - [53, 19767.0] + - - [2368, 256, 1, 256, 2368, 2368, 2368, 256] + - [34, 12336.0] + - - [704, 1856, 1, 256, 704, 704, 704, 1856] + - [34, 16843.0] + - - [5888, 64, 1, 1280, 5888, 5888, 5888, 64] + - [48, 13413.0] + - - [256, 2368, 1, 1280, 256, 256, 256, 2368] + - [53, 17421.0] + - - [2944, 448, 1, 256, 2944, 2944, 2944, 448] + - [36, 15926.0] + - - [2368, 128, 1, 1280, 2368, 2368, 2368, 128] + - [46, 13886.0] + - - [64, 5056, 1, 1280, 64, 64, 64, 5056] + - [52, 14523.0] + - - [704, 448, 1, 3328, 704, 704, 704, 448] + - [52, 16524.0] + - - [5056, 64, 1, 3328, 5056, 5056, 5056, 64] + - [57, 16758.0] + - - [2368, 448, 1, 1280, 2368, 2368, 2368, 448] + - [48, 19082.0] + - - [128, 3584, 1, 256, 128, 128, 128, 3584] + - [51, 10393.0] + - - [1856, 448, 1, 3328, 1856, 1856, 1856, 448] + - [35, 18779.0] + - - [128, 5056, 1, 256, 128, 128, 128, 5056] + - [43, 12457.0] + - - [4288, 256, 1, 1280, 4288, 4288, 4288, 256] + - [36, 19768.0] + - - [704, 704, 1, 256, 704, 704, 704, 704] + - [39, 10006.0] + - - [4288, 128, 1, 3328, 4288, 4288, 4288, 128] + - [37, 17547.0] + - - [256, 1408, 1, 1280, 256, 256, 256, 1408] + - [51, 12916.0] + - - [6784, 64, 1, 3328, 6784, 6784, 6784, 64] + - [55, 17430.0] + - - [128, 2944, 1, 3328, 128, 128, 128, 2944] + - [52, 14908.0] + - - [2944, 448, 1, 3328, 2944, 2944, 2944, 448] + - [35, 20675.0] + - - [2368, 448, 1, 3328, 2368, 2368, 2368, 448] + - [53, 20161.0] + - - [5056, 64, 1, 256, 5056, 5056, 5056, 64] + - [56, 7904.0] + - - [128, 5056, 1, 3328, 128, 128, 128, 5056] + - [46, 20323.0] + - - [6784, 64, 1, 256, 6784, 6784, 6784, 64] + - [43, 9733.0] + - - [128, 2368, 1, 256, 128, 128, 128, 2368] + - [39, 7563.0] + - - [3584, 256, 1, 256, 3584, 3584, 3584, 256] + - [39, 14305.0] + - - [128, 2944, 1, 256, 128, 128, 128, 2944] + - [39, 8433.0] + - - [3584, 128, 1, 3328, 3584, 3584, 3584, 128] + - [40, 18448.0] + - - [1024, 448, 1, 1280, 1024, 1024, 1024, 448] + - [43, 16357.0] + - - [5888, 128, 1, 3328, 5888, 5888, 5888, 128] + - [53, 19546.0] + - - [1408, 704, 1, 1280, 1408, 1408, 1408, 704] + - [48, 18825.0] + - - [448, 1408, 1, 1280, 448, 448, 448, 1408] + - [51, 17895.0] + - - [704, 1408, 1, 1280, 704, 704, 704, 1408] + - [35, 18626.0] + - - [448, 2944, 1, 256, 448, 448, 448, 2944] + - [51, 16535.0] + - - [448, 2368, 1, 256, 448, 448, 448, 2368] + - [34, 15121.0] + - - [64, 5056, 1, 256, 64, 64, 64, 5056] + - [34, 7684.0] + - - [5056, 128, 1, 3328, 5056, 5056, 5056, 128] + - [37, 20866.0] + - - [448, 704, 1, 256, 448, 448, 448, 704] + - [39, 8090.0] + - - [1856, 256, 1, 3328, 1856, 1856, 1856, 256] + - [40, 18585.0] + - - [2944, 128, 1, 3328, 2944, 2944, 2944, 128] + - [49, 15084.0] + - - [64, 6784, 1, 1280, 64, 64, 64, 6784] + - [48, 15243.0] + - - [704, 1024, 1, 1280, 704, 704, 704, 1024] + - [53, 17732.0] + - - [256, 4288, 1, 256, 256, 256, 256, 4288] + - [57, 16707.0] + - - [256, 2368, 1, 3328, 256, 256, 256, 2368] + - [46, 19648.0] + - - [1408, 256, 1, 1280, 1408, 1408, 1408, 256] + - [35, 13756.0] + - - [704, 448, 1, 1280, 704, 704, 704, 448] + - [52, 15539.0] + - - [1024, 704, 1, 1280, 1024, 1024, 1024, 704] + - [36, 18243.0] + - - [256, 1856, 1, 256, 256, 256, 256, 1856] + - [39, 13515.0] + - - [704, 1856, 1, 1280, 704, 704, 704, 1856] + - [43, 21251.0] + - - [1408, 256, 1, 3328, 1408, 1408, 1408, 256] + - [52, 14701.0] + - - [5888, 128, 1, 256, 5888, 5888, 5888, 128] + - [39, 15361.0] + - - [2368, 448, 1, 256, 2368, 2368, 2368, 448] + - [56, 17276.0] + - - [4288, 256, 1, 3328, 4288, 4288, 4288, 256] + - [45, 21083.0] + - - [2944, 256, 1, 256, 2944, 2944, 2944, 256] + - [34, 13161.0] + - - [1408, 448, 1, 256, 1408, 1408, 1408, 448] + - [43, 12422.0] + - - [6784, 64, 1, 1280, 6784, 6784, 6784, 64] + - [58, 15226.0] + - - [448, 1024, 1, 3328, 448, 448, 448, 1024] + - [52, 17819.0] + - - [2944, 448, 1, 1280, 2944, 2944, 2944, 448] + - [44, 19852.0] + - - [5056, 128, 1, 256, 5056, 5056, 5056, 128] + - [34, 12686.0] + - - [448, 1024, 1, 256, 448, 448, 448, 1024] + - [39, 10393.0] + - - [128, 5056, 1, 1280, 128, 128, 128, 5056] + - [53, 18515.0] + - - [1408, 256, 1, 256, 1408, 1408, 1408, 256] + - [47, 10253.0] + - - [128, 5888, 1, 3328, 128, 128, 128, 5888] + - [53, 19859.0] + - - [3584, 128, 1, 1280, 3584, 3584, 3584, 128] + - [36, 17090.0] + - - [4288, 128, 1, 256, 4288, 4288, 4288, 128] + - [51, 13132.0] + - - [2368, 256, 1, 3328, 2368, 2368, 2368, 256] + - [37, 19637.0] + - - [5888, 128, 1, 1280, 5888, 5888, 5888, 128] + - [41, 19156.0] + - - [256, 3584, 1, 1280, 256, 256, 256, 3584] + - [46, 19865.0] + - - [128, 5888, 1, 256, 128, 128, 128, 5888] + - [56, 12811.0] + - - [1024, 1024, 1, 256, 1024, 1024, 1024, 1024] + - [43, 14847.0] + - - [1024, 1024, 1, 1024, 1024, 1024, 1024, 1024] + - [36, 19753.0] + - - [64, 5888, 1, 1280, 64, 64, 64, 5888] + - [40, 14187.0] + - - [704, 1024, 1, 256, 704, 704, 704, 1024] + - [43, 12816.0] + - - [704, 704, 1, 1280, 704, 704, 704, 704] + - [35, 14398.0] + - - [128, 2368, 1, 1280, 128, 128, 128, 2368] + - [53, 13748.0] + - - [3584, 256, 1, 1280, 3584, 3584, 3584, 256] + - [46, 19831.0] + - - [5056, 128, 1, 1280, 5056, 5056, 5056, 128] + - [52, 18150.0] + - - [448, 1856, 1, 3328, 448, 448, 448, 1856] + - [46, 18720.0] + - - [1024, 448, 1, 256, 1024, 1024, 1024, 448] + - [43, 10302.0] + - - [2944, 128, 1, 1280, 2944, 2944, 2944, 128] + - [45, 13310.0] + - - [256, 2944, 1, 1280, 256, 256, 256, 2944] + - [58, 18195.0] + - - [704, 1024, 1, 3328, 704, 704, 704, 1024] + - [36, 18615.0] + - - [1856, 448, 1, 1280, 1856, 1856, 1856, 448] + - [35, 18420.0] + - - [128, 6784, 1, 256, 128, 128, 128, 6784] + - [37, 15480.0] + - - [704, 1408, 1, 256, 704, 704, 704, 1408] + - [44, 15781.0] + - - [256, 1408, 1, 256, 256, 256, 256, 1408] + - [42, 8358.0] + - - [448, 2944, 1, 1280, 448, 448, 448, 2944] + - [35, 20443.0] + - - [1856, 256, 1, 256, 1856, 1856, 1856, 256] + - [51, 13396.0] + - - [128, 2368, 1, 3328, 128, 128, 128, 2368] + - [36, 16558.0] + - - [448, 2368, 1, 3328, 448, 448, 448, 2368] + - [34, 19797.0] + - - [1856, 448, 1, 256, 1856, 1856, 1856, 448] + - [41, 14115.0] + - - [1024, 704, 1, 3328, 1024, 1024, 1024, 704] + - [53, 18699.0] + - - [128, 4288, 1, 1280, 128, 128, 128, 4288] + - [53, 15873.0] + - - [448, 704, 1, 3328, 448, 448, 448, 704] + - [44, 16452.0] + - - [448, 1856, 1, 256, 448, 448, 448, 1856] + - [34, 13506.0] + - - [1856, 704, 1, 3328, 1856, 1856, 1856, 704] + - [52, 21368.0] + - - [3136, 64, 128, 64, 3136, 3136, 3136, 64] + - [51, 20870.0] + - - [3136, 64, 64, 256, 3136, 3136, 3136, 64] + - [34, 20655.0] + - - [3136, 64, 256, 256, 3136, 3136, 3136, 64] + - [38, 16696.0] + - - [3136, 64, 128, 256, 3136, 3136, 3136, 64] + - [34, 17461.0] + - - [3136, 64, 64, 64, 3136, 3136, 3136, 64] + - [43, 20593.0] + - - [3136, 64, 256, 64, 3136, 3136, 3136, 64] + - [34, 14664.0] + - - [2368, 64, 1, 3328, 2368, 2368, 2368, 64] + - [68, 11344.0] + - - [1408, 64, 1, 1280, 1408, 1408, 1408, 64] + - [95, 6974.0] + - - [2944, 64, 1, 256, 2944, 2944, 2944, 64] + - [74, 5729.0] + - - [1024, 256, 1, 3328, 1024, 1024, 1024, 256] + - [102, 11077.0] + - - [1856, 64, 1, 1280, 1856, 1856, 1856, 64] + - [109, 8933.0] + - - [704, 128, 1, 1280, 704, 704, 704, 128] + - [78, 6155.0] + - - [4288, 64, 1, 3328, 4288, 4288, 4288, 64] + - [66, 11748.0] + - - [1856, 128, 1, 256, 1856, 1856, 1856, 128] + - [86, 8840.0] + - - [2944, 64, 1, 1280, 2944, 2944, 2944, 64] + - [78, 9966.0] + - - [64, 3584, 1, 3328, 64, 64, 64, 3584] + - [107, 11496.0] + - - [1024, 256, 1, 256, 1024, 1024, 1024, 256] + - [60, 6820.0] + - - [448, 448, 1, 256, 448, 448, 448, 448] + - [91, 6375.0] + - - [128, 1024, 1, 3328, 128, 128, 128, 1024] + - [72, 9811.0] + - - [64, 1856, 1, 1280, 64, 64, 64, 1856] + - [112, 8590.0] + - - [1024, 128, 1, 1280, 1024, 1024, 1024, 128] + - [103, 8176.0] + - - [448, 256, 1, 3328, 448, 448, 448, 256] + - [88, 9233.0] + - - [128, 704, 1, 1280, 128, 128, 128, 704] + - [105, 6817.0] + - - [1856, 128, 1, 3328, 1856, 1856, 1856, 128] + - [83, 12277.0] + - - [256, 448, 1, 256, 256, 256, 256, 448] + - [100, 4813.0] + - - [256, 1024, 1, 256, 256, 256, 256, 1024] + - [60, 8604.0] + - - [448, 448, 1, 3328, 448, 448, 448, 448] + - [102, 12216.0] + - - [1408, 128, 1, 1280, 1408, 1408, 1408, 128] + - [109, 10720.0] + - - [128, 1856, 1, 1280, 128, 128, 128, 1856] + - [75, 11678.0] + - - [64, 1408, 1, 3328, 64, 64, 64, 1408] + - [112, 7383.0] + - - [256, 448, 1, 3328, 256, 256, 256, 448] + - [81, 9697.0] + - - [64, 2368, 1, 1280, 64, 64, 64, 2368] + - [115, 10125.0] + - - [2368, 64, 1, 256, 2368, 2368, 2368, 64] + - [66, 6712.0] + - - [4288, 64, 1, 1280, 4288, 4288, 4288, 64] + - [111, 11555.0] + - - [128, 1024, 1, 1280, 128, 128, 128, 1024] + - [103, 8074.0] + - - [1856, 64, 1, 256, 1856, 1856, 1856, 64] + - [96, 4420.0] + - - [704, 128, 1, 256, 704, 704, 704, 128] + - [108, 3160.0] + - - [448, 256, 1, 1280, 448, 448, 448, 256] + - [66, 7630.0] + - - [1856, 128, 1, 1280, 1856, 1856, 1856, 128] + - [62, 10938.0] + - - [64, 3584, 1, 256, 64, 64, 64, 3584] + - [99, 8843.0] + - - [64, 1856, 1, 256, 64, 64, 64, 1856] + - [68, 3949.0] + - - [256, 1024, 1, 1280, 256, 256, 256, 1024] + - [114, 10525.0] + - - [3584, 64, 1, 1280, 3584, 3584, 3584, 64] + - [78, 11301.0] + - - [1408, 128, 1, 3328, 1408, 1408, 1408, 128] + - [64, 11317.0] + - - [64, 4288, 1, 3328, 64, 64, 64, 4288] + - [70, 11354.0] + - - [256, 704, 1, 256, 256, 256, 256, 704] + - [111, 5532.0] + - - [128, 1024, 1, 256, 128, 128, 128, 1024] + - [68, 4474.0] + - - [64, 2944, 1, 256, 64, 64, 64, 2944] + - [75, 5661.0] + - - [64, 1408, 1, 1280, 64, 64, 64, 1408] + - [68, 6317.0] + - - [704, 128, 1, 3328, 704, 704, 704, 128] + - [88, 7293.0] + - - [1408, 128, 1, 256, 1408, 1408, 1408, 128] + - [60, 5402.0] + - - [64, 2944, 1, 1280, 64, 64, 64, 2944] + - [86, 9740.0] + - - [704, 256, 1, 1280, 704, 704, 704, 256] + - [84, 10030.0] + - - [256, 448, 1, 1280, 256, 256, 256, 448] + - [102, 7694.0] + - - [64, 2368, 1, 3328, 64, 64, 64, 2368] + - [68, 10763.0] + - - [256, 704, 1, 3328, 256, 256, 256, 704] + - [84, 11291.0] + - - [64, 2944, 1, 3328, 64, 64, 64, 2944] + - [66, 11013.0] + - - [128, 1408, 1, 256, 128, 128, 128, 1408] + - [91, 5532.0] + - - [128, 1408, 1, 3328, 128, 128, 128, 1408] + - [86, 10612.0] + - - [1408, 64, 1, 256, 1408, 1408, 1408, 64] + - [65, 3231.0] + - - [64, 2368, 1, 256, 64, 64, 64, 2368] + - [103, 5078.0] + - - [1024, 128, 1, 3328, 1024, 1024, 1024, 128] + - [93, 9617.0] + - - [2368, 64, 1, 1280, 2368, 2368, 2368, 64] + - [77, 9463.0] + - - [4288, 64, 1, 256, 4288, 4288, 4288, 64] + - [73, 7395.0] + - - [64, 4288, 1, 1280, 64, 64, 64, 4288] + - [86, 10523.0] + - - [1408, 64, 1, 3328, 1408, 1408, 1408, 64] + - [95, 7717.0] + - - [448, 448, 1, 1280, 448, 448, 448, 448] + - [66, 10426.0] + - - [1024, 256, 1, 1280, 1024, 1024, 1024, 256] + - [86, 10866.0] + - - [3584, 64, 1, 3328, 3584, 3584, 3584, 64] + - [115, 11809.0] + - - [256, 1024, 1, 3328, 256, 256, 256, 1024] + - [102, 11357.0] + - - [1856, 64, 1, 3328, 1856, 1856, 1856, 64] + - [66, 9553.0] + - - [448, 256, 1, 256, 448, 448, 448, 256] + - [108, 3894.0] + - - [128, 704, 1, 256, 128, 128, 128, 704] + - [92, 3373.0] + - - [1024, 128, 1, 256, 1024, 1024, 1024, 128] + - [85, 5008.0] + - - [64, 3584, 1, 1280, 64, 64, 64, 3584] + - [68, 10389.0] + - - [3584, 64, 1, 256, 3584, 3584, 3584, 64] + - [62, 6496.0] + - - [64, 1856, 1, 3328, 64, 64, 64, 1856] + - [70, 9521.0] + - - [2944, 64, 1, 3328, 2944, 2944, 2944, 64] + - [111, 11142.0] + - - [128, 704, 1, 3328, 128, 128, 128, 704] + - [112, 7475.0] + - - [128, 1856, 1, 256, 128, 128, 128, 1856] + - [62, 6833.0] + - - [64, 4288, 1, 256, 64, 64, 64, 4288] + - [98, 7125.0] + - - [704, 256, 1, 3328, 704, 704, 704, 256] + - [81, 11005.0] + - - [256, 704, 1, 1280, 256, 256, 256, 704] + - [93, 9644.0] + - - [64, 1408, 1, 256, 64, 64, 64, 1408] + - [90, 4076.0] + - - [128, 1408, 1, 1280, 128, 128, 128, 1408] + - [100, 9850.0] + - - [128, 1856, 1, 3328, 128, 128, 128, 1856] + - [62, 11907.0] + - - [704, 256, 1, 256, 704, 704, 704, 256] + - [70, 6373.0] + - - [448, 64, 1, 1280, 448, 448, 448, 64] + - [71, 3449.0] + - - [64, 1024, 1, 1280, 64, 64, 64, 1024] + - [85, 5715.0] + - - [64, 704, 1, 1280, 64, 64, 64, 704] + - [69, 4044.0] + - - [64, 64, 1, 1280, 64, 64, 64, 64] + - [71, 417.0] + - - [128, 448, 1, 256, 128, 128, 128, 448] + - [102, 2659.0] + - - [256, 256, 1, 3328, 256, 256, 256, 256] + - [72, 6133.0] + - - [64, 448, 1, 1280, 64, 64, 64, 448] + - [89, 2827.0] + - - [64, 64, 1, 3328, 64, 64, 64, 64] + - [106, 539.0] + - - [256, 64, 1, 1280, 256, 256, 256, 64] + - [89, 1662.0] + - - [128, 448, 1, 1280, 128, 128, 128, 448] + - [67, 4772.0] + - - [704, 64, 1, 1280, 704, 704, 704, 64] + - [63, 4234.0] + - - [512, 32, 1, 512, 512, 512, 512, 32] + - [97, 1059.0] + - - [448, 64, 1, 3328, 448, 448, 448, 64] + - [89, 3545.0] + - - [64, 128, 1, 3328, 64, 64, 64, 128] + - [89, 1078.0] + - - [128, 128, 1, 3328, 128, 128, 128, 128] + - [106, 2148.0] + - - [256, 128, 1, 256, 256, 256, 256, 128] + - [67, 1407.0] + - - [64, 448, 1, 3328, 64, 64, 64, 448] + - [89, 3550.0] + - - [256, 64, 1, 256, 256, 256, 256, 64] + - [65, 701.0] + - - [256, 128, 1, 1280, 256, 256, 256, 128] + - [110, 3507.0] + - - [128, 64, 1, 1280, 128, 128, 128, 64] + - [71, 828.0] + - - [64, 1024, 1, 256, 64, 64, 64, 1024] + - [94, 3813.0] + - - [64, 704, 1, 256, 64, 64, 64, 704] + - [94, 1885.0] + - - [704, 64, 1, 3328, 704, 704, 704, 64] + - [104, 5351.0] + - - [512, 16, 1, 512, 512, 512, 512, 16] + - [87, 718.0] + - - [448, 128, 1, 256, 448, 448, 448, 128] + - [61, 2669.0] + - - [256, 256, 1, 256, 256, 256, 256, 256] + - [94, 3938.0] + - - [448, 128, 1, 3328, 448, 448, 448, 128] + - [69, 6035.0] + - - [128, 256, 1, 1280, 128, 128, 128, 256] + - [89, 3905.0] + - - [64, 256, 1, 1280, 64, 64, 64, 256] + - [89, 1649.0] + - - [1024, 32, 1, 512, 1024, 1024, 1024, 32] + - [106, 2173.0] + - - [64, 448, 1, 256, 64, 64, 64, 448] + - [110, 1215.0] + - - [64, 64, 1, 256, 64, 64, 64, 64] + - [76, 172.0] + - - [128, 256, 1, 3328, 128, 128, 128, 256] + - [71, 4042.0] + - - [64, 128, 1, 1280, 64, 64, 64, 128] + - [82, 835.0] + - - [128, 128, 1, 1280, 128, 128, 128, 128] + - [71, 1651.0] + - - [128, 256, 1, 256, 128, 128, 128, 256] + - [76, 1403.0] + - - [64, 128, 1, 256, 64, 64, 64, 128] + - [79, 343.0] + - - [704, 64, 1, 256, 704, 704, 704, 64] + - [67, 1903.0] + - - [128, 64, 1, 3328, 128, 128, 128, 64] + - [89, 1073.0] + - - [448, 64, 1, 256, 448, 448, 448, 64] + - [113, 1764.0] + - - [1024, 16, 1, 512, 1024, 1024, 1024, 16] + - [80, 1461.0] + - - [1024, 64, 1, 256, 1024, 1024, 1024, 64] + - [101, 2519.0] + - - [128, 64, 1, 256, 128, 128, 128, 64] + - [92, 348.0] + - - [1024, 64, 1, 1280, 1024, 1024, 1024, 64] + - [89, 5078.0] + - - [64, 1024, 1, 3328, 64, 64, 64, 1024] + - [105, 6242.0] + - - [448, 128, 1, 1280, 448, 448, 448, 128] + - [65, 5544.0] + - - [1024, 64, 1, 3328, 1024, 1024, 1024, 64] + - [72, 6103.0] + - - [64, 256, 1, 3328, 64, 64, 64, 256] + - [89, 2125.0] + - - [256, 256, 1, 1280, 256, 256, 256, 256] + - [68, 4900.0] + - - [256, 128, 1, 3328, 256, 256, 256, 128] + - [71, 4075.0] + - - [64, 256, 1, 256, 64, 64, 64, 256] + - [67, 697.0] + - - [64, 704, 1, 3328, 64, 64, 64, 704] + - [69, 4962.0] + - - [128, 448, 1, 3328, 128, 128, 128, 448] + - [85, 5646.0] + - - [256, 64, 1, 3328, 256, 256, 256, 64] + - [89, 2135.0] + - - [128, 128, 1, 256, 128, 128, 128, 128] + - [59, 788.0] +- null +- null +- DeviceEfficiency +... diff --git a/library/src/blas3/Tensile/Logic/asm_full/navi22_Cijk_Ailk_Bjlk_SB.yaml b/library/src/blas3/Tensile/Logic/asm_full/navi22_Cijk_Ailk_Bjlk_SB.yaml new file mode 100644 index 000000000..d76a11d84 --- /dev/null +++ b/library/src/blas3/Tensile/Logic/asm_full/navi22_Cijk_Ailk_Bjlk_SB.yaml @@ -0,0 +1,64545 @@ +--- +- {MinimumRequiredVersion: 4.28.0} +- navi22 +- gfx1031 +- [Device 73df] +- AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] +- - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 64 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 0 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x8_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 4 + LSPB: 4 + LVCA: 32 + LVCB: 32 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 1 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x8_SN_SU0_SUM0_TT8_16_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 2 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x8_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 64 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 3 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x16_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 4 + LSPB: 4 + LVCA: 32 + LVCB: 32 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 4 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x16_SN_SU0_SUM0_TT8_16_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 5 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x16_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 256 + LSPA: 8 + LSPB: 4 + LVCA: 32 + LVCB: 64 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 256 + MacroTileA: 128 + MacroTileB: 256 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 6 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x256x16_SN_SU0_SUM0_TT8_16_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 64 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 7 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x32_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 64 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 8 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x8_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 4 + LSPB: 4 + LVCA: 32 + LVCB: 32 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 9 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x8_SN_SU32_SUM3_TT8_16_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 64 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 10 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x16_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 4 + LSPB: 4 + LVCA: 32 + LVCB: 32 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 11 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x16_SN_SU32_SUM3_TT8_16_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 12 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x16_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 13 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x32_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 64 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 14 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x8_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 4 + LSPB: 4 + LVCA: 32 + LVCB: 32 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 15 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x8_SN_SU0_SUM0_TT8_16_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 64 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 16 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x16_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 4 + LSPB: 4 + LVCA: 32 + LVCB: 32 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 17 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x16_SN_SU0_SUM0_TT8_16_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 18 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x16_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 256 + LSPA: 8 + LSPB: 4 + LVCA: 32 + LVCB: 64 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 256 + MacroTileA: 128 + MacroTileB: 256 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 19 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x256x16_SN_SU0_SUM0_TT8_16_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 64 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 20 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x8_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 4 + LSPB: 4 + LVCA: 32 + LVCB: 32 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 21 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x8_SN_SU32_SUM3_TT8_16_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 22 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x8_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 4 + LSPB: 4 + LVCA: 32 + LVCB: 32 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 23 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x16_SN_SU32_SUM3_TT8_16_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 24 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x16_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 64 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 25 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x8_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 4 + LSPB: 4 + LVCA: 32 + LVCB: 32 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 26 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x8_SN_SU0_SUM0_TT8_16_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 27 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x8_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 64 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 28 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x16_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 4 + LSPB: 4 + LVCA: 32 + LVCB: 32 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 29 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x16_SN_SU0_SUM0_TT8_16_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 30 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x16_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 64 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 31 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x8_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 4 + LSPB: 4 + LVCA: 32 + LVCB: 32 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 32 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x8_SN_SU32_SUM3_TT8_16_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 33 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x8_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 64 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 34 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x16_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 4 + LSPB: 4 + LVCA: 32 + LVCB: 32 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 35 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x16_SN_SU32_SUM3_TT8_16_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 36 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x16_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 4 + LSPB: 4 + LVCA: 32 + LVCB: 32 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 37 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x8_SN_SU0_SUM0_TT8_16_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 64 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 38 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x16_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 4 + LSPB: 4 + LVCA: 32 + LVCB: 32 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 39 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x16_SN_SU0_SUM0_TT8_16_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 64 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 40 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x8_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 4 + LSPB: 4 + LVCA: 32 + LVCB: 32 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 41 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x8_SN_SU0_SUM0_TT8_16_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 64 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 42 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x16_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 4 + LSPB: 4 + LVCA: 32 + LVCB: 32 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 43 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x16_SN_SU0_SUM0_TT8_16_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 44 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x16_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 64 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 45 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x16_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 46 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x32_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 47 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x32x8_SN_SU0_SUM0_TT4_4_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 64 + LSPA: 8 + LSPB: 4 + LVCA: 8 + LVCB: 16 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 256 + LdsOffsetB_Blk: 1280 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 48 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x64x8_SN_SU0_SUM0_TT4_8_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 49 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SN_SU0_SUM0_TT8_8_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 50 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SN_SU0_SUM0_TT4_8_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 64 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 51 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x8_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 52 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x8_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 53 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SN_SU0_SUM0_TT8_8_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 54 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SN_SU0_SUM0_TT4_8_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 64 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 55 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x16_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 56 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SN_SU0_SUM0_TT4_4_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 57 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x16_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 8 + LSPB: 16 + LVCA: 16 + LVCB: 8 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 58 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x32_SN_SU0_SUM0_TT4_4_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 59 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x32_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 60 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x32x8_SN_SU32_SUM3_TT4_4_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 61 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SN_SU32_SUM3_TT8_8_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 62 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SN_SU32_SUM3_TT4_8_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 64 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 63 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x8_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 64 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x8_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 65 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SN_SU32_SUM3_TT8_8_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 64 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 66 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x16_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 67 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SN_SU32_SUM3_TT4_4_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 68 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x16_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 69 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x32x8_SN_SU0_SUM0_TT4_4_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 64 + LSPA: 8 + LSPB: 4 + LVCA: 8 + LVCB: 16 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 256 + LdsOffsetB_Blk: 1280 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 70 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x64x8_SN_SU0_SUM0_TT4_8_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 71 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SN_SU0_SUM0_TT8_8_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 72 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SN_SU0_SUM0_TT4_8_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 64 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 73 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x8_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 74 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x8_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 75 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SN_SU0_SUM0_TT8_8_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 76 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SN_SU0_SUM0_TT4_8_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 64 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 77 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x16_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 78 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SN_SU0_SUM0_TT4_4_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 79 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x16_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 8 + LSPB: 16 + LVCA: 16 + LVCB: 8 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 80 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x32_SN_SU0_SUM0_TT4_4_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 81 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x32_SN_SU0_SUM0_TT4_4_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 82 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x32_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 83 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x32x8_SN_SU32_SUM3_TT4_4_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 84 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SN_SU32_SUM3_TT8_8_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 85 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SN_SU32_SUM3_TT4_8_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 64 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 86 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x8_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 87 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SN_SU32_SUM3_TT8_8_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 88 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x16_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 89 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x32_SN_SU32_SUM3_TT8_8_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 90 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x32x8_SN_SU0_SUM0_TT4_4_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 91 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SN_SU0_SUM0_TT8_8_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 64 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 92 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x8_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 93 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x8_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 64 + LSPA: 8 + LSPB: 4 + LVCA: 8 + LVCB: 16 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 94 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x64x16_SN_SU0_SUM0_TT4_8_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 95 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SN_SU0_SUM0_TT8_8_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 64 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 96 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x16_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 97 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SN_SU0_SUM0_TT4_4_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 98 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x16_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 99 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x32_SN_SU0_SUM0_TT8_8_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 8 + LSPB: 16 + LVCA: 16 + LVCB: 8 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 100 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x32_SN_SU0_SUM0_TT4_4_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 101 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x32_SN_SU0_SUM0_TT4_4_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 102 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x32_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 64 + LSPA: 8 + LSPB: 4 + LVCA: 8 + LVCB: 16 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 256 + LdsOffsetB_Blk: 1280 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 103 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x64x8_SN_SU32_SUM3_TT4_8_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 104 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SN_SU32_SUM3_TT8_8_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 105 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SN_SU32_SUM3_TT4_8_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 64 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 106 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x8_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 107 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x8_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 108 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SN_SU32_SUM3_TT8_8_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 64 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 109 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x16_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 110 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x16_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 111 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x32_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 64 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 112 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x8_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 64 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 113 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x16_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 64 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 114 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x8_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 64 + LSPA: 8 + LSPB: 4 + LVCA: 8 + LVCB: 16 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 115 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x64x16_SN_SU32_SUM3_TT4_8_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 64 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 116 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x8_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 117 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SN_SU32_SUM3_TT8_8_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 118 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x16_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 64 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 119 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x8_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 64 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 120 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x16_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 121 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SN_SU32_SUM3_TT8_8_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 512 + LdsNumElementsAlignedA: 128 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 256 + LdsOffsetB: 128 + LdsOffsetB_Blk: 384 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 122 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT16x16x8_SN_SU0_SUM0_TT2_2_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 2 + LSPB: 2 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 123 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x32x8_SN_SU0_SUM0_TT4_4_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 896 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 124 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x16x8_SN_SU0_SUM0_TT2_2_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 2 + LSPB: 4 + LVCA: 64 + LVCB: 32 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 125 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x8_SN_SU0_SUM0_TT4_4_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 126 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x32x8_SN_SU0_SUM0_TT2_2_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 127 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SN_SU0_SUM0_TT4_4_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 128 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT16x16x16_SN_SU0_SUM0_TT2_2_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 2 + LSPB: 2 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 129 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x32x16_SN_SU0_SUM0_TT4_4_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 130 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x16x16_SN_SU0_SUM0_TT2_2_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 2 + LSPB: 4 + LVCA: 64 + LVCB: 32 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 131 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x16_SN_SU0_SUM0_TT4_4_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 132 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x32x16_SN_SU0_SUM0_TT2_2_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 133 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SN_SU0_SUM0_TT4_4_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 134 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT16x16x32_SN_SU0_SUM0_TT2_2_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 2 + LSPB: 2 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 16 + NumLoadsB: 16 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 16 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 135 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x32x32_SN_SU0_SUM0_TT4_4_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 136 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x16x32_SN_SU0_SUM0_TT2_2_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 2 + LSPB: 4 + LVCA: 64 + LVCB: 32 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 16 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 8 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 137 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x32_SN_SU0_SUM0_TT4_4_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 138 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x32x32_SN_SU0_SUM0_TT2_2_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 139 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x32_SN_SU0_SUM0_TT4_4_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 512 + LdsNumElementsAlignedA: 128 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 256 + LdsOffsetB: 128 + LdsOffsetB_Blk: 384 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 140 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT16x16x8_SN_SU32_SUM3_TT2_2_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 896 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 141 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x16x8_SN_SU32_SUM3_TT2_2_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 2 + LSPB: 4 + LVCA: 64 + LVCB: 32 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 142 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x8_SN_SU32_SUM3_TT4_4_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 143 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT16x16x16_SN_SU32_SUM3_TT2_2_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 2 + LSPB: 2 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 144 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x32x16_SN_SU32_SUM3_TT4_4_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 145 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x16x16_SN_SU32_SUM3_TT2_2_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 2 + LSPB: 4 + LVCA: 64 + LVCB: 32 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 146 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x16_SN_SU32_SUM3_TT4_4_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 147 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SN_SU32_SUM3_TT4_4_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 148 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT16x16x32_SN_SU32_SUM3_TT2_2_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 2 + LSPB: 2 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 16 + NumLoadsB: 16 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 16 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 149 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x32x32_SN_SU32_SUM3_TT4_4_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 2 + LSPB: 4 + LVCA: 64 + LVCB: 32 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 16 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 8 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 150 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x32_SN_SU32_SUM3_TT4_4_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 151 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x32x32_SN_SU32_SUM3_TT2_2_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 152 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x32_SN_SU32_SUM3_TT4_4_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 2 + LSPB: 2 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 153 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x32x8_SN_SU0_SUM0_TT4_4_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 2 + LSPB: 4 + LVCA: 64 + LVCB: 32 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 154 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x8_SN_SU0_SUM0_TT4_4_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 155 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x32x8_SN_SU0_SUM0_TT2_2_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 156 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SN_SU0_SUM0_TT4_4_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 157 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT16x16x16_SN_SU0_SUM0_TT2_2_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 2 + LSPB: 2 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 158 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x32x16_SN_SU0_SUM0_TT4_4_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 159 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x16x16_SN_SU0_SUM0_TT2_2_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 2 + LSPB: 4 + LVCA: 64 + LVCB: 32 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 160 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x16_SN_SU0_SUM0_TT4_4_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 161 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x32x16_SN_SU0_SUM0_TT2_2_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 162 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SN_SU0_SUM0_TT4_4_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 2 + LSPB: 2 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 16 + NumLoadsB: 16 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 16 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 163 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x32x32_SN_SU0_SUM0_TT4_4_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 164 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x16x32_SN_SU0_SUM0_TT2_2_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 2 + LSPB: 4 + LVCA: 64 + LVCB: 32 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 16 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 8 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 165 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x32_SN_SU0_SUM0_TT4_4_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 166 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x32x32_SN_SU0_SUM0_TT2_2_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 167 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x32_SN_SU0_SUM0_TT4_4_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 512 + LdsNumElementsAlignedA: 128 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 256 + LdsOffsetB: 128 + LdsOffsetB_Blk: 384 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 168 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT16x16x8_SN_SU32_SUM3_TT2_2_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 2 + LSPB: 2 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 169 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x32x8_SN_SU32_SUM3_TT4_4_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 2 + LSPB: 4 + LVCA: 64 + LVCB: 32 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 170 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x8_SN_SU32_SUM3_TT4_4_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 171 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT16x16x16_SN_SU32_SUM3_TT2_2_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 2 + LSPB: 2 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 172 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x32x16_SN_SU32_SUM3_TT4_4_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 2 + LSPB: 4 + LVCA: 64 + LVCB: 32 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 173 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x16_SN_SU32_SUM3_TT4_4_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 174 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x32x16_SN_SU32_SUM3_TT2_2_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 175 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SN_SU32_SUM3_TT4_4_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 176 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT16x16x32_SN_SU32_SUM3_TT2_2_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 2 + LSPB: 2 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 16 + NumLoadsB: 16 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 16 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 177 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x32x32_SN_SU32_SUM3_TT4_4_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 2 + LSPB: 4 + LVCA: 64 + LVCB: 32 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 16 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 8 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 178 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x32_SN_SU32_SUM3_TT4_4_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 179 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x32_SN_SU32_SUM3_TT4_4_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 2 + LSPB: 2 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 180 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x32x8_SN_SU0_SUM0_TT4_4_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 2 + LSPB: 4 + LVCA: 64 + LVCB: 32 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 181 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x8_SN_SU0_SUM0_TT4_4_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 182 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SN_SU0_SUM0_TT4_4_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 183 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT16x16x16_SN_SU0_SUM0_TT2_2_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 2 + LSPB: 2 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 184 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x32x16_SN_SU0_SUM0_TT4_4_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 185 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x16x16_SN_SU0_SUM0_TT2_2_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 2 + LSPB: 4 + LVCA: 64 + LVCB: 32 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 186 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x16_SN_SU0_SUM0_TT4_4_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 187 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x32x16_SN_SU0_SUM0_TT2_2_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 188 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SN_SU0_SUM0_TT4_4_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 189 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT16x16x32_SN_SU0_SUM0_TT2_2_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 2 + LSPB: 2 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 16 + NumLoadsB: 16 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 16 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 190 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x32x32_SN_SU0_SUM0_TT4_4_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 191 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x16x32_SN_SU0_SUM0_TT2_2_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 2 + LSPB: 4 + LVCA: 64 + LVCB: 32 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 16 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 8 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 192 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x32_SN_SU0_SUM0_TT4_4_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 193 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x32x32_SN_SU0_SUM0_TT2_2_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 194 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x32_SN_SU0_SUM0_TT4_4_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 512 + LdsNumElementsAlignedA: 128 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 256 + LdsOffsetB: 128 + LdsOffsetB_Blk: 384 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 195 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT16x16x8_SN_SU32_SUM3_TT2_2_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 2 + LSPB: 2 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 196 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x32x8_SN_SU32_SUM3_TT4_4_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 2 + LSPB: 4 + LVCA: 64 + LVCB: 32 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 197 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x8_SN_SU32_SUM3_TT4_4_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 2 + LSPB: 2 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 198 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x32x16_SN_SU32_SUM3_TT4_4_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 199 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x16x16_SN_SU32_SUM3_TT2_2_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 2 + LSPB: 4 + LVCA: 64 + LVCB: 32 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 200 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x16_SN_SU32_SUM3_TT4_4_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 201 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x32x16_SN_SU32_SUM3_TT2_2_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 202 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SN_SU32_SUM3_TT4_4_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 203 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT16x16x32_SN_SU32_SUM3_TT2_2_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 204 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x16x32_SN_SU32_SUM3_TT2_2_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 2 + LSPB: 4 + LVCA: 64 + LVCB: 32 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 16 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 8 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 205 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x32_SN_SU32_SUM3_TT4_4_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 206 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x32_SN_SU32_SUM3_TT4_4_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 207 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT16x16x16_SN_SU0_SUM0_TT2_2_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 2 + LSPB: 2 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 208 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x32x16_SN_SU0_SUM0_TT4_4_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 209 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT16x16x32_SN_SU0_SUM0_TT2_2_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 210 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x16x32_SN_SU0_SUM0_TT2_2_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 2 + LSPB: 4 + LVCA: 64 + LVCB: 32 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 16 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 8 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 211 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x32_SN_SU0_SUM0_TT4_4_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 2 + LSPB: 2 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 212 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x32x8_SN_SU32_SUM3_TT4_4_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 213 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT16x16x32_SN_SU32_SUM3_TT2_2_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 214 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x16x32_SN_SU32_SUM3_TT2_2_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 2 + LSPB: 2 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 215 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x32x8_SN_SU0_SUM0_TT4_4_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 216 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT16x16x16_SN_SU0_SUM0_TT2_2_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 217 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x16x32_SN_SU0_SUM0_TT2_2_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 218 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT16x16x16_SN_SU0_SUM0_TT2_2_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 219 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT16x16x32_SN_SU0_SUM0_TT2_2_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 2 + LSPB: 2 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 220 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x32x8_SN_SU32_SUM3_TT4_4_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 2 + LSPB: 4 + LVCA: 64 + LVCB: 32 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 16 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 8 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 221 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x32_SN_SU32_SUM3_TT4_4_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 8 + LSPA: 2 + LSPB: 8 + LVCA: 32 + LVCB: 8 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 832 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 64 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 222 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x8x8_SN_SU0_SUM0_TT2_2_WG16_4_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 4 + LSPA: 1 + LSPB: 16 + LVCA: 64 + LVCB: 4 + LVPA: 1 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3136 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 64 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 4 + MacroTileA: 64 + MacroTileB: 4 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 16 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 223 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x4x16_SN_SU0_SUM0_TT4_1_WG16_4_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 1] + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 2 + LSPB: 16 + LVCA: 64 + LVCB: 8 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3200 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 8 + MacroTileA: 64 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 224 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x8x16_SN_SU0_SUM0_TT2_2_WG32_4_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 32 + SubGroup1: 4 + SubGroupA: 32 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 4 + LSPA: 1 + LSPB: 16 + LVCA: 64 + LVCB: 4 + LVPA: 1 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 6272 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 4 + MacroTileA: 64 + MacroTileB: 4 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 32 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 32 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 225 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x4x32_SN_SU0_SUM0_TT4_1_WG16_4_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 1] + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 2 + LSPB: 16 + LVCA: 64 + LVCB: 8 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 6400 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 8 + MacroTileA: 64 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 16 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 226 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x8x32_SN_SU0_SUM0_TT4_1_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 1] + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 4 + LSPA: 1 + LSPB: 32 + LVCA: 128 + LVCB: 4 + LVPA: 1 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 12416 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 4 + MacroTileA: 128 + MacroTileB: 4 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 32 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 32 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 227 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x4x32_SN_SU0_SUM0_TT4_1_WG32_4_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 32 + SubGroup1: 4 + SubGroupA: 32 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 1] + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 4 + LSPA: 1 + LSPB: 16 + LVCA: 64 + LVCB: 4 + LVPA: 1 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3136 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 64 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 4 + MacroTileA: 64 + MacroTileB: 4 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 16 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 228 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x4x16_SN_SU32_SUM3_TT4_1_WG16_4_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 1] + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 4 + LSPA: 1 + LSPB: 16 + LVCA: 64 + LVCB: 4 + LVPA: 1 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 6272 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 4 + MacroTileA: 64 + MacroTileB: 4 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 32 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 32 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 229 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x4x32_SN_SU0_SUM0_TT4_1_WG16_4_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 1] + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 4 + LSPA: 1 + LSPB: 32 + LVCA: 128 + LVCB: 4 + LVPA: 1 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 12416 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 4 + MacroTileA: 128 + MacroTileB: 4 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 32 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 32 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 230 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x4x32_SN_SU0_SUM0_TT4_1_WG32_4_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 32 + SubGroup1: 4 + SubGroupA: 32 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 1] + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 8 + LSPA: 2 + LSPB: 8 + LVCA: 32 + LVCB: 8 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1664 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 231 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x8x16_SN_SU32_SUM3_TT2_2_WG16_4_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 2 + LSPB: 16 + LVCA: 64 + LVCB: 8 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 6400 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 8 + MacroTileA: 64 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 16 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 232 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x8x32_SN_SU0_SUM0_TT4_1_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 1] + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 4 + LSPA: 1 + LSPB: 16 + LVCA: 64 + LVCB: 4 + LVPA: 1 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3136 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 64 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 4 + MacroTileA: 64 + MacroTileB: 4 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 16 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 233 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x4x16_SN_SU32_SUM3_TT4_1_WG16_4_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 1] + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 2 + LSPB: 16 + LVCA: 64 + LVCB: 8 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 6400 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 8 + MacroTileA: 64 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 16 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 234 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x8x32_SN_SU32_SUM3_TT4_1_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 1] + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 32 + LSPA: 8 + LSPB: 2 + LVCA: 8 + LVCB: 32 + LVPA: 8 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 832 + LdsNumElementsAlignedA: 64 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 64 + LdsOffsetB_Blk: 576 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 8 + MacroTile1: 32 + MacroTileA: 8 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 235 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT8x32x8_SN_SU0_SUM0_TT1_4_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [1, 4] + ThreadTile0: 1 + ThreadTile1: 4 + ThreadTileA: 1 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 64 + LSPA: 8 + LSPB: 4 + LVCA: 32 + LVCB: 64 + LVPA: 8 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 256 + LdsOffsetB_Blk: 1280 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 236 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x64x8_SN_SU0_SUM0_TT2_4_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 32 + LSPA: 8 + LSPB: 2 + LVCA: 8 + LVCB: 32 + LVPA: 8 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3328 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 256 + LdsOffsetB_Blk: 2304 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 8 + MacroTile1: 32 + MacroTileA: 8 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 16 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 16 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 237 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT8x32x32_SN_SU0_SUM0_TT1_4_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [1, 4] + ThreadTile0: 1 + ThreadTile1: 4 + ThreadTileA: 1 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 32 + LSPA: 8 + LSPB: 2 + LVCA: 8 + LVCB: 32 + LVPA: 8 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 832 + LdsNumElementsAlignedA: 64 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 64 + LdsOffsetB_Blk: 576 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 8 + MacroTile1: 32 + MacroTileA: 8 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 238 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT8x32x8_SN_SU32_SUM3_TT1_4_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [1, 4] + ThreadTile0: 1 + ThreadTile1: 4 + ThreadTileA: 1 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 32 + LSPA: 8 + LSPB: 2 + LVCA: 8 + LVCB: 32 + LVPA: 8 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1664 + LdsNumElementsAlignedA: 128 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 128 + LdsOffsetB_Blk: 1152 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 8 + MacroTile1: 32 + MacroTileA: 8 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 8 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 239 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT8x32x16_SN_SU32_SUM3_TT1_4_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [1, 4] + ThreadTile0: 1 + ThreadTile1: 4 + ThreadTileA: 1 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 32 + LSPA: 8 + LSPB: 4 + LVCA: 16 + LVCB: 32 + LVPA: 8 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 256 + LdsOffsetB_Blk: 1280 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 32 + MacroTileA: 16 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 240 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT16x32x16_SN_SU32_SUM3_TT1_4_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [1, 4] + ThreadTile0: 1 + ThreadTile1: 4 + ThreadTileA: 1 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 32 + LSPA: 8 + LSPB: 2 + LVCA: 8 + LVCB: 32 + LVPA: 8 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3328 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 256 + LdsOffsetB_Blk: 2304 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 8 + MacroTile1: 32 + MacroTileA: 8 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 16 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 16 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 241 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT8x32x32_SN_SU32_SUM3_TT1_4_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [1, 4] + ThreadTile0: 1 + ThreadTile1: 4 + ThreadTileA: 1 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 32 + LSPA: 8 + LSPB: 2 + LVCA: 8 + LVCB: 32 + LVPA: 8 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 832 + LdsNumElementsAlignedA: 64 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 64 + LdsOffsetB_Blk: 576 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 8 + MacroTile1: 32 + MacroTileA: 8 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 242 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT8x32x8_SN_SU0_SUM0_TT1_4_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [1, 4] + ThreadTile0: 1 + ThreadTile1: 4 + ThreadTileA: 1 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 32 + LSPA: 8 + LSPB: 2 + LVCA: 8 + LVCB: 32 + LVPA: 8 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1664 + LdsNumElementsAlignedA: 128 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 128 + LdsOffsetB_Blk: 1152 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 8 + MacroTile1: 32 + MacroTileA: 8 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 8 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 243 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT8x32x16_SN_SU0_SUM0_TT1_4_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [1, 4] + ThreadTile0: 1 + ThreadTile1: 4 + ThreadTileA: 1 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 32 + LSPA: 8 + LSPB: 4 + LVCA: 16 + LVCB: 32 + LVPA: 8 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 256 + LdsOffsetB_Blk: 1280 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 32 + MacroTileA: 16 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 244 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT16x32x16_SN_SU0_SUM0_TT1_4_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [1, 4] + ThreadTile0: 1 + ThreadTile1: 4 + ThreadTileA: 1 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 32 + LSPA: 8 + LSPB: 2 + LVCA: 8 + LVCB: 32 + LVPA: 8 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3328 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 256 + LdsOffsetB_Blk: 2304 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 8 + MacroTile1: 32 + MacroTileA: 8 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 16 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 16 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 245 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT8x32x32_SN_SU0_SUM0_TT1_4_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [1, 4] + ThreadTile0: 1 + ThreadTile1: 4 + ThreadTileA: 1 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 32 + LSPA: 8 + LSPB: 2 + LVCA: 8 + LVCB: 32 + LVPA: 8 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 832 + LdsNumElementsAlignedA: 64 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 64 + LdsOffsetB_Blk: 576 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 8 + MacroTile1: 32 + MacroTileA: 8 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 246 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT8x32x8_SN_SU32_SUM3_TT1_4_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [1, 4] + ThreadTile0: 1 + ThreadTile1: 4 + ThreadTileA: 1 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 32 + LSPA: 8 + LSPB: 2 + LVCA: 8 + LVCB: 32 + LVPA: 8 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1664 + LdsNumElementsAlignedA: 128 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 128 + LdsOffsetB_Blk: 1152 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 8 + MacroTile1: 32 + MacroTileA: 8 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 8 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 247 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT8x32x16_SN_SU32_SUM3_TT1_4_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [1, 4] + ThreadTile0: 1 + ThreadTile1: 4 + ThreadTileA: 1 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 32 + LSPA: 8 + LSPB: 2 + LVCA: 8 + LVCB: 32 + LVPA: 8 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1664 + LdsNumElementsAlignedA: 128 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 128 + LdsOffsetB_Blk: 1152 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 8 + MacroTile1: 32 + MacroTileA: 8 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 8 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 248 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT8x32x16_SN_SU0_SUM0_TT1_4_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [1, 4] + ThreadTile0: 1 + ThreadTile1: 4 + ThreadTileA: 1 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 32 + LSPA: 8 + LSPB: 2 + LVCA: 8 + LVCB: 32 + LVPA: 8 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3328 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 256 + LdsOffsetB_Blk: 2304 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 8 + MacroTile1: 32 + MacroTileA: 8 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 16 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 16 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 249 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT8x32x32_SN_SU0_SUM0_TT1_4_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [1, 4] + ThreadTile0: 1 + ThreadTile1: 4 + ThreadTileA: 1 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 32 + LSPA: 8 + LSPB: 2 + LVCA: 8 + LVCB: 32 + LVPA: 8 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3328 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 256 + LdsOffsetB_Blk: 2304 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 8 + MacroTile1: 32 + MacroTileA: 8 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 16 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 16 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 250 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT8x32x32_SN_SU32_SUM3_TT1_4_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [1, 4] + ThreadTile0: 1 + ThreadTile1: 4 + ThreadTileA: 1 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 +- [2, 3, 0, 1] +- - - [2048, 2048, 1, 512, 2048, 2048, 2048, 2048] + - [1, 11945.0] + - - [1600, 1024, 1, 512, 1600, 1600, 1600, 1024] + - [0, 10421.0] + - - [4096, 1024, 1, 4096, 4096, 4096, 4096, 1024] + - [4, 12365.0] + - - [4096, 1024, 1, 2048, 4096, 4096, 4096, 1024] + - [1, 12293.0] + - - [3072, 768, 1, 4096, 3072, 3072, 3072, 768] + - [3, 11482.0] + - - [3072, 1024, 1, 2048, 3072, 3072, 3072, 1024] + - [15, 11927.0] + - - [3072, 1024, 1, 3072, 3072, 3072, 3072, 1024] + - [1, 11968.0] + - - [3072, 1024, 1, 512, 3072, 3072, 3072, 1024] + - [15, 11627.0] + - - [2944, 4288, 1, 1280, 2944, 2944, 2944, 4288] + - [4, 12256.0] + - - [2368, 5888, 1, 256, 2368, 2368, 2368, 5888] + - [15, 11950.0] + - - [5888, 1024, 1, 1280, 5888, 5888, 5888, 1024] + - [4, 12195.0] + - - [5888, 1856, 1, 3328, 5888, 5888, 5888, 1856] + - [16, 12151.0] + - - [5056, 704, 1, 256, 5056, 5056, 5056, 704] + - [31, 11142.0] + - - [5888, 2944, 1, 3328, 5888, 5888, 5888, 2944] + - [17, 12768.0] + - - [1856, 4288, 1, 256, 1856, 1856, 1856, 4288] + - [0, 11325.0] + - - [1024, 5056, 1, 128, 1024, 1024, 1024, 5056] + - [25, 11418.0] + - - [5056, 5056, 1, 3328, 5056, 5056, 5056, 5056] + - [29, 12492.0] + - - [1408, 5888, 1, 1280, 1408, 1408, 1408, 5888] + - [17, 12316.0] + - - [1024, 3584, 1, 3328, 1024, 1024, 1024, 3584] + - [28, 11767.0] + - - [5888, 1408, 1, 1280, 5888, 5888, 5888, 1408] + - [4, 12307.0] + - - [1024, 2368, 1, 256, 1024, 1024, 1024, 2368] + - [25, 11328.0] + - - [1408, 1856, 1, 1280, 1408, 1408, 1408, 1856] + - [25, 11811.0] + - - [5056, 5056, 1, 1280, 5056, 5056, 5056, 5056] + - [17, 12453.0] + - - [448, 5056, 1, 256, 448, 448, 448, 5056] + - [0, 9765.0] + - - [1856, 1408, 1, 128, 1856, 1856, 1856, 1408] + - [8, 10192.0] + - - [6784, 256, 1, 3328, 6784, 6784, 6784, 256] + - [28, 11379.0] + - - [6784, 4288, 1, 3328, 6784, 6784, 6784, 4288] + - [4, 12520.0] + - - [4288, 448, 1, 256, 4288, 4288, 4288, 448] + - [0, 10738.0] + - - [1856, 2368, 1, 3328, 1856, 1856, 1856, 2368] + - [16, 11598.0] + - - [4288, 2944, 1, 1280, 4288, 4288, 4288, 2944] + - [1, 12241.0] + - - [704, 5056, 1, 1280, 704, 704, 704, 5056] + - [1, 11173.0] + - - [2368, 704, 1, 3328, 2368, 2368, 2368, 704] + - [16, 10924.0] + - - [256, 5888, 1, 256, 256, 256, 256, 5888] + - [30, 10312.0] + - - [1856, 4288, 1, 3328, 1856, 1856, 1856, 4288] + - [4, 11870.0] + - - [5888, 1024, 1, 256, 5888, 5888, 5888, 1024] + - [25, 11715.0] + - - [448, 5056, 1, 3328, 448, 448, 448, 5056] + - [6, 10648.0] + - - [1408, 2944, 1, 256, 1408, 1408, 1408, 2944] + - [30, 11664.0] + - - [6784, 5056, 1, 3328, 6784, 6784, 6784, 5056] + - [4, 12671.0] + - - [5056, 5056, 1, 256, 5056, 5056, 5056, 5056] + - [26, 12048.0] + - - [1408, 6784, 1, 128, 1408, 1408, 1408, 6784] + - [25, 11649.0] + - - [704, 5056, 1, 128, 704, 704, 704, 5056] + - [8, 9970.0] + - - [2368, 2944, 1, 1280, 2368, 2368, 2368, 2944] + - [1, 12193.0] + - - [6784, 6784, 1, 1280, 6784, 6784, 6784, 6784] + - [17, 12775.0] + - - [1408, 4288, 1, 1280, 1408, 1408, 1408, 4288] + - [26, 12140.0] + - - [3584, 4288, 1, 1280, 3584, 3584, 3584, 4288] + - [17, 12448.0] + - - [2368, 704, 1, 1280, 2368, 2368, 2368, 704] + - [25, 10837.0] + - - [5056, 4288, 1, 3328, 5056, 5056, 5056, 4288] + - [17, 12462.0] + - - [3584, 2368, 1, 3328, 3584, 3584, 3584, 2368] + - [29, 12198.0] + - - [6784, 448, 1, 1280, 6784, 6784, 6784, 448] + - [25, 11643.0] + - - [1408, 2944, 1, 128, 1408, 1408, 1408, 2944] + - [30, 11332.0] + - - [4288, 2944, 1, 256, 4288, 4288, 4288, 2944] + - [1, 11933.0] + - - [5888, 704, 1, 1280, 5888, 5888, 5888, 704] + - [28, 11727.0] + - - [448, 5888, 1, 128, 448, 448, 448, 5888] + - [14, 9447.0] + - - [5056, 2368, 1, 1280, 5056, 5056, 5056, 2368] + - [29, 12190.0] + - - [448, 3584, 1, 1280, 448, 448, 448, 3584] + - [18, 9868.0] + - - [6784, 5888, 1, 256, 6784, 6784, 6784, 5888] + - [26, 12582.0] + - - [5888, 2944, 1, 128, 5888, 5888, 5888, 2944] + - [21, 11973.0] + - - [1024, 1408, 1, 256, 1024, 1024, 1024, 1408] + - [25, 10799.0] + - - [2368, 2368, 1, 3328, 2368, 2368, 2368, 2368] + - [3, 11565.0] + - - [1856, 6784, 1, 128, 1856, 1856, 1856, 6784] + - [18, 11429.0] + - - [5056, 704, 1, 3328, 5056, 5056, 5056, 704] + - [10, 11869.0] + - - [1408, 1856, 1, 256, 1408, 1408, 1408, 1856] + - [25, 11385.0] + - - [2368, 5056, 1, 256, 2368, 2368, 2368, 5056] + - [15, 11836.0] + - - [5888, 1856, 1, 256, 5888, 5888, 5888, 1856] + - [14, 11921.0] + - - [704, 5888, 1, 256, 704, 704, 704, 5888] + - [5, 10754.0] + - - [2944, 6784, 1, 3328, 2944, 2944, 2944, 6784] + - [17, 12795.0] + - - [3584, 704, 1, 3328, 3584, 3584, 3584, 704] + - [3, 11532.0] + - - [448, 4288, 1, 256, 448, 448, 448, 4288] + - [18, 9465.0] + - - [704, 2368, 1, 1280, 704, 704, 704, 2368] + - [30, 10197.0] + - - [1856, 2368, 1, 1280, 1856, 1856, 1856, 2368] + - [28, 11525.0] + - - [1856, 4288, 1, 1280, 1856, 1856, 1856, 4288] + - [15, 11821.0] + - - [704, 2944, 1, 128, 704, 704, 704, 2944] + - [0, 9394.0] + - - [1408, 1024, 1, 1280, 1408, 1408, 1408, 1024] + - [3, 11304.0] + - - [704, 6784, 1, 256, 704, 704, 704, 6784] + - [12, 10822.0] + - - [6784, 704, 1, 256, 6784, 6784, 6784, 704] + - [14, 11457.0] + - - [5056, 1408, 1, 128, 5056, 5056, 5056, 1408] + - [5, 11499.0] + - - [3584, 4288, 1, 3328, 3584, 3584, 3584, 4288] + - [17, 12491.0] + - - [5888, 1856, 1, 1280, 5888, 5888, 5888, 1856] + - [16, 12121.0] + - - [5056, 1024, 1, 3328, 5056, 5056, 5056, 1024] + - [1, 12417.0] + - - [1024, 4288, 1, 128, 1024, 1024, 1024, 4288] + - [14, 11263.0] + - - [2368, 3584, 1, 1280, 2368, 2368, 2368, 3584] + - [1, 12142.0] + - - [2368, 6784, 1, 1280, 2368, 2368, 2368, 6784] + - [15, 12247.0] + - - [2944, 3584, 1, 3328, 2944, 2944, 2944, 3584] + - [17, 12454.0] + - - [6784, 2944, 1, 256, 6784, 6784, 6784, 2944] + - [15, 12455.0] + - - [4288, 2368, 1, 3328, 4288, 4288, 4288, 2368] + - [16, 12002.0] + - - [1856, 2368, 1, 256, 1856, 1856, 1856, 2368] + - [20, 11146.0] + - - [3584, 6784, 1, 3328, 3584, 3584, 3584, 6784] + - [17, 12692.0] + - - [1024, 5888, 1, 3328, 1024, 1024, 1024, 5888] + - [4, 12272.0] + - - [5056, 4288, 1, 1280, 5056, 5056, 5056, 4288] + - [17, 12417.0] + - - [2944, 5888, 1, 128, 2944, 2944, 2944, 5888] + - [1, 12031.0] + - - [704, 5888, 1, 1280, 704, 704, 704, 5888] + - [15, 11276.0] + - - [2368, 3584, 1, 128, 2368, 2368, 2368, 3584] + - [12, 11323.0] + - - [6784, 5888, 1, 3328, 6784, 6784, 6784, 5888] + - [17, 12830.0] + - - [1024, 5056, 1, 1280, 1024, 1024, 1024, 5056] + - [15, 12291.0] + - - [4288, 1024, 1, 256, 4288, 4288, 4288, 1024] + - [30, 11218.0] + - - [2944, 2368, 1, 128, 2944, 2944, 2944, 2368] + - [25, 11586.0] + - - [5888, 448, 1, 1280, 5888, 5888, 5888, 448] + - [25, 11301.0] + - - [704, 5888, 1, 3328, 704, 704, 704, 5888] + - [26, 11349.0] + - - [6784, 2368, 1, 1280, 6784, 6784, 6784, 2368] + - [29, 12249.0] + - - [3584, 2944, 1, 256, 3584, 3584, 3584, 2944] + - [15, 12045.0] + - - [2368, 1024, 1, 3328, 2368, 2368, 2368, 1024] + - [1, 11451.0] + - - [1408, 5056, 1, 3328, 1408, 1408, 1408, 5056] + - [29, 12505.0] + - - [1856, 1856, 1, 3328, 1856, 1856, 1856, 1856] + - [16, 11499.0] + - - [2368, 2368, 1, 256, 2368, 2368, 2368, 2368] + - [25, 11250.0] + - - [4288, 4288, 1, 1280, 4288, 4288, 4288, 4288] + - [17, 12333.0] + - - [1408, 4288, 1, 256, 1408, 1408, 1408, 4288] + - [0, 11768.0] + - - [5888, 448, 1, 128, 5888, 5888, 5888, 448] + - [14, 10705.0] + - - [704, 6784, 1, 3328, 704, 704, 704, 6784] + - [4, 11460.0] + - - [5888, 5888, 1, 1280, 5888, 5888, 5888, 5888] + - [17, 12783.0] + - - [5056, 1024, 1, 1280, 5056, 5056, 5056, 1024] + - [1, 12312.0] + - - [448, 5888, 1, 3328, 448, 448, 448, 5888] + - [31, 10221.0] + - - [1024, 2944, 1, 1280, 1024, 1024, 1024, 2944] + - [25, 11646.0] + - - [5056, 5888, 1, 1280, 5056, 5056, 5056, 5888] + - [29, 12621.0] + - - [4288, 5888, 1, 128, 4288, 4288, 4288, 5888] + - [9, 11885.0] + - - [1408, 3584, 1, 128, 1408, 1408, 1408, 3584] + - [25, 11493.0] + - - [448, 3584, 1, 128, 448, 448, 448, 3584] + - [14, 8828.0] + - - [5888, 2944, 1, 1280, 5888, 5888, 5888, 2944] + - [17, 12729.0] + - - [2368, 5888, 1, 128, 2368, 2368, 2368, 5888] + - [5, 11574.0] + - - [3584, 5888, 1, 256, 3584, 3584, 3584, 5888] + - [17, 12399.0] + - - [2368, 704, 1, 128, 2368, 2368, 2368, 704] + - [0, 9595.0] + - - [3584, 2944, 1, 1280, 3584, 3584, 3584, 2944] + - [17, 12422.0] + - - [3584, 2368, 1, 128, 3584, 3584, 3584, 2368] + - [0, 11729.0] + - - [5056, 704, 1, 128, 5056, 5056, 5056, 704] + - [8, 10936.0] + - - [5056, 1408, 1, 3328, 5056, 5056, 5056, 1408] + - [17, 12503.0] + - - [6784, 1024, 1, 3328, 6784, 6784, 6784, 1024] + - [17, 12247.0] + - - [6784, 2944, 1, 3328, 6784, 6784, 6784, 2944] + - [17, 12799.0] + - - [2944, 5056, 1, 3328, 2944, 2944, 2944, 5056] + - [29, 12614.0] + - - [1856, 1856, 1, 256, 1856, 1856, 1856, 1856] + - [0, 11084.0] + - - [1024, 5888, 1, 128, 1024, 1024, 1024, 5888] + - [8, 11440.0] + - - [2048, 7133, 1, 2048, 2048, 2048, 2048, 7133] + - [17, 12628.0] + - - [4288, 5888, 1, 1280, 4288, 4288, 4288, 5888] + - [29, 12478.0] + - - [4288, 4288, 1, 256, 4288, 4288, 4288, 4288] + - [1, 12005.0] + - - [4288, 1856, 1, 1280, 4288, 4288, 4288, 1856] + - [28, 11808.0] + - - [1856, 2944, 1, 3328, 1856, 1856, 1856, 2944] + - [9, 11711.0] + - - [256, 6784, 1, 3328, 256, 256, 256, 6784] + - [28, 11398.0] + - - [256, 5056, 1, 128, 256, 256, 256, 5056] + - [20, 10078.0] + - - [5056, 1024, 1, 256, 5056, 5056, 5056, 1024] + - [24, 11823.0] + - - [5056, 1856, 1, 3328, 5056, 5056, 5056, 1856] + - [17, 12145.0] + - - [1856, 1408, 1, 256, 1856, 1856, 1856, 1408] + - [14, 10616.0] + - - [4288, 1408, 1, 128, 4288, 4288, 4288, 1408] + - [12, 11305.0] + - - [4288, 5056, 1, 256, 4288, 4288, 4288, 5056] + - [1, 12113.0] + - - [5056, 256, 1, 3328, 5056, 5056, 5056, 256] + - [12, 11896.0] + - - [1024, 5888, 1, 1280, 1024, 1024, 1024, 5888] + - [17, 12215.0] + - - [6784, 2368, 1, 128, 6784, 6784, 6784, 2368] + - [14, 11841.0] + - - [5056, 3584, 1, 256, 5056, 5056, 5056, 3584] + - [15, 12242.0] + - - [1856, 1024, 1, 1280, 1856, 1856, 1856, 1024] + - [12, 11621.0] + - - [6784, 4288, 1, 1280, 6784, 6784, 6784, 4288] + - [17, 12494.0] + - - [1856, 1856, 1, 1280, 1856, 1856, 1856, 1856] + - [28, 11409.0] + - - [6784, 2944, 1, 128, 6784, 6784, 6784, 2944] + - [26, 12093.0] + - - [1408, 5056, 1, 1280, 1408, 1408, 1408, 5056] + - [17, 12395.0] + - - [5888, 1856, 1, 128, 5888, 5888, 5888, 1856] + - [25, 11761.0] + - - [2368, 1024, 1, 128, 2368, 2368, 2368, 1024] + - [0, 10170.0] + - - [5056, 3584, 1, 128, 5056, 5056, 5056, 3584] + - [5, 11810.0] + - - [5888, 5888, 1, 3328, 5888, 5888, 5888, 5888] + - [17, 12811.0] + - - [6784, 1024, 1, 256, 6784, 6784, 6784, 1024] + - [1, 11737.0] + - - [2944, 2368, 1, 256, 2944, 2944, 2944, 2368] + - [25, 11726.0] + - - [5056, 5888, 1, 3328, 5056, 5056, 5056, 5888] + - [17, 12664.0] + - - [1856, 1024, 1, 256, 1856, 1856, 1856, 1024] + - [25, 10414.0] + - - [3584, 448, 1, 1280, 3584, 3584, 3584, 448] + - [3, 11418.0] + - - [448, 5888, 1, 256, 448, 448, 448, 5888] + - [31, 9801.0] + - - [1408, 6784, 1, 3328, 1408, 1408, 1408, 6784] + - [29, 12397.0] + - - [4288, 704, 1, 128, 4288, 4288, 4288, 704] + - [0, 10728.0] + - - [5056, 2944, 1, 256, 5056, 5056, 5056, 2944] + - [9, 12212.0] + - - [6784, 5888, 1, 128, 6784, 6784, 6784, 5888] + - [17, 12292.0] + - - [2368, 1856, 1, 256, 2368, 2368, 2368, 1856] + - [14, 11197.0] + - - [1408, 3584, 1, 3328, 1408, 1408, 1408, 3584] + - [15, 12163.0] + - - [2368, 6784, 1, 256, 2368, 2368, 2368, 6784] + - [1, 11927.0] + - - [5056, 1408, 1, 1280, 5056, 5056, 5056, 1408] + - [1, 12423.0] + - - [5056, 4288, 1, 128, 5056, 5056, 5056, 4288] + - [25, 11748.0] + - - [1408, 1856, 1, 128, 1408, 1408, 1408, 1856] + - [25, 11113.0] + - - [1408, 5888, 1, 3328, 1408, 1408, 1408, 5888] + - [4, 12389.0] + - - [6784, 6784, 1, 256, 6784, 6784, 6784, 6784] + - [29, 12582.0] + - - [4288, 2368, 1, 128, 4288, 4288, 4288, 2368] + - [0, 11561.0] + - - [1856, 4288, 1, 128, 1856, 1856, 1856, 4288] + - [25, 11155.0] + - - [2368, 2944, 1, 256, 2368, 2368, 2368, 2944] + - [1, 11758.0] + - - [3584, 1856, 1, 1280, 3584, 3584, 3584, 1856] + - [26, 12145.0] + - - [6784, 6784, 1, 128, 6784, 6784, 6784, 6784] + - [15, 12322.0] + - - [5888, 5056, 1, 256, 5888, 5888, 5888, 5056] + - [1, 12344.0] + - - [3584, 448, 1, 256, 3584, 3584, 3584, 448] + - [25, 10817.0] + - - [448, 4288, 1, 128, 448, 448, 448, 4288] + - [25, 9101.0] + - - [2944, 4288, 1, 3328, 2944, 2944, 2944, 4288] + - [17, 12313.0] + - - [256, 6784, 1, 256, 256, 256, 256, 6784] + - [14, 10750.0] + - - [1408, 4288, 1, 128, 1408, 1408, 1408, 4288] + - [31, 11565.0] + - - [2944, 704, 1, 3328, 2944, 2944, 2944, 704] + - [3, 11603.0] + - - [3584, 3584, 1, 256, 3584, 3584, 3584, 3584] + - [17, 12192.0] + - - [3584, 5056, 1, 256, 3584, 3584, 3584, 5056] + - [4, 12259.0] + - - [2944, 2368, 1, 1280, 2944, 2944, 2944, 2368] + - [29, 12170.0] + - - [1408, 3584, 1, 256, 1408, 1408, 1408, 3584] + - [0, 11765.0] + - - [6784, 3584, 1, 256, 6784, 6784, 6784, 3584] + - [15, 12428.0] + - - [5056, 2368, 1, 128, 5056, 5056, 5056, 2368] + - [20, 11531.0] + - - [2944, 2944, 1, 3328, 2944, 2944, 2944, 2944] + - [4, 12475.0] + - - [5056, 6784, 1, 256, 5056, 5056, 5056, 6784] + - [26, 12381.0] + - - [1856, 3584, 1, 128, 1856, 1856, 1856, 3584] + - [30, 10978.0] + - - [6784, 448, 1, 256, 6784, 6784, 6784, 448] + - [25, 11342.0] + - - [3584, 6784, 1, 128, 3584, 3584, 3584, 6784] + - [4, 12070.0] + - - [5056, 1856, 1, 256, 5056, 5056, 5056, 1856] + - [14, 11798.0] + - - [1024, 1856, 1, 256, 1024, 1024, 1024, 1856] + - [25, 11139.0] + - - [1408, 6784, 1, 1280, 1408, 1408, 1408, 6784] + - [15, 12364.0] + - - [3584, 3584, 1, 1280, 3584, 3584, 3584, 3584] + - [17, 12485.0] + - - [5888, 5888, 1, 128, 5888, 5888, 5888, 5888] + - [17, 12226.0] + - - [5056, 5888, 1, 128, 5056, 5056, 5056, 5888] + - [18, 11935.0] + - - [5056, 2368, 1, 3328, 5056, 5056, 5056, 2368] + - [29, 12270.0] + - - [2944, 4288, 1, 256, 2944, 2944, 2944, 4288] + - [15, 11916.0] + - - [1408, 3584, 1, 1280, 1408, 1408, 1408, 3584] + - [26, 12108.0] + - - [2368, 6784, 1, 3328, 2368, 2368, 2368, 6784] + - [29, 12301.0] + - - [1856, 1408, 1, 1280, 1856, 1856, 1856, 1408] + - [14, 11131.0] + - - [6784, 704, 1, 128, 6784, 6784, 6784, 704] + - [25, 11308.0] + - - [1408, 5888, 1, 256, 1408, 1408, 1408, 5888] + - [26, 11995.0] + - - [704, 2944, 1, 1280, 704, 704, 704, 2944] + - [5, 10978.0] + - - [1856, 2368, 1, 128, 1856, 1856, 1856, 2368] + - [0, 10932.0] + - - [4096, 7133, 1, 4096, 4096, 4096, 4096, 7133] + - [4, 12709.0] + - - [3584, 704, 1, 1280, 3584, 3584, 3584, 704] + - [25, 11439.0] + - - [2944, 6784, 1, 128, 2944, 2944, 2944, 6784] + - [1, 12111.0] + - - [3584, 448, 1, 3328, 3584, 3584, 3584, 448] + - [28, 11541.0] + - - [704, 2368, 1, 3328, 704, 704, 704, 2368] + - [36, 10369.0] + - - [256, 5888, 1, 128, 256, 256, 256, 5888] + - [25, 10018.0] + - - [2944, 2944, 1, 1280, 2944, 2944, 2944, 2944] + - [1, 12422.0] + - - [5888, 2368, 1, 256, 5888, 5888, 5888, 2368] + - [4, 11954.0] + - - [6784, 704, 1, 3328, 6784, 6784, 6784, 704] + - [16, 11800.0] + - - [5888, 4288, 1, 128, 5888, 5888, 5888, 4288] + - [25, 11942.0] + - - [1408, 2944, 1, 3328, 1408, 1408, 1408, 2944] + - [29, 12244.0] + - - [3584, 704, 1, 128, 3584, 3584, 3584, 704] + - [25, 10652.0] + - - [5056, 5056, 1, 128, 5056, 5056, 5056, 5056] + - [14, 11796.0] + - - [448, 5056, 1, 128, 448, 448, 448, 5056] + - [8, 9043.0] + - - [1408, 5056, 1, 128, 1408, 1408, 1408, 5056] + - [0, 11599.0] + - - [2944, 3584, 1, 128, 2944, 2944, 2944, 3584] + - [15, 11754.0] + - - [3584, 2368, 1, 256, 3584, 3584, 3584, 2368] + - [14, 11888.0] + - - [5888, 5056, 1, 1280, 5888, 5888, 5888, 5056] + - [4, 12626.0] + - - [2368, 5056, 1, 128, 2368, 2368, 2368, 5056] + - [0, 11364.0] + - - [3584, 3584, 1, 3328, 3584, 3584, 3584, 3584] + - [17, 12531.0] + - - [5888, 6784, 1, 256, 5888, 5888, 5888, 6784] + - [4, 12578.0] + - - [4288, 2944, 1, 3328, 4288, 4288, 4288, 2944] + - [29, 12312.0] + - - [4288, 704, 1, 1280, 4288, 4288, 4288, 704] + - [3, 11515.0] + - - [256, 5056, 1, 1280, 256, 256, 256, 5056] + - [18, 11641.0] + - - [2944, 5888, 1, 3328, 2944, 2944, 2944, 5888] + - [17, 12778.0] + - - [6784, 5888, 1, 1280, 6784, 6784, 6784, 5888] + - [17, 12807.0] + - - [5888, 4288, 1, 1280, 5888, 5888, 5888, 4288] + - [17, 12480.0] + - - [5888, 3584, 1, 128, 5888, 5888, 5888, 3584] + - [1, 12040.0] + - - [1856, 1856, 1, 128, 1856, 1856, 1856, 1856] + - [20, 10681.0] + - - [704, 3584, 1, 128, 704, 704, 704, 3584] + - [25, 9937.0] + - - [5888, 448, 1, 3328, 5888, 5888, 5888, 448] + - [3, 11378.0] + - - [2368, 4288, 1, 1280, 2368, 2368, 2368, 4288] + - [15, 11917.0] + - - [4288, 2944, 1, 128, 4288, 4288, 4288, 2944] + - [36, 11553.0] + - - [1024, 6784, 1, 3328, 1024, 1024, 1024, 6784] + - [29, 12247.0] + - - [5056, 2944, 1, 3328, 5056, 5056, 5056, 2944] + - [17, 12616.0] + - - [2944, 3584, 1, 256, 2944, 2944, 2944, 3584] + - [15, 12026.0] + - - [1408, 1408, 1, 3328, 1408, 1408, 1408, 1408] + - [16, 11124.0] + - - [3584, 3584, 1, 128, 3584, 3584, 3584, 3584] + - [17, 11846.0] + - - [3584, 704, 1, 256, 3584, 3584, 3584, 704] + - [14, 11133.0] + - - [3584, 1408, 1, 3328, 3584, 3584, 3584, 1408] + - [4, 12170.0] + - - [704, 3584, 1, 1280, 704, 704, 704, 3584] + - [0, 10734.0] + - - [2944, 6784, 1, 1280, 2944, 2944, 2944, 6784] + - [29, 12765.0] + - - [1856, 6784, 1, 256, 1856, 1856, 1856, 6784] + - [1, 11842.0] + - - [4288, 448, 1, 3328, 4288, 4288, 4288, 448] + - [3, 11576.0] + - - [6784, 4288, 1, 128, 6784, 6784, 6784, 4288] + - [25, 11973.0] + - - [6784, 704, 1, 1280, 6784, 6784, 6784, 704] + - [28, 11735.0] + - - [5888, 1024, 1, 3328, 5888, 5888, 5888, 1024] + - [17, 12276.0] + - - [704, 6784, 1, 1280, 704, 704, 704, 6784] + - [15, 11382.0] + - - [1856, 5056, 1, 3328, 1856, 1856, 1856, 5056] + - [29, 12145.0] + - - [1024, 3584, 1, 128, 1024, 1024, 1024, 3584] + - [14, 11007.0] + - - [1024, 1408, 1, 128, 1024, 1024, 1024, 1408] + - [0, 10118.0] + - - [2368, 2944, 1, 128, 2368, 2368, 2368, 2944] + - [5, 11341.0] + - - [5056, 2944, 1, 128, 5056, 5056, 5056, 2944] + - [5, 11736.0] + - - [5888, 5056, 1, 3328, 5888, 5888, 5888, 5056] + - [17, 12668.0] + - - [1408, 2368, 1, 128, 1408, 1408, 1408, 2368] + - [14, 11028.0] + - - [5888, 2368, 1, 128, 5888, 5888, 5888, 2368] + - [25, 11873.0] + - - [3584, 6784, 1, 1280, 3584, 3584, 3584, 6784] + - [17, 12671.0] + - - [3072, 7435, 1, 1024, 3072, 3072, 3072, 7435] + - [17, 12520.0] + - - [1856, 5888, 1, 256, 1856, 1856, 1856, 5888] + - [1, 11708.0] + - - [4288, 4288, 1, 3328, 4288, 4288, 4288, 4288] + - [4, 12385.0] + - - [4288, 1408, 1, 1280, 4288, 4288, 4288, 1408] + - [9, 12153.0] + - - [3584, 5056, 1, 128, 3584, 3584, 3584, 5056] + - [0, 11919.0] + - - [4288, 2368, 1, 256, 4288, 4288, 4288, 2368] + - [8, 11700.0] + - - [2944, 5056, 1, 1280, 2944, 2944, 2944, 5056] + - [17, 12580.0] + - - [448, 6784, 1, 256, 448, 448, 448, 6784] + - [30, 9916.0] + - - [6784, 2368, 1, 3328, 6784, 6784, 6784, 2368] + - [29, 12305.0] + - - [4288, 1856, 1, 3328, 4288, 4288, 4288, 1856] + - [4, 11879.0] + - - [3584, 448, 1, 128, 3584, 3584, 3584, 448] + - [14, 10124.0] + - - [3584, 1024, 1, 1280, 3584, 3584, 3584, 1024] + - [28, 11700.0] + - - [1856, 5056, 1, 256, 1856, 1856, 1856, 5056] + - [21, 11588.0] + - - [1024, 4288, 1, 256, 1024, 1024, 1024, 4288] + - [0, 11648.0] + - - [5888, 3584, 1, 3328, 5888, 5888, 5888, 3584] + - [17, 12704.0] + - - [5056, 3584, 1, 3328, 5056, 5056, 5056, 3584] + - [29, 12634.0] + - - [2368, 1408, 1, 1280, 2368, 2368, 2368, 1408] + - [0, 11562.0] + - - [5056, 2944, 1, 1280, 5056, 5056, 5056, 2944] + - [26, 12548.0] + - - [1024, 6784, 1, 256, 1024, 1024, 1024, 6784] + - [0, 11737.0] + - - [2944, 1408, 1, 128, 2944, 2944, 2944, 1408] + - [18, 11318.0] + - - [5056, 6784, 1, 3328, 5056, 5056, 5056, 6784] + - [17, 12672.0] + - - [3584, 4288, 1, 256, 3584, 3584, 3584, 4288] + - [1, 12056.0] + - - [1856, 6784, 1, 3328, 1856, 1856, 1856, 6784] + - [17, 12267.0] + - - [5888, 4288, 1, 256, 5888, 5888, 5888, 4288] + - [1, 12197.0] + - - [5056, 1408, 1, 256, 5056, 5056, 5056, 1408] + - [18, 11744.0] + - - [3584, 1024, 1, 256, 3584, 3584, 3584, 1024] + - [14, 11394.0] + - - [5888, 5888, 1, 256, 5888, 5888, 5888, 5888] + - [29, 12542.0] + - - [4288, 1024, 1, 1280, 4288, 4288, 4288, 1024] + - [9, 11907.0] + - - [448, 6784, 1, 3328, 448, 448, 448, 6784] + - [26, 10539.0] + - - [2944, 1408, 1, 1280, 2944, 2944, 2944, 1408] + - [1, 12125.0] + - - [2944, 1856, 1, 3328, 2944, 2944, 2944, 1856] + - [3, 11924.0] + - - [2944, 2944, 1, 128, 2944, 2944, 2944, 2944] + - [14, 11715.0] + - - [3584, 5888, 1, 1280, 3584, 3584, 3584, 5888] + - [17, 12664.0] + - - [6784, 1856, 1, 1280, 6784, 6784, 6784, 1856] + - [4, 12214.0] + - - [2944, 5056, 1, 256, 2944, 2944, 2944, 5056] + - [1, 12187.0] + - - [2944, 5888, 1, 1280, 2944, 2944, 2944, 5888] + - [17, 12739.0] + - - [5888, 256, 1, 3328, 5888, 5888, 5888, 256] + - [12, 11204.0] + - - [1856, 5888, 1, 3328, 1856, 1856, 1856, 5888] + - [4, 12149.0] + - - [3584, 1408, 1, 256, 3584, 3584, 3584, 1408] + - [0, 11653.0] + - - [704, 3584, 1, 3328, 704, 704, 704, 3584] + - [3, 10856.0] + - - [5056, 448, 1, 1280, 5056, 5056, 5056, 448] + - [0, 11638.0] + - - [3584, 1856, 1, 3328, 3584, 3584, 3584, 1856] + - [17, 12232.0] + - - [2944, 1024, 1, 256, 2944, 2944, 2944, 1024] + - [0, 11201.0] + - - [1024, 2368, 1, 128, 1024, 1024, 1024, 2368] + - [20, 10960.0] + - - [2368, 4288, 1, 3328, 2368, 2368, 2368, 4288] + - [4, 11981.0] + - - [1024, 1408, 1, 1280, 1024, 1024, 1024, 1408] + - [25, 11260.0] + - - [6784, 5056, 1, 256, 6784, 6784, 6784, 5056] + - [4, 12373.0] + - - [448, 6784, 1, 128, 448, 448, 448, 6784] + - [14, 9325.0] + - - [2944, 6784, 1, 256, 2944, 2944, 2944, 6784] + - [17, 12513.0] + - - [2368, 2368, 1, 1280, 2368, 2368, 2368, 2368] + - [16, 11521.0] + - - [1856, 3584, 1, 1280, 1856, 1856, 1856, 3584] + - [15, 12163.0] + - - [3584, 1408, 1, 1280, 3584, 3584, 3584, 1408] + - [26, 12108.0] + - - [4288, 448, 1, 128, 4288, 4288, 4288, 448] + - [14, 9613.0] + - - [5056, 256, 1, 1280, 5056, 5056, 5056, 256] + - [5, 11773.0] + - - [1856, 1408, 1, 3328, 1856, 1856, 1856, 1408] + - [28, 11260.0] + - - [1024, 4288, 1, 3328, 1024, 1024, 1024, 4288] + - [15, 12024.0] + - - [5056, 448, 1, 256, 5056, 5056, 5056, 448] + - [0, 11096.0] + - - [2944, 2368, 1, 3328, 2944, 2944, 2944, 2368] + - [4, 12255.0] + - - [1024, 1856, 1, 1280, 1024, 1024, 1024, 1856] + - [30, 11531.0] + - - [6784, 1856, 1, 256, 6784, 6784, 6784, 1856] + - [25, 11965.0] + - - [1024, 5888, 1, 256, 1024, 1024, 1024, 5888] + - [1, 11810.0] + - - [1408, 2368, 1, 256, 1408, 1408, 1408, 2368] + - [0, 11344.0] + - - [1408, 1408, 1, 256, 1408, 1408, 1408, 1408] + - [25, 10739.0] + - - [2368, 2368, 1, 128, 2368, 2368, 2368, 2368] + - [8, 11073.0] + - - [6784, 1408, 1, 128, 6784, 6784, 6784, 1408] + - [26, 11731.0] + - - [4288, 5888, 1, 256, 4288, 4288, 4288, 5888] + - [26, 12225.0] + - - [1408, 5056, 1, 256, 1408, 1408, 1408, 5056] + - [15, 11873.0] + - - [4288, 3584, 1, 128, 4288, 4288, 4288, 3584] + - [1, 11757.0] + - - [3584, 5056, 1, 1280, 3584, 3584, 3584, 5056] + - [4, 12579.0] + - - [1856, 1024, 1, 128, 1856, 1856, 1856, 1024] + - [0, 9754.0] + - - [704, 4288, 1, 256, 704, 704, 704, 4288] + - [14, 10203.0] + - - [5888, 2368, 1, 1280, 5888, 5888, 5888, 2368] + - [4, 12312.0] + - - [2368, 5888, 1, 1280, 2368, 2368, 2368, 5888] + - [15, 12304.0] + - - [5888, 256, 1, 1280, 5888, 5888, 5888, 256] + - [30, 11080.0] + - - [2368, 1856, 1, 3328, 2368, 2368, 2368, 1856] + - [0, 11627.0] + - - [2944, 704, 1, 256, 2944, 2944, 2944, 704] + - [25, 11170.0] + - - [704, 3584, 1, 256, 704, 704, 704, 3584] + - [25, 10442.0] + - - [704, 2944, 1, 3328, 704, 704, 704, 2944] + - [5, 11107.0] + - - [6784, 1024, 1, 128, 6784, 6784, 6784, 1024] + - [25, 11620.0] + - - [2944, 1024, 1, 3328, 2944, 2944, 2944, 1024] + - [28, 11694.0] + - - [2944, 5056, 1, 128, 2944, 2944, 2944, 5056] + - [0, 11871.0] + - - [1408, 6784, 1, 256, 1408, 1408, 1408, 6784] + - [1, 12056.0] + - - [6784, 1408, 1, 3328, 6784, 6784, 6784, 1408] + - [29, 12396.0] + - - [4288, 6784, 1, 128, 4288, 4288, 4288, 6784] + - [15, 11943.0] + - - [6784, 2944, 1, 1280, 6784, 6784, 6784, 2944] + - [17, 12766.0] + - - [4288, 1856, 1, 128, 4288, 4288, 4288, 1856] + - [0, 11377.0] + - - [1856, 2944, 1, 128, 1856, 1856, 1856, 2944] + - [14, 11070.0] + - - [6784, 448, 1, 128, 6784, 6784, 6784, 448] + - [14, 11046.0] + - - [448, 5056, 1, 1280, 448, 448, 448, 5056] + - [15, 10529.0] + - - [2368, 1856, 1, 128, 2368, 2368, 2368, 1856] + - [0, 11031.0] + - - [4288, 704, 1, 256, 4288, 4288, 4288, 704] + - [0, 11116.0] + - - [5888, 704, 1, 256, 5888, 5888, 5888, 704] + - [25, 11470.0] + - - [3584, 1024, 1, 128, 3584, 3584, 3584, 1024] + - [14, 11121.0] + - - [256, 5888, 1, 3328, 256, 256, 256, 5888] + - [5, 11214.0] + - - [1408, 4288, 1, 3328, 1408, 1408, 1408, 4288] + - [29, 12256.0] + - - [6784, 4288, 1, 256, 6784, 6784, 6784, 4288] + - [1, 12235.0] + - - [5888, 256, 1, 256, 5888, 5888, 5888, 256] + - [25, 10230.0] + - - [6784, 1024, 1, 1280, 6784, 6784, 6784, 1024] + - [15, 12181.0] + - - [5888, 1024, 1, 128, 5888, 5888, 5888, 1024] + - [25, 11570.0] + - - [6784, 3584, 1, 1280, 6784, 6784, 6784, 3584] + - [17, 12660.0] + - - [1024, 6784, 1, 1280, 1024, 1024, 1024, 6784] + - [15, 12175.0] + - - [1408, 2944, 1, 1280, 1408, 1408, 1408, 2944] + - [15, 12129.0] + - - [1408, 2368, 1, 3328, 1408, 1408, 1408, 2368] + - [28, 11708.0] + - - [2944, 1856, 1, 128, 2944, 2944, 2944, 1856] + - [31, 11372.0] + - - [256, 6784, 1, 128, 256, 256, 256, 6784] + - [25, 10447.0] + - - [5056, 6784, 1, 128, 5056, 5056, 5056, 6784] + - [32, 11987.0] + - - [4288, 5056, 1, 128, 4288, 4288, 4288, 5056] + - [14, 11696.0] + - - [1856, 5888, 1, 128, 1856, 1856, 1856, 5888] + - [18, 11423.0] + - - [2944, 5888, 1, 256, 2944, 2944, 2944, 5888] + - [17, 12475.0] + - - [3584, 1856, 1, 256, 3584, 3584, 3584, 1856] + - [25, 11759.0] + - - [4288, 3584, 1, 1280, 4288, 4288, 4288, 3584] + - [15, 12443.0] + - - [704, 4288, 1, 3328, 704, 704, 704, 4288] + - [3, 10584.0] + - - [704, 5888, 1, 128, 704, 704, 704, 5888] + - [31, 10251.0] + - - [6784, 3584, 1, 128, 6784, 6784, 6784, 3584] + - [1, 12132.0] + - - [4288, 5056, 1, 3328, 4288, 4288, 4288, 5056] + - [17, 12464.0] + - - [1408, 1408, 1, 128, 1408, 1408, 1408, 1408] + - [25, 10126.0] + - - [5056, 2368, 1, 256, 5056, 5056, 5056, 2368] + - [14, 11828.0] + - - [4288, 704, 1, 3328, 4288, 4288, 4288, 704] + - [3, 11655.0] + - - [448, 3584, 1, 256, 448, 448, 448, 3584] + - [18, 9363.0] + - - [2368, 1024, 1, 1280, 2368, 2368, 2368, 1024] + - [1, 11333.0] + - - [2944, 1408, 1, 3328, 2944, 2944, 2944, 1408] + - [29, 12251.0] + - - [1024, 1408, 1, 3328, 1024, 1024, 1024, 1408] + - [16, 11442.0] + - - [2560, 7133, 1, 2560, 2560, 2560, 2560, 7133] + - [17, 12721.0] + - - [5888, 3584, 1, 256, 5888, 5888, 5888, 3584] + - [26, 12392.0] + - - [1408, 1856, 1, 3328, 1408, 1408, 1408, 1856] + - [16, 11941.0] + - - [6784, 1408, 1, 1280, 6784, 6784, 6784, 1408] + - [29, 12338.0] + - - [704, 2944, 1, 256, 704, 704, 704, 2944] + - [0, 10271.0] + - - [704, 4288, 1, 128, 704, 704, 704, 4288] + - [0, 9908.0] + - - [2368, 4288, 1, 128, 2368, 2368, 2368, 4288] + - [0, 11385.0] + - - [1024, 6784, 1, 128, 1024, 1024, 1024, 6784] + - [31, 11503.0] + - - [1408, 1408, 1, 1280, 1408, 1408, 1408, 1408] + - [14, 11027.0] + - - [448, 4288, 1, 3328, 448, 448, 448, 4288] + - [15, 10311.0] + - - [2368, 1408, 1, 256, 2368, 2368, 2368, 1408] + - [0, 11213.0] + - - [5888, 5056, 1, 128, 5888, 5888, 5888, 5056] + - [25, 11984.0] + - - [704, 2368, 1, 256, 704, 704, 704, 2368] + - [0, 9467.0] + - - [5888, 2368, 1, 3328, 5888, 5888, 5888, 2368] + - [17, 12358.0] + - - [4288, 448, 1, 1280, 4288, 4288, 4288, 448] + - [8, 11367.0] + - - [5888, 704, 1, 3328, 5888, 5888, 5888, 704] + - [16, 11780.0] + - - [5056, 256, 1, 128, 5056, 5056, 5056, 256] + - [8, 9489.0] + - - [1408, 5888, 1, 128, 1408, 1408, 1408, 5888] + - [0, 11669.0] + - - [1408, 1024, 1, 256, 1408, 1408, 1408, 1024] + - [25, 10837.0] + - - [1024, 1856, 1, 128, 1024, 1024, 1024, 1856] + - [14, 10568.0] + - - [5056, 6784, 1, 1280, 5056, 5056, 5056, 6784] + - [29, 12635.0] + - - [704, 5056, 1, 3328, 704, 704, 704, 5056] + - [15, 11322.0] + - - [3584, 5056, 1, 3328, 3584, 3584, 3584, 5056] + - [17, 12630.0] + - - [2368, 2944, 1, 3328, 2368, 2368, 2368, 2944] + - [1, 12257.0] + - - [2368, 3584, 1, 256, 2368, 2368, 2368, 3584] + - [26, 11719.0] + - - [5056, 3584, 1, 1280, 5056, 5056, 5056, 3584] + - [15, 12579.0] + - - [1856, 2944, 1, 1280, 1856, 1856, 1856, 2944] + - [26, 11628.0] + - - [3584, 2368, 1, 1280, 3584, 3584, 3584, 2368] + - [1, 12125.0] + - - [2944, 1408, 1, 256, 2944, 2944, 2944, 1408] + - [18, 11689.0] + - - [4288, 1408, 1, 3328, 4288, 4288, 4288, 1408] + - [4, 12254.0] + - - [2944, 1024, 1, 128, 2944, 2944, 2944, 1024] + - [14, 10833.0] + - - [4288, 5056, 1, 1280, 4288, 4288, 4288, 5056] + - [17, 12420.0] + - - [5888, 6784, 1, 1280, 5888, 5888, 5888, 6784] + - [17, 12800.0] + - - [6784, 5056, 1, 128, 6784, 6784, 6784, 5056] + - [1, 12018.0] + - - [5888, 1408, 1, 3328, 5888, 5888, 5888, 1408] + - [17, 12390.0] + - - [256, 5056, 1, 256, 256, 256, 256, 5056] + - [14, 10758.0] + - - [448, 3584, 1, 3328, 448, 448, 448, 3584] + - [6, 10018.0] + - - [704, 2368, 1, 128, 704, 704, 704, 2368] + - [25, 9158.0] + - - [5888, 256, 1, 128, 5888, 5888, 5888, 256] + - [18, 9986.0] + - - [3584, 1856, 1, 128, 3584, 3584, 3584, 1856] + - [0, 11594.0] + - - [4288, 4288, 1, 128, 4288, 4288, 4288, 4288] + - [0, 11727.0] + - - [1856, 1024, 1, 3328, 1856, 1856, 1856, 1024] + - [6, 11815.0] + - - [1024, 5056, 1, 256, 1024, 1024, 1024, 5056] + - [18, 11657.0] + - - [2368, 1408, 1, 3328, 2368, 2368, 2368, 1408] + - [10, 11665.0] + - - [5888, 448, 1, 256, 5888, 5888, 5888, 448] + - [25, 11009.0] + - - [5888, 6784, 1, 128, 5888, 5888, 5888, 6784] + - [1, 12268.0] + - - [6784, 5056, 1, 1280, 6784, 6784, 6784, 5056] + - [4, 12641.0] + - - [5056, 704, 1, 1280, 5056, 5056, 5056, 704] + - [0, 11768.0] + - - [4288, 6784, 1, 1280, 4288, 4288, 4288, 6784] + - [29, 12493.0] + - - [6784, 1408, 1, 256, 6784, 6784, 6784, 1408] + - [1, 11961.0] + - - [3584, 5888, 1, 128, 3584, 3584, 3584, 5888] + - [23, 12074.0] + - - [5056, 5888, 1, 256, 5056, 5056, 5056, 5888] + - [26, 12370.0] + - - [2368, 1024, 1, 256, 2368, 2368, 2368, 1024] + - [0, 10622.0] + - - [2944, 1856, 1, 256, 2944, 2944, 2944, 1856] + - [0, 11639.0] + - - [1856, 6784, 1, 1280, 1856, 1856, 1856, 6784] + - [15, 12219.0] + - - [4288, 3584, 1, 256, 4288, 4288, 4288, 3584] + - [26, 12114.0] + - - [5056, 1856, 1, 1280, 5056, 5056, 5056, 1856] + - [1, 12065.0] + - - [1408, 1024, 1, 3328, 1408, 1408, 1408, 1024] + - [16, 11486.0] + - - [5888, 3584, 1, 1280, 5888, 5888, 5888, 3584] + - [17, 12673.0] + - - [1856, 3584, 1, 3328, 1856, 1856, 1856, 3584] + - [26, 12241.0] + - - [1024, 2944, 1, 256, 1024, 1024, 1024, 2944] + - [25, 11273.0] + - - [448, 6784, 1, 1280, 448, 448, 448, 6784] + - [1, 10425.0] + - - [704, 5056, 1, 256, 704, 704, 704, 5056] + - [14, 10549.0] + - - [3584, 1024, 1, 3328, 3584, 3584, 3584, 1024] + - [28, 11783.0] + - - [2944, 1856, 1, 1280, 2944, 2944, 2944, 1856] + - [16, 11851.0] + - - [5056, 256, 1, 256, 5056, 5056, 5056, 256] + - [12, 10886.0] + - - [2368, 3584, 1, 3328, 2368, 2368, 2368, 3584] + - [17, 12191.0] + - - [2944, 704, 1, 1280, 2944, 2944, 2944, 704] + - [14, 11471.0] + - - [2944, 3584, 1, 1280, 2944, 2944, 2944, 3584] + - [17, 12417.0] + - - [1856, 5888, 1, 1280, 1856, 1856, 1856, 5888] + - [21, 12094.0] + - - [5056, 448, 1, 3328, 5056, 5056, 5056, 448] + - [3, 11746.0] + - - [4288, 1408, 1, 256, 4288, 4288, 4288, 1408] + - [1, 11720.0] + - - [5888, 1408, 1, 128, 5888, 5888, 5888, 1408] + - [36, 11587.0] + - - [4288, 2368, 1, 1280, 4288, 4288, 4288, 2368] + - [16, 11974.0] + - - [6784, 2368, 1, 256, 6784, 6784, 6784, 2368] + - [1, 11956.0] + - - [4288, 1856, 1, 256, 4288, 4288, 4288, 1856] + - [0, 11545.0] + - - [1856, 2944, 1, 256, 1856, 1856, 1856, 2944] + - [0, 11243.0] + - - [5056, 1024, 1, 128, 5056, 5056, 5056, 1024] + - [12, 11410.0] + - - [1760, 7133, 1, 1760, 1760, 1760, 1760, 7133] + - [29, 12195.0] + - - [6784, 256, 1, 128, 6784, 6784, 6784, 256] + - [25, 10339.0] + - - [5888, 704, 1, 128, 5888, 5888, 5888, 704] + - [0, 11161.0] + - - [1024, 4288, 1, 1280, 1024, 1024, 1024, 4288] + - [28, 11952.0] + - - [2368, 5056, 1, 3328, 2368, 2368, 2368, 5056] + - [17, 12265.0] + - - [4288, 1024, 1, 3328, 4288, 4288, 4288, 1024] + - [1, 12021.0] + - - [1024, 5056, 1, 3328, 1024, 1024, 1024, 5056] + - [15, 12418.0] + - - [1024, 1856, 1, 3328, 1024, 1024, 1024, 1856] + - [30, 11760.0] + - - [704, 6784, 1, 128, 704, 704, 704, 6784] + - [25, 10439.0] + - - [4288, 6784, 1, 256, 4288, 4288, 4288, 6784] + - [26, 12246.0] + - - [3584, 2944, 1, 3328, 3584, 3584, 3584, 2944] + - [17, 12458.0] + - - [5888, 2944, 1, 256, 5888, 5888, 5888, 2944] + - [15, 12395.0] + - - [2368, 6784, 1, 128, 2368, 2368, 2368, 6784] + - [15, 11632.0] + - - [448, 4288, 1, 1280, 448, 448, 448, 4288] + - [15, 10158.0] + - - [5056, 4288, 1, 256, 5056, 5056, 5056, 4288] + - [4, 12074.0] + - - [1024, 3584, 1, 256, 1024, 1024, 1024, 3584] + - [14, 11339.0] + - - [1856, 5056, 1, 128, 1856, 1856, 1856, 5056] + - [8, 11251.0] + - - [6784, 6784, 1, 3328, 6784, 6784, 6784, 6784] + - [17, 12803.0] + - - [448, 5888, 1, 1280, 448, 448, 448, 5888] + - [14, 10105.0] + - - [5056, 448, 1, 128, 5056, 5056, 5056, 448] + - [0, 10613.0] + - - [3584, 2944, 1, 128, 3584, 3584, 3584, 2944] + - [23, 11730.0] + - - [6784, 256, 1, 1280, 6784, 6784, 6784, 256] + - [25, 11309.0] + - - [2368, 5888, 1, 3328, 2368, 2368, 2368, 5888] + - [17, 12354.0] + - - [2368, 1856, 1, 1280, 2368, 2368, 2368, 1856] + - [14, 11515.0] + - - [3584, 4288, 1, 128, 3584, 3584, 3584, 4288] + - [25, 11827.0] + - - [5888, 4288, 1, 3328, 5888, 5888, 5888, 4288] + - [17, 12513.0] + - - [2368, 704, 1, 256, 2368, 2368, 2368, 704] + - [25, 10259.0] + - - [3584, 1408, 1, 128, 3584, 3584, 3584, 1408] + - [25, 11465.0] + - - [1856, 5056, 1, 1280, 1856, 1856, 1856, 5056] + - [17, 12087.0] + - - [2944, 1024, 1, 1280, 2944, 2944, 2944, 1024] + - [0, 11550.0] + - - [3584, 5888, 1, 3328, 3584, 3584, 3584, 5888] + - [17, 12702.0] + - - [2368, 4288, 1, 256, 2368, 2368, 2368, 4288] + - [0, 11593.0] + - - [1024, 2368, 1, 3328, 1024, 1024, 1024, 2368] + - [3, 11808.0] + - - [6784, 1856, 1, 3328, 6784, 6784, 6784, 1856] + - [4, 12265.0] + - - [1024, 2944, 1, 128, 1024, 1024, 1024, 2944] + - [14, 10737.0] + - - [1024, 3584, 1, 1280, 1024, 1024, 1024, 3584] + - [14, 11685.0] + - - [4288, 5888, 1, 3328, 4288, 4288, 4288, 5888] + - [29, 12508.0] + - - [1024, 2944, 1, 3328, 1024, 1024, 1024, 2944] + - [3, 11656.0] + - - [3584, 6784, 1, 256, 3584, 3584, 3584, 6784] + - [15, 12416.0] + - - [256, 6784, 1, 1280, 256, 256, 256, 6784] + - [25, 11345.0] + - - [1856, 3584, 1, 256, 1856, 1856, 1856, 3584] + - [30, 11709.0] + - - [6784, 1856, 1, 128, 6784, 6784, 6784, 1856] + - [25, 11896.0] + - - [2944, 704, 1, 128, 2944, 2944, 2944, 704] + - [25, 10855.0] + - - [256, 5888, 1, 1280, 256, 256, 256, 5888] + - [18, 11149.0] + - - [4288, 6784, 1, 3328, 4288, 4288, 4288, 6784] + - [4, 12526.0] + - - [7680, 5481, 1, 2560, 7680, 7680, 7680, 5481] + - [17, 12776.0] + - - [2368, 1408, 1, 128, 2368, 2368, 2368, 1408] + - [0, 10465.0] + - - [1408, 1024, 1, 128, 1408, 1408, 1408, 1024] + - [25, 10207.0] + - - [6784, 3584, 1, 3328, 6784, 6784, 6784, 3584] + - [17, 12691.0] + - - [2368, 5056, 1, 1280, 2368, 2368, 2368, 5056] + - [15, 12202.0] + - - [1408, 2368, 1, 1280, 1408, 1408, 1408, 2368] + - [25, 11598.0] + - - [2944, 4288, 1, 128, 2944, 2944, 2944, 4288] + - [31, 11767.0] + - - [2944, 2944, 1, 256, 2944, 2944, 2944, 2944] + - [1, 12088.0] + - - [6784, 256, 1, 256, 6784, 6784, 6784, 256] + - [14, 10881.0] + - - [256, 5056, 1, 3328, 256, 256, 256, 5056] + - [5, 11852.0] + - - [5056, 1856, 1, 128, 5056, 5056, 5056, 1856] + - [8, 11565.0] + - - [5888, 1408, 1, 256, 5888, 5888, 5888, 1408] + - [9, 11978.0] + - - [4288, 3584, 1, 3328, 4288, 4288, 4288, 3584] + - [4, 12494.0] + - - [1024, 2368, 1, 1280, 1024, 1024, 1024, 2368] + - [25, 11664.0] + - - [5888, 6784, 1, 3328, 5888, 5888, 5888, 6784] + - [4, 12831.0] + - - [704, 4288, 1, 1280, 704, 704, 704, 4288] + - [31, 10552.0] + - - [6784, 448, 1, 3328, 6784, 6784, 6784, 448] + - [3, 11734.0] + - - [4288, 1024, 1, 128, 4288, 4288, 4288, 1024] + - [0, 10842.0] + - - [196, 256, 256, 1024, 196, 196, 196, 256] + - [4, 9549.0] + - - [784, 512, 256, 128, 784, 784, 784, 512] + - [4, 10718.0] + - - [784, 128, 128, 512, 784, 784, 784, 128] + - [29, 10870.0] + - - [3136, 256, 256, 64, 3136, 3136, 3136, 256] + - [34, 8330.0] + - - [784, 128, 256, 512, 784, 784, 784, 128] + - [4, 11013.0] + - - [196, 256, 128, 1024, 196, 196, 196, 256] + - [17, 9415.0] + - - [3136, 256, 128, 64, 3136, 3136, 3136, 256] + - [0, 9104.0] + - - [784, 512, 128, 128, 784, 784, 784, 512] + - [4, 10606.0] + - - [196, 1024, 128, 256, 196, 196, 196, 1024] + - [17, 9296.0] + - - [196, 1024, 256, 256, 196, 196, 196, 1024] + - [17, 9386.0] + - - [5329, 160, 64, 64, 5329, 5329, 5329, 160] + - [28, 7084.0] + - - [1225, 384, 64, 192, 1225, 1225, 1225, 384] + - [1, 11562.0] + - - [289, 1024, 64, 256, 289, 289, 289, 1024] + - [26, 9210.0] + - - [1225, 384, 64, 64, 1225, 1225, 1225, 384] + - [0, 10346.0] + - - [1225, 384, 64, 96, 1225, 1225, 1225, 384] + - [3, 10381.0] + - - [289, 1024, 64, 384, 289, 289, 289, 1024] + - [26, 9357.0] + - - [289, 1024, 64, 192, 289, 289, 289, 1024] + - [30, 9088.0] + - - [289, 1024, 64, 128, 289, 289, 289, 1024] + - [14, 8858.0] + - - [4096, 1024, 1, 2984, 4096, 4096, 4096, 1024] + - [15, 12357.0] + - - [1024, 4096, 1, 3437, 1024, 1024, 1024, 4096] + - [23, 12366.0] + - - [1024, 4096, 1, 3235, 1024, 1024, 1024, 4096] + - [29, 12376.0] + - - [4096, 1024, 1, 4032, 4096, 4096, 4096, 1024] + - [17, 12380.0] + - - [1024, 4096, 1, 3334, 1024, 1024, 1024, 4096] + - [4, 12362.0] + - - [4096, 1024, 1, 3288, 4096, 4096, 4096, 1024] + - [17, 12360.0] + - - [1024, 4096, 1, 3515, 1024, 1024, 1024, 4096] + - [4, 12365.0] + - - [4096, 1024, 1, 3437, 4096, 4096, 4096, 1024] + - [17, 12362.0] + - - [1024, 4096, 1, 3259, 1024, 1024, 1024, 4096] + - [15, 12360.0] + - - [1024, 4096, 1, 3384, 1024, 1024, 1024, 4096] + - [11, 12371.0] + - - [4096, 1024, 1, 3458, 4096, 4096, 4096, 1024] + - [17, 12365.0] + - - [1024, 4096, 1, 3412, 1024, 1024, 1024, 4096] + - [4, 12365.0] + - - [1024, 4096, 1, 3529, 1024, 1024, 1024, 4096] + - [23, 12368.0] + - - [1024, 4096, 1, 4032, 1024, 1024, 1024, 4096] + - [17, 12380.0] + - - [4096, 1024, 1, 3999, 4096, 4096, 4096, 1024] + - [17, 12375.0] + - - [1024, 4096, 1, 3079, 1024, 1024, 1024, 4096] + - [15, 12355.0] + - - [1024, 4096, 1, 3876, 1024, 1024, 1024, 4096] + - [4, 12365.0] + - - [1024, 4096, 1, 3450, 1024, 1024, 1024, 4096] + - [17, 12359.0] + - - [1024, 4096, 1, 3256, 1024, 1024, 1024, 4096] + - [1, 12359.0] + - - [4096, 1024, 1, 3403, 4096, 4096, 4096, 1024] + - [17, 12358.0] + - - [1024, 4096, 1, 3359, 1024, 1024, 1024, 4096] + - [15, 12364.0] + - - [4096, 1024, 1, 3549, 4096, 4096, 4096, 1024] + - [4, 12355.0] + - - [4096, 1024, 1, 3176, 4096, 4096, 4096, 1024] + - [15, 12353.0] + - - [1024, 4096, 1, 3504, 1024, 1024, 1024, 4096] + - [15, 12366.0] + - - [4096, 1024, 1, 3314, 4096, 4096, 4096, 1024] + - [17, 12357.0] + - - [4096, 1024, 1, 3183, 4096, 4096, 4096, 1024] + - [17, 12354.0] + - - [1024, 4096, 1, 3209, 1024, 1024, 1024, 4096] + - [15, 12355.0] + - - [1024, 4096, 1, 3720, 1024, 1024, 1024, 4096] + - [15, 12367.0] + - - [1024, 4096, 1, 3859, 1024, 1024, 1024, 4096] + - [4, 12368.0] + - - [1024, 33708, 1, 4059, 1024, 1024, 1024, 33708] + - [4, 12746.0] + - - [4096, 1024, 1, 3477, 4096, 4096, 4096, 1024] + - [17, 12360.0] + - - [4096, 1024, 1, 3233, 4096, 4096, 4096, 1024] + - [29, 12371.0] + - - [4096, 1024, 1, 3409, 4096, 4096, 4096, 1024] + - [35, 12363.0] + - - [4096, 1024, 1, 3564, 4096, 4096, 4096, 1024] + - [17, 12370.0] + - - [4096, 1024, 1, 3190, 4096, 4096, 4096, 1024] + - [17, 12358.0] + - - [1024, 4096, 1, 3288, 1024, 1024, 1024, 4096] + - [15, 12369.0] + - - [4096, 1024, 1, 3451, 4096, 4096, 4096, 1024] + - [4, 12357.0] + - - [1024, 4096, 1, 3348, 1024, 1024, 1024, 4096] + - [4, 12357.0] + - - [1024, 4096, 1, 3465, 1024, 1024, 1024, 4096] + - [23, 12361.0] + - - [1024, 33708, 1, 4032, 1024, 1024, 1024, 33708] + - [29, 12753.0] + - - [1024, 33708, 1, 3840, 1024, 1024, 1024, 33708] + - [17, 12749.0] + - - [4096, 1024, 1, 3391, 4096, 4096, 4096, 1024] + - [17, 12357.0] + - - [1024, 4096, 1, 3530, 1024, 1024, 1024, 4096] + - [15, 12364.0] + - - [4096, 1024, 1, 3209, 4096, 4096, 4096, 1024] + - [17, 12352.0] + - - [1024, 4096, 1, 3457, 1024, 1024, 1024, 4096] + - [15, 12360.0] + - - [1024, 4096, 1, 3386, 1024, 1024, 1024, 4096] + - [4, 12374.0] + - - [4096, 1024, 1, 3350, 4096, 4096, 4096, 1024] + - [17, 12361.0] + - - [1024, 4096, 1, 3184, 1024, 1024, 1024, 4096] + - [15, 12361.0] + - - [1024, 4096, 1, 3093, 1024, 1024, 1024, 4096] + - [17, 12374.0] + - - [1024, 4096, 1, 3400, 1024, 1024, 1024, 4096] + - [29, 12371.0] + - - [1024, 4096, 1, 3214, 1024, 1024, 1024, 4096] + - [17, 12356.0] + - - [4096, 1024, 1, 3406, 4096, 4096, 4096, 1024] + - [17, 12356.0] + - - [1024, 4096, 1, 3565, 1024, 1024, 1024, 4096] + - [17, 12362.0] + - - [4096, 1024, 1, 3536, 4096, 4096, 4096, 1024] + - [17, 12367.0] + - - [1024, 4096, 1, 3183, 1024, 1024, 1024, 4096] + - [17, 12352.0] + - - [1024, 4096, 1, 3462, 1024, 1024, 1024, 4096] + - [15, 12368.0] + - - [4096, 1024, 1, 3130, 4096, 4096, 4096, 1024] + - [17, 12351.0] + - - [4096, 1024, 1, 3381, 4096, 4096, 4096, 1024] + - [23, 12371.0] + - - [4096, 1024, 1, 3298, 4096, 4096, 4096, 1024] + - [29, 12378.0] + - - [1024, 4096, 1, 3292, 1024, 1024, 1024, 4096] + - [4, 12354.0] + - - [4096, 1024, 1, 3289, 4096, 4096, 4096, 1024] + - [15, 12353.0] + - - [1024, 4096, 1, 3379, 1024, 1024, 1024, 4096] + - [15, 12359.0] + - - [1024, 4096, 1, 3990, 1024, 1024, 1024, 4096] + - [17, 12379.0] + - - [1024, 4096, 1, 3540, 1024, 1024, 1024, 4096] + - [15, 12367.0] + - - [4096, 1024, 1, 3412, 4096, 4096, 4096, 1024] + - [17, 12361.0] + - - [1024, 4096, 1, 3555, 1024, 1024, 1024, 4096] + - [17, 12356.0] + - - [1024, 4096, 1, 3518, 1024, 1024, 1024, 4096] + - [15, 12372.0] + - - [4096, 1024, 1, 3189, 4096, 4096, 4096, 1024] + - [17, 12354.0] + - - [1024, 4096, 1, 3298, 1024, 1024, 1024, 4096] + - [17, 12362.0] + - - [4096, 1024, 1, 3072, 4096, 4096, 4096, 1024] + - [17, 12357.0] + - - [1024, 4096, 1, 3393, 1024, 1024, 1024, 4096] + - [17, 12364.0] + - - [1024, 4096, 1, 3207, 1024, 1024, 1024, 4096] + - [1, 12357.0] + - - [4096, 1024, 1, 3487, 4096, 4096, 4096, 1024] + - [4, 12358.0] + - - [4096, 1024, 1, 3431, 4096, 4096, 4096, 1024] + - [4, 12358.0] + - - [4096, 1024, 1, 3378, 4096, 4096, 4096, 1024] + - [15, 12360.0] + - - [4096, 1024, 1, 3529, 4096, 4096, 4096, 1024] + - [17, 12382.0] + - - [4096, 1024, 1, 3460, 4096, 4096, 4096, 1024] + - [17, 12362.0] + - - [1024, 4096, 1, 3336, 1024, 1024, 1024, 4096] + - [17, 12361.0] + - - [1024, 4096, 1, 3501, 1024, 1024, 1024, 4096] + - [4, 12356.0] + - - [1024, 4096, 1, 3584, 1024, 1024, 1024, 4096] + - [17, 12369.0] + - - [4096, 1024, 1, 2499, 4096, 4096, 4096, 1024] + - [17, 12341.0] + - - [4096, 1024, 1, 3352, 4096, 4096, 4096, 1024] + - [4, 12367.0] + - - [1024, 4096, 1, 3543, 1024, 1024, 1024, 4096] + - [17, 12364.0] + - - [1024, 4096, 1, 3476, 1024, 1024, 1024, 4096] + - [17, 12357.0] + - - [1024, 33708, 1, 3822, 1024, 1024, 1024, 33708] + - [17, 12747.0] + - - [1024, 4096, 1, 3436, 1024, 1024, 1024, 4096] + - [15, 12359.0] + - - [1024, 4096, 1, 3594, 1024, 1024, 1024, 4096] + - [4, 12371.0] + - - [4096, 1024, 1, 3514, 4096, 4096, 4096, 1024] + - [4, 12359.0] + - - [1024, 4096, 1, 3064, 1024, 1024, 1024, 4096] + - [15, 12356.0] + - - [4096, 1024, 1, 3371, 4096, 4096, 4096, 1024] + - [4, 12365.0] + - - [4096, 1024, 1, 3558, 4096, 4096, 4096, 1024] + - [11, 12363.0] + - - [4096, 1024, 1, 3517, 4096, 4096, 4096, 1024] + - [17, 12365.0] + - - [4096, 1024, 1, 3144, 4096, 4096, 4096, 1024] + - [17, 12378.0] + - - [1024, 4096, 1, 3312, 1024, 1024, 1024, 4096] + - [4, 12358.0] + - - [4096, 1024, 1, 3079, 4096, 4096, 4096, 1024] + - [17, 12356.0] + - - [1024, 4096, 1, 3415, 1024, 1024, 1024, 4096] + - [17, 12359.0] + - - [1024, 4096, 1, 3221, 1024, 1024, 1024, 4096] + - [17, 12358.0] + - - [1024, 4096, 1, 3978, 1024, 1024, 1024, 4096] + - [4, 12380.0] + - - [4096, 1024, 1, 3876, 4096, 4096, 4096, 1024] + - [17, 12366.0] + - - [1024, 4096, 1, 3528, 1024, 1024, 1024, 4096] + - [17, 12362.0] + - - [1024, 4096, 1, 3181, 1024, 1024, 1024, 4096] + - [17, 12357.0] + - - [4096, 1024, 1, 3445, 4096, 4096, 4096, 1024] + - [29, 12357.0] + - - [4096, 1024, 1, 3450, 4096, 4096, 4096, 1024] + - [17, 12380.0] + - - [4096, 1024, 1, 3377, 4096, 4096, 4096, 1024] + - [15, 12350.0] + - - [1024, 4096, 1, 3532, 1024, 1024, 1024, 4096] + - [17, 12361.0] + - - [1024, 33708, 1, 3944, 1024, 1024, 1024, 33708] + - [17, 12749.0] + - - [4096, 1024, 1, 3483, 4096, 4096, 4096, 1024] + - [17, 12364.0] + - - [1024, 4096, 1, 3358, 1024, 1024, 1024, 4096] + - [4, 12354.0] + - - [4096, 1024, 1, 3464, 4096, 4096, 4096, 1024] + - [17, 12363.0] + - - [4096, 1024, 1, 3282, 4096, 4096, 4096, 1024] + - [4, 12353.0] + - - [4096, 1024, 1, 3256, 4096, 4096, 4096, 1024] + - [1, 12365.0] + - - [1024, 4096, 1, 3057, 1024, 1024, 1024, 4096] + - [15, 12351.0] + - - [4096, 1024, 1, 3481, 4096, 4096, 4096, 1024] + - [15, 12359.0] + - - [4096, 1024, 1, 3340, 4096, 4096, 4096, 1024] + - [17, 12355.0] + - - [1024, 4096, 1, 3273, 1024, 1024, 1024, 4096] + - [15, 12357.0] + - - [4096, 1024, 1, 3392, 4096, 4096, 4096, 1024] + - [17, 12365.0] + - - [4096, 1024, 1, 3337, 4096, 4096, 4096, 1024] + - [1, 12352.0] + - - [4096, 1024, 1, 3359, 4096, 4096, 4096, 1024] + - [1, 12354.0] + - - [4096, 1024, 1, 3498, 4096, 4096, 4096, 1024] + - [17, 12361.0] + - - [4096, 1024, 1, 3169, 4096, 4096, 4096, 1024] + - [17, 12354.0] + - - [1024, 33708, 1, 3859, 1024, 1024, 1024, 33708] + - [4, 12746.0] + - - [1024, 4096, 1, 3103, 1024, 1024, 1024, 4096] + - [4, 12349.0] + - - [4096, 1024, 1, 3900, 4096, 4096, 4096, 1024] + - [17, 12367.0] + - - [1024, 4096, 1, 3442, 1024, 1024, 1024, 4096] + - [15, 12365.0] + - - [1024, 4096, 1, 3248, 1024, 1024, 1024, 4096] + - [17, 12364.0] + - - [1024, 4096, 1, 3351, 1024, 1024, 1024, 4096] + - [17, 12359.0] + - - [4096, 1024, 1, 3593, 4096, 4096, 4096, 1024] + - [29, 12359.0] + - - [1024, 4096, 1, 3780, 1024, 1024, 1024, 4096] + - [17, 12367.0] + - - [1024, 33708, 1, 3681, 1024, 1024, 1024, 33708] + - [4, 12747.0] + - - [4096, 1024, 1, 3374, 4096, 4096, 4096, 1024] + - [4, 12356.0] + - - [1024, 4096, 1, 3557, 1024, 1024, 1024, 4096] + - [17, 12361.0] + - - [4096, 1024, 1, 3906, 4096, 4096, 4096, 1024] + - [17, 12367.0] + - - [4096, 1024, 1, 3504, 4096, 4096, 4096, 1024] + - [4, 12362.0] + - - [1024, 4096, 1, 3270, 1024, 1024, 1024, 4096] + - [4, 12351.0] + - - [4096, 1024, 1, 3098, 4096, 4096, 4096, 1024] + - [4, 12351.0] + - - [4096, 1024, 1, 3216, 4096, 4096, 4096, 1024] + - [17, 12364.0] + - - [1024, 4096, 1, 3550, 1024, 1024, 1024, 4096] + - [4, 12361.0] + - - [4096, 1024, 1, 3449, 4096, 4096, 4096, 1024] + - [17, 12358.0] + - - [1024, 4096, 1, 3403, 1024, 1024, 1024, 4096] + - [17, 12362.0] + - - [1024, 4096, 1, 3523, 1024, 1024, 1024, 4096] + - [17, 12384.0] + - - [1024, 4096, 1, 3486, 1024, 1024, 1024, 4096] + - [4, 12361.0] + - - [1024, 4096, 1, 3564, 1024, 1024, 1024, 4096] + - [17, 12367.0] + - - [1024, 33708, 1, 4005, 1024, 1024, 1024, 33708] + - [17, 12747.0] + - - [4096, 1024, 1, 3296, 4096, 4096, 4096, 1024] + - [17, 12363.0] + - - [1024, 4096, 1, 3263, 1024, 1024, 1024, 4096] + - [17, 12351.0] + - - [1024, 4096, 1, 3130, 1024, 1024, 1024, 4096] + - [17, 12350.0] + - - [1024, 4096, 1, 3295, 1024, 1024, 1024, 4096] + - [17, 12359.0] + - - [1024, 33708, 1, 3925, 1024, 1024, 1024, 33708] + - [17, 12747.0] + - - [1024, 4096, 1, 3378, 1024, 1024, 1024, 4096] + - [15, 12360.0] + - - [4096, 1024, 1, 3720, 4096, 4096, 4096, 1024] + - [17, 12367.0] + - - [4096, 1024, 1, 3399, 4096, 4096, 4096, 1024] + - [15, 12352.0] + - - [4096, 1024, 1, 3543, 4096, 4096, 4096, 1024] + - [17, 12364.0] + - - [4096, 1024, 1, 3497, 4096, 4096, 4096, 1024] + - [17, 12382.0] + - - [4096, 1024, 1, 3594, 4096, 4096, 4096, 1024] + - [17, 12365.0] + - - [1024, 4096, 1, 3144, 1024, 1024, 1024, 4096] + - [15, 12363.0] + - - [1024, 4096, 1, 3975, 1024, 1024, 1024, 4096] + - [17, 12376.0] + - - [4096, 1024, 1, 3205, 4096, 4096, 4096, 1024] + - [17, 12353.0] + - - [1024, 33708, 1, 3995, 1024, 1024, 1024, 33708] + - [4, 12746.0] + - - [1024, 4096, 1, 3392, 1024, 1024, 1024, 4096] + - [15, 12363.0] + - - [1024, 4096, 1, 3055, 1024, 1024, 1024, 4096] + - [17, 12349.0] + - - [1024, 4096, 1, 4026, 1024, 1024, 1024, 4096] + - [17, 12377.0] + - - [4096, 1024, 1, 3557, 4096, 4096, 4096, 1024] + - [17, 12368.0] + - - [4096, 1024, 1, 3515, 4096, 4096, 4096, 1024] + - [17, 12358.0] + - - [4096, 1024, 1, 3486, 4096, 4096, 4096, 1024] + - [1, 12360.0] + - - [4096, 1024, 1, 3457, 4096, 4096, 4096, 1024] + - [4, 12367.0] + - - [1024, 4096, 1, 3511, 1024, 1024, 1024, 4096] + - [17, 12357.0] + - - [4096, 1024, 1, 3138, 4096, 4096, 4096, 1024] + - [11, 12356.0] + - - [1024, 4096, 1, 3339, 1024, 1024, 1024, 4096] + - [17, 12360.0] + - - [1024, 4096, 1, 3939, 1024, 1024, 1024, 4096] + - [17, 12375.0] + - - [4096, 1024, 1, 3500, 4096, 4096, 4096, 1024] + - [1, 12353.0] + - - [4096, 1024, 1, 3395, 4096, 4096, 4096, 1024] + - [23, 12366.0] + - - [4096, 1024, 1, 4020, 4096, 4096, 4096, 1024] + - [17, 12374.0] + - - [4096, 1024, 1, 3942, 4096, 4096, 4096, 1024] + - [17, 12369.0] + - - [4096, 1024, 1, 3349, 4096, 4096, 4096, 1024] + - [17, 12361.0] + - - [1024, 4096, 1, 3322, 1024, 1024, 1024, 4096] + - [17, 12352.0] + - - [4096, 1024, 1, 3452, 4096, 4096, 4096, 1024] + - [17, 12357.0] + - - [1024, 4096, 1, 3417, 1024, 1024, 1024, 4096] + - [4, 12357.0] + - - [1024, 4096, 1, 3526, 1024, 1024, 1024, 4096] + - [17, 12359.0] + - - [4096, 1024, 1, 3485, 4096, 4096, 4096, 1024] + - [17, 12363.0] + - - [4096, 1024, 1, 3303, 4096, 4096, 4096, 1024] + - [4, 12368.0] + - - [4096, 1024, 1, 3344, 4096, 4096, 4096, 1024] + - [4, 12356.0] + - - [1024, 4096, 1, 3479, 1024, 1024, 1024, 4096] + - [15, 12359.0] + - - [4096, 1024, 1, 3300, 4096, 4096, 4096, 1024] + - [1, 12349.0] + - - [1024, 4096, 1, 3439, 1024, 1024, 1024, 4096] + - [4, 12355.0] + - - [4096, 1024, 1, 3280, 4096, 4096, 4096, 1024] + - [4, 12359.0] + - - [1024, 4096, 1, 3245, 1024, 1024, 1024, 4096] + - [15, 12352.0] + - - [1024, 4096, 1, 3328, 1024, 1024, 1024, 4096] + - [4, 12363.0] + - - [4096, 1024, 1, 3418, 4096, 4096, 4096, 1024] + - [17, 12355.0] + - - [1024, 4096, 1, 3493, 1024, 1024, 1024, 4096] + - [15, 12357.0] + - - [1024, 4096, 1, 3500, 1024, 1024, 1024, 4096] + - [11, 12370.0] + - - [1024, 4096, 1, 3166, 1024, 1024, 1024, 4096] + - [15, 12367.0] + - - [4096, 1024, 1, 3126, 4096, 4096, 4096, 1024] + - [17, 12352.0] + - - [1024, 4096, 1, 3277, 1024, 1024, 1024, 4096] + - [4, 12353.0] + - - [1024, 4096, 1, 3315, 1024, 1024, 1024, 4096] + - [15, 12354.0] + - - [1024, 4096, 1, 3414, 1024, 1024, 1024, 4096] + - [17, 12360.0] + - - [4096, 1024, 1, 3531, 4096, 4096, 4096, 1024] + - [17, 12359.0] + - - [4096, 1024, 1, 3484, 4096, 4096, 4096, 1024] + - [17, 12363.0] + - - [1024, 4096, 1, 3180, 1024, 1024, 1024, 4096] + - [15, 12356.0] + - - [4096, 1024, 1, 3360, 4096, 4096, 4096, 1024] + - [17, 12367.0] + - - [1024, 33708, 1, 3990, 1024, 1024, 1024, 33708] + - [29, 12750.0] + - - [4096, 1024, 1, 3466, 4096, 4096, 4096, 1024] + - [29, 12357.0] + - - [1024, 4096, 1, 3428, 1024, 1024, 1024, 4096] + - [15, 12361.0] + - - [1024, 4096, 1, 3137, 1024, 1024, 1024, 4096] + - [23, 12357.0] + - - [4096, 1024, 1, 4059, 4096, 4096, 4096, 1024] + - [4, 12369.0] + - - [1024, 4096, 1, 3353, 1024, 1024, 1024, 4096] + - [17, 12356.0] + - - [1024, 4096, 1, 3942, 1024, 1024, 1024, 4096] + - [17, 12373.0] + - - [4096, 1024, 1, 3506, 4096, 4096, 4096, 1024] + - [17, 12366.0] + - - [4096, 1024, 1, 3508, 4096, 4096, 4096, 1024] + - [17, 12366.0] + - - [4096, 1024, 1, 3956, 4096, 4096, 4096, 1024] + - [17, 12371.0] + - - [1024, 4096, 1, 3272, 1024, 1024, 1024, 4096] + - [17, 12365.0] + - - [1024, 4096, 1, 3443, 1024, 1024, 1024, 4096] + - [17, 12362.0] + - - [1024, 4096, 1, 3375, 1024, 1024, 1024, 4096] + - [17, 12359.0] + - - [1024, 4096, 1, 3525, 1024, 1024, 1024, 4096] + - [4, 12375.0] + - - [4096, 1024, 1, 3472, 4096, 4096, 4096, 1024] + - [17, 12370.0] + - - [1024, 4096, 1, 3520, 1024, 1024, 1024, 4096] + - [17, 12366.0] + - - [4096, 1024, 1, 3322, 4096, 4096, 4096, 1024] + - [29, 12356.0] + - - [4096, 1024, 1, 3387, 4096, 4096, 4096, 1024] + - [17, 12357.0] + - - [1024, 33708, 1, 3939, 1024, 1024, 1024, 33708] + - [4, 12750.0] + - - [4096, 1024, 1, 3345, 4096, 4096, 4096, 1024] + - [17, 12361.0] + - - [4096, 1024, 1, 2967, 4096, 4096, 4096, 1024] + - [29, 12366.0] + - - [1024, 4096, 1, 3453, 1024, 1024, 1024, 4096] + - [17, 12358.0] + - - [1024, 4096, 1, 3640, 1024, 1024, 1024, 4096] + - [17, 12368.0] + - - [4096, 1024, 1, 3291, 4096, 4096, 4096, 1024] + - [4, 12358.0] + - - [1024, 4096, 1, 3350, 1024, 1024, 1024, 4096] + - [17, 12362.0] + - - [4096, 1024, 1, 3417, 4096, 4096, 4096, 1024] + - [17, 12360.0] + - - [1024, 4096, 1, 3467, 1024, 1024, 1024, 4096] + - [17, 12362.0] + - - [1024, 4096, 1, 3491, 1024, 1024, 1024, 4096] + - [23, 12366.0] + - - [1024, 4096, 1, 3822, 1024, 1024, 1024, 4096] + - [17, 12368.0] + - - [4096, 1024, 1, 3292, 4096, 4096, 4096, 1024] + - [17, 12359.0] + - - [1024, 4096, 1, 3231, 1024, 1024, 1024, 4096] + - [17, 12360.0] + - - [1024, 4096, 1, 3364, 1024, 1024, 1024, 4096] + - [17, 12360.0] + - - [1024, 4096, 1, 3995, 1024, 1024, 1024, 4096] + - [17, 12369.0] + - - [1024, 4096, 1, 3545, 1024, 1024, 1024, 4096] + - [4, 12362.0] + - - [1024, 4096, 1, 3186, 1024, 1024, 1024, 4096] + - [4, 12352.0] + - - [4096, 1024, 1, 3432, 4096, 4096, 4096, 1024] + - [1, 12358.0] + - - [4096, 1024, 1, 3367, 4096, 4096, 4096, 1024] + - [17, 12364.0] + - - [4096, 1024, 1, 3503, 4096, 4096, 4096, 1024] + - [17, 12383.0] + - - [1024, 4096, 1, 3095, 1024, 1024, 1024, 4096] + - [17, 12356.0] + - - [4096, 1024, 1, 3465, 4096, 4096, 4096, 1024] + - [17, 12360.0] + - - [1024, 4096, 1, 3402, 1024, 1024, 1024, 4096] + - [17, 12361.0] + - - [4096, 1024, 1, 3140, 4096, 4096, 4096, 1024] + - [17, 12353.0] + - - [4096, 1024, 1, 3424, 4096, 4096, 4096, 1024] + - [1, 12367.0] + - - [4096, 1024, 1, 3257, 4096, 4096, 4096, 1024] + - [4, 12373.0] + - - [4096, 1024, 1, 2917, 4096, 4096, 4096, 1024] + - [15, 12362.0] + - - [1024, 33708, 1, 3640, 1024, 1024, 1024, 33708] + - [4, 12747.0] + - - [1024, 4096, 1, 3456, 1024, 1024, 1024, 4096] + - [17, 12367.0] + - - [1024, 4096, 1, 3014, 1024, 1024, 1024, 4096] + - [11, 12363.0] + - - [4096, 1024, 1, 3372, 4096, 4096, 4096, 1024] + - [17, 12358.0] + - - [1024, 4096, 1, 3294, 1024, 1024, 1024, 4096] + - [4, 12357.0] + - - [4096, 1024, 1, 3446, 4096, 4096, 4096, 1024] + - [17, 12356.0] + - - [1024, 4096, 1, 3389, 1024, 1024, 1024, 4096] + - [15, 12356.0] + - - [4096, 1024, 1, 3259, 4096, 4096, 4096, 1024] + - [15, 12355.0] + - - [4096, 1024, 1, 3544, 4096, 4096, 4096, 1024] + - [17, 12367.0] + - - [4096, 1024, 1, 3479, 4096, 4096, 4096, 1024] + - [4, 12373.0] + - - [4096, 1024, 1, 3542, 4096, 4096, 4096, 1024] + - [17, 12366.0] + - - [4096, 1024, 1, 3321, 4096, 4096, 4096, 1024] + - [17, 12363.0] + - - [1024, 4096, 1, 3147, 1024, 1024, 1024, 4096] + - [15, 12356.0] + - - [1024, 4096, 1, 3944, 1024, 1024, 1024, 4096] + - [4, 12366.0] + - - [4096, 1024, 1, 3870, 4096, 4096, 4096, 1024] + - [17, 12370.0] + - - [1024, 4096, 1, 3308, 1024, 1024, 1024, 4096] + - [17, 12355.0] + - - [4096, 1024, 1, 3401, 4096, 4096, 4096, 1024] + - [17, 12361.0] + - - [1024, 4096, 1, 3395, 1024, 1024, 1024, 4096] + - [4, 12366.0] + - - [1024, 4096, 1, 3563, 1024, 1024, 1024, 4096] + - [17, 12363.0] + - - [1024, 33708, 1, 3870, 1024, 1024, 1024, 33708] + - [17, 12750.0] + - - [4096, 1024, 1, 3494, 4096, 4096, 4096, 1024] + - [17, 12362.0] + - - [1024, 4096, 1, 3271, 1024, 1024, 1024, 4096] + - [17, 12357.0] + - - [1024, 33708, 1, 3910, 1024, 1024, 1024, 33708] + - [17, 12750.0] + - - [1024, 4096, 1, 3287, 1024, 1024, 1024, 4096] + - [17, 12357.0] + - - [1024, 33708, 1, 3860, 1024, 1024, 1024, 33708] + - [17, 12747.0] + - - [4096, 1024, 1, 3341, 4096, 4096, 4096, 1024] + - [17, 12360.0] + - - [1024, 4096, 1, 3136, 1024, 1024, 1024, 4096] + - [4, 12361.0] + - - [4096, 1024, 1, 3439, 4096, 4096, 4096, 1024] + - [17, 12362.0] + - - [1024, 4096, 1, 3751, 1024, 1024, 1024, 4096] + - [17, 12366.0] + - - [1024, 4096, 1, 3301, 1024, 1024, 1024, 4096] + - [15, 12362.0] + - - [4096, 1024, 1, 3468, 4096, 4096, 4096, 1024] + - [17, 12357.0] + - - [1024, 4096, 1, 3416, 1024, 1024, 1024, 4096] + - [17, 12357.0] + - - [4096, 1024, 1, 3163, 4096, 4096, 4096, 1024] + - [29, 12354.0] + - - [1024, 4096, 1, 3230, 1024, 1024, 1024, 4096] + - [17, 12350.0] + - - [1024, 4096, 1, 3581, 1024, 1024, 1024, 4096] + - [23, 12365.0] + - - [4096, 1024, 1, 3463, 4096, 4096, 4096, 1024] + - [17, 12361.0] + - - [1024, 4096, 1, 3478, 1024, 1024, 1024, 4096] + - [4, 12362.0] + - - [4096, 1024, 1, 3262, 4096, 4096, 4096, 1024] + - [17, 12350.0] + - - [1024, 4096, 1, 3438, 1024, 1024, 1024, 4096] + - [17, 12358.0] + - - [1024, 4096, 1, 3244, 1024, 1024, 1024, 4096] + - [17, 12358.0] + - - [1024, 4096, 1, 3445, 1024, 1024, 1024, 4096] + - [4, 12376.0] + - - [4096, 1024, 1, 3328, 4096, 4096, 4096, 1024] + - [4, 12359.0] + - - [1024, 4096, 1, 3492, 1024, 1024, 1024, 4096] + - [17, 12361.0] + - - [4096, 1024, 1, 3211, 4096, 4096, 4096, 1024] + - [17, 12362.0] + - - [1024, 4096, 1, 3910, 1024, 1024, 1024, 4096] + - [4, 12367.0] + - - [1024, 4096, 1, 3314, 1024, 1024, 1024, 4096] + - [29, 12372.0] + - - [4096, 1024, 1, 3859, 4096, 4096, 4096, 1024] + - [4, 12371.0] + - - [4096, 1024, 1, 3383, 4096, 4096, 4096, 1024] + - [29, 12358.0] + - - [1024, 4096, 1, 3409, 1024, 1024, 1024, 4096] + - [15, 12362.0] + - - [1024, 4096, 1, 4020, 1024, 1024, 1024, 4096] + - [4, 12371.0] + - - [4096, 1024, 1, 3530, 4096, 4096, 4096, 1024] + - [17, 12359.0] + - - [4096, 1024, 1, 3411, 4096, 4096, 4096, 1024] + - [17, 12355.0] + - - [1024, 4096, 1, 3566, 1024, 1024, 1024, 4096] + - [17, 12361.0] + - - [4096, 1024, 1, 3493, 4096, 4096, 4096, 1024] + - [17, 12360.0] + - - [4096, 1024, 1, 3184, 4096, 4096, 4096, 1024] + - [11, 12361.0] + - - [1024, 4096, 1, 3072, 1024, 1024, 1024, 4096] + - [4, 12351.0] + - - [1024, 4096, 1, 3431, 1024, 1024, 1024, 4096] + - [15, 12355.0] + - - [4096, 1024, 1, 3306, 4096, 4096, 4096, 1024] + - [17, 12352.0] + - - [1024, 4096, 1, 3352, 1024, 1024, 1024, 4096] + - [15, 12362.0] + - - [4096, 1024, 1, 3295, 4096, 4096, 4096, 1024] + - [17, 12359.0] + - - [1024, 4096, 1, 3517, 1024, 1024, 1024, 4096] + - [17, 12361.0] + - - [4096, 1024, 1, 3426, 4096, 4096, 4096, 1024] + - [17, 12361.0] + - - [4096, 1024, 1, 3385, 4096, 4096, 4096, 1024] + - [17, 12356.0] + - - [4096, 1024, 1, 3572, 4096, 4096, 4096, 1024] + - [17, 12366.0] + - - [4096, 1024, 1, 3459, 4096, 4096, 4096, 1024] + - [17, 12360.0] + - - [1024, 4096, 1, 3374, 1024, 1024, 1024, 4096] + - [17, 12357.0] + - - [4096, 1024, 1, 3166, 4096, 4096, 4096, 1024] + - [4, 12353.0] + - - [4096, 1024, 1, 3093, 4096, 4096, 4096, 1024] + - [4, 12352.0] + - - [4096, 1024, 1, 3523, 4096, 4096, 4096, 1024] + - [17, 12360.0] + - - [4096, 1024, 1, 3413, 4096, 4096, 4096, 1024] + - [17, 12361.0] + - - [1024, 4096, 1, 3996, 1024, 1024, 1024, 4096] + - [17, 12368.0] + - - [1024, 4096, 1, 3452, 1024, 1024, 1024, 4096] + - [17, 12360.0] + - - [4096, 1024, 1, 3232, 4096, 4096, 4096, 1024] + - [17, 12359.0] + - - [4096, 1024, 1, 3400, 4096, 4096, 4096, 1024] + - [4, 12371.0] + - - [4096, 1024, 1, 3334, 4096, 4096, 4096, 1024] + - [17, 12356.0] + - - [1024, 4096, 1, 3345, 1024, 1024, 1024, 4096] + - [15, 12351.0] + - - [1024, 4096, 1, 3538, 1024, 1024, 1024, 4096] + - [17, 12361.0] + - - [1024, 4096, 1, 3466, 1024, 1024, 1024, 4096] + - [23, 12364.0] + - - [4096, 1024, 1, 3315, 4096, 4096, 4096, 1024] + - [17, 12356.0] + - - [4096, 1024, 1, 3214, 4096, 4096, 4096, 1024] + - [29, 12364.0] + - - [1024, 33708, 1, 3900, 1024, 1024, 1024, 33708] + - [4, 12744.0] + - - [1024, 4096, 1, 3367, 1024, 1024, 1024, 4096] + - [17, 12355.0] + - - [1024, 4096, 1, 2917, 1024, 1024, 1024, 4096] + - [4, 12362.0] + - - [1024, 4096, 1, 3544, 1024, 1024, 1024, 4096] + - [11, 12363.0] + - - [4096, 1024, 1, 3414, 4096, 4096, 4096, 1024] + - [17, 12356.0] + - - [4096, 1024, 1, 3565, 4096, 4096, 4096, 1024] + - [17, 12359.0] + - - [1024, 4096, 1, 3512, 1024, 1024, 1024, 4096] + - [15, 12361.0] + - - [1024, 4096, 1, 3191, 1024, 1024, 1024, 4096] + - [23, 12358.0] + - - [1024, 4096, 1, 3289, 1024, 1024, 1024, 4096] + - [17, 12359.0] + - - [4096, 1024, 1, 3290, 4096, 4096, 4096, 1024] + - [4, 12353.0] + - - [1024, 4096, 1, 3211, 1024, 1024, 1024, 4096] + - [17, 12353.0] + - - [1024, 33708, 1, 3969, 1024, 1024, 1024, 33708] + - [17, 12747.0] + - - [4096, 1024, 1, 3566, 4096, 4096, 4096, 1024] + - [17, 12357.0] + - - [1024, 4096, 1, 3459, 1024, 1024, 1024, 4096] + - [17, 12368.0] + - - [1024, 4096, 1, 3372, 1024, 1024, 1024, 4096] + - [11, 12361.0] + - - [4096, 1024, 1, 3339, 4096, 4096, 4096, 1024] + - [29, 12351.0] + - - [4096, 1024, 1, 3425, 4096, 4096, 4096, 1024] + - [17, 12386.0] + - - [4096, 1024, 1, 3388, 4096, 4096, 4096, 1024] + - [23, 12370.0] + - - [1024, 4096, 1, 3531, 1024, 1024, 1024, 4096] + - [15, 12365.0] + - - [4096, 1024, 1, 3286, 4096, 4096, 4096, 1024] + - [17, 12355.0] + - - [4096, 1024, 1, 3462, 4096, 4096, 4096, 1024] + - [29, 12355.0] + - - [1024, 4096, 1, 3388, 1024, 1024, 1024, 4096] + - [17, 12358.0] + - - [4096, 1024, 1, 3165, 4096, 4096, 4096, 1024] + - [17, 12350.0] + - - [4096, 1024, 1, 3304, 4096, 4096, 4096, 1024] + - [4, 12367.0] + - - [1024, 4096, 1, 2736, 1024, 1024, 1024, 4096] + - [15, 12357.0] + - - [4096, 1024, 1, 3397, 4096, 4096, 4096, 1024] + - [17, 12358.0] + - - [1024, 4096, 1, 3311, 1024, 1024, 1024, 4096] + - [15, 12355.0] + - - [1024, 4096, 1, 3394, 1024, 1024, 1024, 4096] + - [4, 12362.0] + - - [4096, 1024, 1, 2736, 4096, 4096, 4096, 1024] + - [17, 12349.0] + - - [1024, 4096, 1, 3559, 1024, 1024, 1024, 4096] + - [17, 12368.0] + - - [4096, 1024, 1, 3180, 4096, 4096, 4096, 1024] + - [17, 12354.0] + - - [1024, 4096, 1, 3480, 1024, 1024, 1024, 4096] + - [15, 12371.0] + - - [4096, 1024, 1, 3318, 4096, 4096, 4096, 1024] + - [17, 12360.0] + - - [4096, 1024, 1, 3213, 4096, 4096, 4096, 1024] + - [23, 12357.0] + - - [1024, 4096, 1, 3286, 1024, 1024, 1024, 4096] + - [17, 12355.0] + - - [4096, 1024, 1, 3471, 4096, 4096, 4096, 1024] + - [17, 12354.0] + - - [1024, 4096, 1, 3381, 1024, 1024, 1024, 4096] + - [4, 12358.0] + - - [4096, 1024, 1, 3502, 4096, 4096, 4096, 1024] + - [17, 12362.0] + - - [1024, 4096, 1, 3552, 1024, 1024, 1024, 4096] + - [15, 12368.0] + - - [4096, 1024, 1, 3519, 4096, 4096, 4096, 1024] + - [17, 12360.0] + - - [1024, 4096, 1, 3300, 1024, 1024, 1024, 4096] + - [4, 12359.0] + - - [1024, 4096, 1, 3419, 1024, 1024, 1024, 4096] + - [15, 12373.0] + - - [4096, 1024, 1, 4030, 4096, 4096, 4096, 1024] + - [29, 12363.0] + - - [4096, 1024, 1, 3976, 4096, 4096, 4096, 1024] + - [17, 12371.0] + - - [1024, 4096, 1, 3473, 1024, 1024, 1024, 4096] + - [17, 12357.0] + - - [4096, 1024, 1, 3428, 4096, 4096, 4096, 1024] + - [17, 12365.0] + - - [1024, 4096, 1, 3433, 1024, 1024, 1024, 4096] + - [17, 12355.0] + - - [4096, 1024, 1, 3534, 4096, 4096, 4096, 1024] + - [4, 12359.0] + - - [4096, 1024, 1, 3461, 4096, 4096, 4096, 1024] + - [17, 12353.0] + - - [4096, 1024, 1, 3681, 4096, 4096, 4096, 1024] + - [17, 12363.0] + - - [4096, 1024, 1, 3495, 4096, 4096, 4096, 1024] + - [29, 12369.0] + - - [4096, 1024, 1, 3351, 4096, 4096, 4096, 1024] + - [17, 12352.0] + - - [1024, 4096, 1, 4059, 1024, 1024, 1024, 4096] + - [4, 12370.0] + - - [4096, 1024, 1, 3990, 4096, 4096, 4096, 1024] + - [17, 12370.0] + - - [1024, 4096, 1, 3325, 1024, 1024, 1024, 4096] + - [4, 12359.0] + - - [1024, 4096, 1, 3408, 1024, 1024, 1024, 4096] + - [17, 12371.0] + - - [4096, 1024, 1, 3394, 4096, 4096, 4096, 1024] + - [17, 12362.0] + - - [1024, 4096, 1, 3573, 1024, 1024, 1024, 4096] + - [15, 12360.0] + - - [4096, 1024, 1, 3386, 4096, 4096, 4096, 1024] + - [17, 12358.0] + - - [4096, 1024, 1, 3540, 4096, 4096, 4096, 1024] + - [17, 12363.0] + - - [1024, 4096, 1, 3182, 1024, 1024, 1024, 4096] + - [15, 12357.0] + - - [1024, 4096, 1, 3430, 1024, 1024, 1024, 4096] + - [17, 12379.0] + - - [1024, 4096, 1, 3236, 1024, 1024, 1024, 4096] + - [23, 12361.0] + - - [4096, 1024, 1, 2977, 4096, 4096, 4096, 1024] + - [17, 12354.0] + - - [1024, 4096, 1, 3355, 1024, 1024, 1024, 4096] + - [17, 12359.0] + - - [4096, 1024, 1, 3139, 4096, 4096, 4096, 1024] + - [4, 12356.0] + - - [4096, 1024, 1, 3516, 4096, 4096, 4096, 1024] + - [17, 12362.0] + - - [4096, 1024, 1, 3368, 4096, 4096, 4096, 1024] + - [17, 12359.0] + - - [4096, 1024, 1, 3559, 4096, 4096, 4096, 1024] + - [17, 12379.0] + - - [1024, 4096, 1, 3506, 1024, 1024, 1024, 4096] + - [17, 12363.0] + - - [1024, 4096, 1, 3145, 1024, 1024, 1024, 4096] + - [23, 12357.0] + - - [1024, 4096, 1, 3369, 1024, 1024, 1024, 4096] + - [29, 12362.0] + - - [4096, 1024, 1, 3522, 4096, 4096, 4096, 1024] + - [17, 12359.0] + - - [1024, 33708, 1, 3894, 1024, 1024, 1024, 33708] + - [4, 12749.0] + - - [4096, 1024, 1, 3336, 4096, 4096, 4096, 1024] + - [15, 12360.0] + - - [1024, 4096, 1, 3382, 1024, 1024, 1024, 4096] + - [17, 12359.0] + - - [4096, 1024, 1, 3533, 4096, 4096, 4096, 1024] + - [17, 12359.0] + - - [4096, 1024, 1, 4050, 4096, 4096, 4096, 1024] + - [17, 12375.0] + - - [4096, 1024, 1, 3480, 4096, 4096, 4096, 1024] + - [15, 12361.0] + - - [1024, 4096, 1, 3344, 1024, 1024, 1024, 4096] + - [4, 12361.0] + - - [1024, 4096, 1, 3509, 1024, 1024, 1024, 4096] + - [17, 12382.0] + - - [1024, 4096, 1, 3956, 1024, 1024, 1024, 4096] + - [17, 12364.0] + - - [4096, 1024, 1, 3616, 4096, 4096, 4096, 1024] + - [4, 12368.0] + - - [1024, 4096, 1, 3366, 1024, 1024, 1024, 4096] + - [15, 12359.0] + - - [4096, 1024, 1, 2935, 4096, 4096, 4096, 1024] + - [4, 12353.0] + - - [4096, 1024, 1, 3393, 4096, 4096, 4096, 1024] + - [17, 12366.0] + - - [4096, 1024, 1, 3547, 4096, 4096, 4096, 1024] + - [17, 12363.0] + - - [1024, 4096, 1, 3499, 1024, 1024, 1024, 4096] + - [4, 12360.0] + - - [4096, 1024, 1, 3357, 4096, 4096, 4096, 1024] + - [4, 12355.0] + - - [4096, 1024, 1, 3272, 4096, 4096, 4096, 1024] + - [4, 12373.0] + - - [4096, 1024, 1, 3207, 4096, 4096, 4096, 1024] + - [17, 12355.0] + - - [4096, 1024, 1, 3894, 4096, 4096, 4096, 1024] + - [17, 12373.0] + - - [1024, 4096, 1, 3444, 1024, 1024, 1024, 4096] + - [15, 12363.0] + - - [4096, 1024, 1, 3561, 4096, 4096, 4096, 1024] + - [17, 12366.0] + - - [4096, 1024, 1, 3376, 4096, 4096, 4096, 1024] + - [4, 12364.0] + - - [1024, 4096, 1, 3458, 1024, 1024, 1024, 4096] + - [17, 12361.0] + - - [4096, 1024, 1, 3231, 4096, 4096, 4096, 1024] + - [17, 12360.0] + - - [1024, 4096, 1, 3505, 1024, 1024, 1024, 4096] + - [15, 12364.0] + - - [4096, 1024, 1, 3277, 4096, 4096, 4096, 1024] + - [17, 12359.0] + - - [1024, 4096, 1, 3391, 1024, 1024, 1024, 4096] + - [17, 12362.0] + - - [1024, 4096, 1, 3536, 1024, 1024, 1024, 4096] + - [4, 12383.0] + - - [1024, 4096, 1, 3063, 1024, 1024, 1024, 4096] + - [11, 12356.0] + - - [1024, 4096, 1, 3189, 1024, 1024, 1024, 4096] + - [17, 12373.0] + - - [1024, 4096, 1, 2505, 1024, 1024, 1024, 4096] + - [23, 12347.0] + - - [4096, 1024, 1, 3454, 4096, 4096, 4096, 1024] + - [17, 12365.0] + - - [1024, 4096, 1, 3405, 1024, 1024, 1024, 4096] + - [15, 12360.0] + - - [1024, 33708, 1, 4050, 1024, 1024, 1024, 33708] + - [17, 12749.0] + - - [4096, 1024, 1, 3520, 4096, 4096, 4096, 1024] + - [17, 12372.0] + - - [1024, 4096, 1, 3487, 1024, 1024, 1024, 4096] + - [15, 12363.0] + - - [1024, 4096, 1, 3558, 1024, 1024, 1024, 4096] + - [4, 12366.0] + - - [4096, 1024, 1, 3297, 4096, 4096, 4096, 1024] + - [17, 12365.0] + - - [1024, 4096, 1, 3483, 1024, 1024, 1024, 4096] + - [4, 12364.0] + - - [1024, 33708, 1, 3751, 1024, 1024, 1024, 33708] + - [4, 12750.0] + - - [4096, 1024, 1, 3380, 4096, 4096, 4096, 1024] + - [17, 12355.0] + - - [1024, 4096, 1, 3380, 1024, 1024, 1024, 4096] + - [4, 12356.0] + - - [1024, 4096, 1, 3396, 1024, 1024, 1024, 4096] + - [4, 12359.0] + - - [1024, 4096, 1, 3497, 1024, 1024, 1024, 4096] + - [15, 12363.0] + - - [1024, 4096, 1, 3502, 1024, 1024, 1024, 4096] + - [15, 12363.0] + - - [1024, 4096, 1, 3138, 1024, 1024, 1024, 4096] + - [15, 12351.0] + - - [4096, 1024, 1, 3939, 4096, 4096, 4096, 1024] + - [4, 12364.0] + - - [1024, 4096, 1, 3303, 1024, 1024, 1024, 4096] + - [17, 12356.0] + - - [1024, 4096, 1, 3418, 1024, 1024, 1024, 4096] + - [17, 12383.0] + - - [1024, 4096, 1, 3224, 1024, 1024, 1024, 4096] + - [15, 12364.0] + - - [4096, 1024, 1, 3978, 4096, 4096, 4096, 1024] + - [17, 12372.0] + - - [1024, 4096, 1, 3472, 1024, 1024, 1024, 4096] + - [4, 12367.0] + - - [4096, 1024, 1, 3353, 4096, 4096, 4096, 1024] + - [17, 12358.0] + - - [4096, 1024, 1, 3362, 4096, 4096, 4096, 1024] + - [17, 12355.0] + - - [1024, 33708, 1, 3978, 1024, 1024, 1024, 33708] + - [4, 12753.0] + - - [1024, 4096, 1, 3432, 1024, 1024, 1024, 4096] + - [15, 12362.0] + - - [1024, 4096, 1, 3139, 1024, 1024, 1024, 4096] + - [17, 12351.0] + - - [1024, 4096, 1, 3341, 1024, 1024, 1024, 4096] + - [4, 12371.0] + - - [1024, 4096, 1, 3494, 1024, 1024, 1024, 4096] + - [15, 12371.0] + - - [1024, 4096, 1, 3969, 1024, 1024, 1024, 4096] + - [4, 12369.0] + - - [1024, 4096, 1, 3163, 1024, 1024, 1024, 4096] + - [15, 12353.0] + - - [4096, 1024, 1, 3405, 4096, 4096, 4096, 1024] + - [4, 12355.0] + - - [4096, 1024, 1, 3453, 4096, 4096, 4096, 1024] + - [17, 12358.0] + - - [1024, 4096, 1, 3411, 1024, 1024, 1024, 4096] + - [15, 12360.0] + - - [1024, 4096, 1, 3527, 1024, 1024, 1024, 4096] + - [17, 12361.0] + - - [4096, 1024, 1, 3474, 4096, 4096, 4096, 1024] + - [17, 12365.0] + - - [1024, 4096, 1, 3572, 1024, 1024, 1024, 4096] + - [17, 12362.0] + - - [4096, 1024, 1, 3293, 4096, 4096, 4096, 1024] + - [17, 12358.0] + - - [4096, 1024, 1, 3247, 4096, 4096, 4096, 1024] + - [4, 12370.0] + - - [1024, 4096, 1, 3425, 1024, 1024, 1024, 4096] + - [15, 12363.0] + - - [1024, 4096, 1, 3354, 1024, 1024, 1024, 4096] + - [17, 12373.0] + - - [4096, 1024, 1, 3382, 4096, 4096, 4096, 1024] + - [4, 12356.0] + - - [4096, 1024, 1, 3236, 4096, 4096, 4096, 1024] + - [29, 12362.0] + - - [1024, 4096, 1, 3519, 1024, 1024, 1024, 4096] + - [4, 12360.0] + - - [4096, 1024, 1, 3354, 4096, 4096, 4096, 1024] + - [17, 12357.0] + - - [4096, 1024, 1, 3501, 4096, 4096, 4096, 1024] + - [17, 12360.0] + - - [4096, 1024, 1, 3266, 4096, 4096, 4096, 1024] + - [4, 12372.0] + - - [1024, 4096, 1, 3368, 1024, 1024, 1024, 4096] + - [15, 12363.0] + - - [1024, 4096, 1, 4030, 1024, 1024, 1024, 4096] + - [29, 12367.0] + - - [1024, 4096, 1, 3533, 1024, 1024, 1024, 4096] + - [15, 12364.0] + - - [4096, 1024, 1, 3332, 4096, 4096, 4096, 1024] + - [4, 12353.0] + - - [4096, 1024, 1, 3584, 4096, 4096, 4096, 1024] + - [17, 12369.0] + - - [1024, 4096, 1, 3616, 1024, 1024, 1024, 4096] + - [17, 12373.0] + - - [4096, 1024, 1, 3265, 4096, 4096, 4096, 1024] + - [17, 12361.0] + - - [4096, 1024, 1, 3361, 4096, 4096, 4096, 1024] + - [29, 12371.0] + - - [4096, 1024, 1, 3467, 4096, 4096, 4096, 1024] + - [17, 12357.0] + - - [1024, 4096, 1, 3454, 1024, 1024, 1024, 4096] + - [17, 12355.0] + - - [1024, 4096, 1, 3101, 1024, 1024, 1024, 4096] + - [1, 12353.0] + - - [1024, 4096, 1, 3508, 1024, 1024, 1024, 4096] + - [4, 12363.0] + - - [4096, 1024, 1, 3267, 4096, 4096, 4096, 1024] + - [17, 12358.0] + - - [4096, 1024, 1, 3419, 4096, 4096, 4096, 1024] + - [17, 12379.0] + - - [4096, 1024, 1, 3822, 4096, 4096, 4096, 1024] + - [17, 12370.0] + - - [1024, 4096, 1, 3266, 1024, 1024, 1024, 4096] + - [17, 12354.0] + - - [4096, 1024, 1, 3440, 4096, 4096, 4096, 1024] + - [17, 12362.0] + - - [1024, 4096, 1, 3361, 1024, 1024, 1024, 4096] + - [17, 12358.0] + - - [1024, 4096, 1, 3546, 1024, 1024, 1024, 4096] + - [17, 12366.0] + - - [4096, 1024, 1, 3473, 4096, 4096, 4096, 1024] + - [17, 12363.0] + - - [4096, 1024, 1, 3546, 4096, 4096, 4096, 1024] + - [1, 12360.0] + - - [1024, 4096, 1, 3088, 1024, 1024, 1024, 4096] + - [4, 12358.0] + - - [1024, 4096, 1, 3535, 1024, 1024, 1024, 4096] + - [11, 12364.0] + - - [1024, 4096, 1, 3447, 1024, 1024, 1024, 4096] + - [17, 12381.0] + - - [1024, 4096, 1, 3560, 1024, 1024, 1024, 4096] + - [15, 12367.0] + - - [1024, 4096, 1, 3422, 1024, 1024, 1024, 4096] + - [23, 12366.0] + - - [1024, 4096, 1, 3469, 1024, 1024, 1024, 4096] + - [17, 12359.0] + - - [4096, 1024, 1, 3488, 4096, 4096, 4096, 1024] + - [17, 12366.0] + - - [1024, 4096, 1, 3110, 1024, 1024, 1024, 4096] + - [4, 12348.0] + - - [1024, 4096, 1, 3265, 1024, 1024, 1024, 4096] + - [17, 12361.0] + - - [1024, 4096, 1, 3291, 1024, 1024, 1024, 4096] + - [4, 12352.0] + - - [1024, 4096, 1, 3390, 1024, 1024, 1024, 4096] + - [15, 12367.0] + - - [4096, 1024, 1, 3046, 4096, 4096, 4096, 1024] + - [17, 12356.0] + - - [1024, 4096, 1, 3539, 1024, 1024, 1024, 4096] + - [17, 12364.0] + - - [4096, 1024, 1, 3221, 4096, 4096, 4096, 1024] + - [17, 12355.0] + - - [4096, 1024, 1, 3433, 4096, 4096, 4096, 1024] + - [17, 12358.0] + - - [4096, 1024, 1, 3364, 4096, 4096, 4096, 1024] + - [17, 12364.0] + - - [4096, 1024, 1, 3470, 4096, 4096, 4096, 1024] + - [15, 12352.0] + - - [1024, 4096, 1, 3404, 1024, 1024, 1024, 4096] + - [17, 12364.0] + - - [1024, 33708, 1, 3968, 1024, 1024, 1024, 33708] + - [17, 12752.0] + - - [4096, 1024, 1, 3088, 4096, 4096, 4096, 1024] + - [17, 12359.0] + - - [1024, 4096, 1, 3247, 1024, 1024, 1024, 4096] + - [15, 12357.0] + - - [1024, 33708, 1, 3996, 1024, 1024, 1024, 33708] + - [17, 12748.0] + - - [4096, 1024, 1, 3482, 4096, 4096, 4096, 1024] + - [17, 12359.0] + - - [4096, 1024, 1, 3995, 4096, 4096, 4096, 1024] + - [17, 12370.0] + - - [1024, 4096, 1, 3280, 1024, 1024, 1024, 4096] + - [4, 12361.0] + - - [4096, 1024, 1, 3271, 4096, 4096, 4096, 1024] + - [17, 12359.0] + - - [4096, 1024, 1, 3545, 4096, 4096, 4096, 1024] + - [29, 12372.0] + - - [4096, 1024, 1, 3476, 4096, 4096, 4096, 1024] + - [17, 12361.0] + - - [4096, 1024, 1, 3496, 4096, 4096, 4096, 1024] + - [4, 12359.0] + - - [4096, 1024, 1, 3191, 4096, 4096, 4096, 1024] + - [17, 12356.0] + - - [4096, 1024, 1, 3311, 4096, 4096, 4096, 1024] + - [4, 12360.0] + - - [1024, 4096, 1, 3302, 1024, 1024, 1024, 4096] + - [4, 12355.0] + - - [1024, 4096, 1, 3681, 1024, 1024, 1024, 4096] + - [11, 12367.0] + - - [4096, 1024, 1, 3582, 4096, 4096, 4096, 1024] + - [17, 12358.0] + - - [4096, 1024, 1, 3421, 4096, 4096, 4096, 1024] + - [17, 12372.0] + - - [4096, 1024, 1, 3560, 4096, 4096, 4096, 1024] + - [17, 12363.0] + - - [1024, 4096, 1, 3495, 1024, 1024, 1024, 4096] + - [17, 12360.0] + - - [4096, 1024, 1, 3186, 4096, 4096, 4096, 1024] + - [17, 12353.0] + - - [4096, 1024, 1, 3925, 4096, 4096, 4096, 1024] + - [17, 12365.0] + - - [1024, 4096, 1, 3435, 1024, 1024, 1024, 4096] + - [17, 12360.0] + - - [4096, 1024, 1, 3434, 4096, 4096, 4096, 1024] + - [17, 12357.0] + - - [1024, 33708, 1, 4012, 1024, 1024, 1024, 33708] + - [4, 12748.0] + - - [1024, 4096, 1, 3340, 1024, 1024, 1024, 4096] + - [17, 12359.0] + - - [4096, 1024, 1, 3489, 4096, 4096, 4096, 1024] + - [17, 12361.0] + - - [1024, 4096, 1, 3162, 1024, 1024, 1024, 4096] + - [4, 12355.0] + - - [4096, 1024, 1, 3436, 4096, 4096, 4096, 1024] + - [17, 12363.0] + - - [4096, 1024, 1, 3574, 4096, 4096, 4096, 1024] + - [11, 12358.0] + - - [4096, 1024, 1, 3469, 4096, 4096, 4096, 1024] + - [17, 12365.0] + - - [1024, 4096, 1, 3410, 1024, 1024, 1024, 4096] + - [17, 12368.0] + - - [1024, 4096, 1, 3216, 1024, 1024, 1024, 4096] + - [17, 12367.0] + - - [4096, 1024, 1, 3095, 4096, 4096, 4096, 1024] + - [17, 12351.0] + - - [4096, 1024, 1, 3448, 4096, 4096, 4096, 1024] + - [4, 12359.0] + - - [1024, 4096, 1, 3176, 1024, 1024, 1024, 4096] + - [15, 12355.0] + - - [4096, 1024, 1, 2918, 4096, 4096, 4096, 1024] + - [1, 12354.0] + - - [1024, 4096, 1, 3424, 1024, 1024, 1024, 4096] + - [17, 12366.0] + - - [4096, 1024, 1, 3402, 4096, 4096, 4096, 1024] + - [1, 12350.0] + - - [4096, 1024, 1, 3145, 4096, 4096, 4096, 1024] + - [17, 12355.0] + - - [1024, 33708, 1, 3976, 1024, 1024, 1024, 33708] + - [4, 12754.0] + - - [4096, 1024, 1, 3518, 4096, 4096, 4096, 1024] + - [15, 12348.0] + - - [4096, 1024, 1, 3110, 4096, 4096, 4096, 1024] + - [29, 12371.0] + - - [4096, 1024, 1, 3325, 4096, 4096, 4096, 1024] + - [17, 12355.0] + - - [1024, 33708, 1, 3999, 1024, 1024, 1024, 33708] + - [29, 12749.0] + - - [4096, 1024, 1, 2985, 4096, 4096, 4096, 1024] + - [4, 12350.0] + - - [1024, 4096, 1, 3371, 1024, 1024, 1024, 4096] + - [17, 12360.0] + - - [4096, 1024, 1, 3342, 4096, 4096, 4096, 1024] + - [17, 12356.0] + - - [4096, 1024, 1, 3141, 4096, 4096, 4096, 1024] + - [17, 12354.0] + - - [4096, 1024, 1, 3532, 4096, 4096, 4096, 1024] + - [17, 12374.0] + - - [1024, 4096, 1, 3169, 1024, 1024, 1024, 4096] + - [17, 12354.0] + - - [1024, 4096, 1, 3514, 1024, 1024, 1024, 4096] + - [4, 12358.0] + - - [4096, 1024, 1, 3780, 4096, 4096, 4096, 1024] + - [17, 12367.0] + - - [1024, 4096, 1, 3098, 1024, 1024, 1024, 4096] + - [17, 12348.0] + - - [1024, 4096, 1, 3449, 1024, 1024, 1024, 4096] + - [17, 12362.0] + - - [1024, 4096, 1, 3222, 1024, 1024, 1024, 4096] + - [15, 12356.0] + - - [1024, 4096, 1, 3346, 1024, 1024, 1024, 4096] + - [4, 12370.0] + - - [4096, 1024, 1, 3064, 4096, 4096, 4096, 1024] + - [1, 12352.0] + - - [4096, 1024, 1, 3511, 4096, 4096, 4096, 1024] + - [17, 12361.0] + - - [4096, 1024, 1, 3384, 4096, 4096, 4096, 1024] + - [23, 12364.0] + - - [4096, 1024, 1, 3356, 4096, 4096, 4096, 1024] + - [1, 12355.0] + - - [1024, 4096, 1, 3796, 1024, 1024, 1024, 4096] + - [17, 12367.0] + - - [4096, 1024, 1, 3427, 4096, 4096, 4096, 1024] + - [17, 12356.0] + - - [4096, 1024, 1, 3390, 4096, 4096, 4096, 1024] + - [35, 12359.0] + - - [4096, 1024, 1, 3573, 4096, 4096, 4096, 1024] + - [17, 12363.0] + - - [4096, 1024, 1, 3456, 4096, 4096, 4096, 1024] + - [4, 12361.0] + - - [1024, 4096, 1, 3360, 1024, 1024, 1024, 4096] + - [17, 12365.0] + - - [1024, 33708, 1, 3977, 1024, 1024, 1024, 33708] + - [17, 12746.0] + - - [1024, 4096, 1, 2918, 1024, 1024, 1024, 4096] + - [15, 12342.0] + - - [4096, 1024, 1, 3975, 4096, 4096, 4096, 1024] + - [17, 12370.0] + - - [4096, 1024, 1, 3525, 4096, 4096, 4096, 1024] + - [17, 12362.0] + - - [4096, 1024, 1, 3398, 4096, 4096, 4096, 1024] + - [17, 12376.0] + - - [4096, 1024, 1, 3640, 4096, 4096, 4096, 1024] + - [17, 12365.0] + - - [4096, 1024, 1, 3014, 4096, 4096, 4096, 1024] + - [23, 12361.0] + - - [1024, 4096, 1, 3446, 1024, 1024, 1024, 4096] + - [15, 12356.0] + - - [1024, 33708, 1, 3796, 1024, 1024, 1024, 33708] + - [17, 12747.0] + - - [4096, 1024, 1, 3101, 4096, 4096, 4096, 1024] + - [4, 12350.0] + - - [4096, 1024, 1, 3563, 4096, 4096, 4096, 1024] + - [17, 12356.0] + - - [4096, 1024, 1, 3539, 4096, 4096, 4096, 1024] + - [29, 12353.0] + - - [4096, 1024, 1, 3182, 4096, 4096, 4096, 1024] + - [17, 12368.0] + - - [1024, 4096, 1, 3468, 1024, 1024, 1024, 4096] + - [17, 12360.0] + - - [4096, 1024, 1, 3312, 4096, 4096, 4096, 1024] + - [17, 12363.0] + - - [4096, 1024, 1, 3215, 4096, 4096, 4096, 1024] + - [17, 12351.0] + - - [4096, 1024, 1, 3910, 4096, 4096, 4096, 1024] + - [17, 12373.0] + - - [1024, 33708, 1, 3780, 1024, 1024, 1024, 33708] + - [17, 12746.0] + - - [1024, 4096, 1, 3290, 1024, 1024, 1024, 4096] + - [15, 12356.0] + - - [1024, 4096, 1, 4012, 1024, 1024, 1024, 4096] + - [4, 12368.0] + - - [1024, 4096, 1, 3385, 1024, 1024, 1024, 4096] + - [17, 12356.0] + - - [1024, 33708, 1, 3975, 1024, 1024, 1024, 33708] + - [17, 12743.0] + - - [4096, 1024, 1, 3996, 4096, 4096, 4096, 1024] + - [17, 12365.0] + - - [4096, 1024, 1, 2765, 4096, 4096, 4096, 1024] + - [15, 12341.0] + - - [4096, 1024, 1, 3538, 4096, 4096, 4096, 1024] + - [23, 12366.0] + - - [4096, 1024, 1, 3415, 4096, 4096, 4096, 1024] + - [17, 12356.0] + - - [1024, 4096, 1, 3554, 1024, 1024, 1024, 4096] + - [17, 12360.0] + - - [4096, 1024, 1, 3513, 4096, 4096, 4096, 1024] + - [17, 12356.0] + - - [1024, 4096, 1, 3304, 1024, 1024, 1024, 4096] + - [15, 12359.0] + - - [4096, 1024, 1, 3294, 4096, 4096, 4096, 1024] + - [17, 12355.0] + - - [4096, 1024, 1, 3396, 4096, 4096, 4096, 1024] + - [4, 12369.0] + - - [1024, 4096, 1, 3213, 1024, 1024, 1024, 4096] + - [17, 12350.0] + - - [4096, 1024, 1, 3137, 4096, 4096, 4096, 1024] + - [17, 12355.0] + - - [4096, 1024, 1, 3552, 4096, 4096, 4096, 1024] + - [23, 12376.0] + - - [1024, 4096, 1, 3461, 1024, 1024, 1024, 4096] + - [17, 12360.0] + - - [4096, 1024, 1, 3263, 4096, 4096, 4096, 1024] + - [4, 12350.0] + - - [4096, 1024, 1, 3430, 4096, 4096, 4096, 1024] + - [17, 12355.0] + - - [4096, 1024, 1, 3389, 4096, 4096, 4096, 1024] + - [1, 12360.0] + - - [4096, 1024, 1, 3528, 4096, 4096, 4096, 1024] + - [4, 12374.0] + - - [1024, 4096, 1, 3463, 1024, 1024, 1024, 4096] + - [4, 12361.0] + - - [4096, 1024, 1, 3526, 4096, 4096, 4096, 1024] + - [17, 12361.0] + - - [4096, 1024, 1, 3154, 4096, 4096, 4096, 1024] + - [17, 12379.0] + - - [4096, 1024, 1, 3499, 4096, 4096, 4096, 1024] + - [17, 12363.0] + - - [4096, 1024, 1, 3955, 4096, 4096, 4096, 1024] + - [17, 12372.0] + - - [1024, 4096, 1, 3297, 1024, 1024, 1024, 4096] + - [17, 12357.0] + - - [1024, 4096, 1, 3233, 1024, 1024, 1024, 4096] + - [29, 12358.0] + - - [1024, 4096, 1, 3226, 1024, 1024, 1024, 4096] + - [17, 12356.0] + - - [4096, 1024, 1, 3404, 4096, 4096, 4096, 1024] + - [17, 12357.0] + - - [4096, 1024, 1, 3355, 4096, 4096, 4096, 1024] + - [1, 12360.0] + - - [1024, 4096, 1, 3542, 1024, 1024, 1024, 4096] + - [4, 12358.0] + - - [4096, 1024, 1, 3181, 4096, 4096, 4096, 1024] + - [17, 12348.0] + - - [1024, 4096, 1, 3474, 1024, 1024, 1024, 4096] + - [17, 12362.0] + - - [4096, 1024, 1, 3319, 4096, 4096, 4096, 1024] + - [17, 12356.0] + - - [1024, 4096, 1, 3434, 1024, 1024, 1024, 4096] + - [15, 12356.0] + - - [1024, 4096, 1, 3860, 1024, 1024, 1024, 4096] + - [4, 12366.0] + - - [1024, 4096, 1, 3343, 1024, 1024, 1024, 4096] + - [15, 12354.0] + - - [1024, 4096, 1, 3488, 1024, 1024, 1024, 4096] + - [4, 12369.0] + - - [1024, 4096, 1, 3046, 1024, 1024, 1024, 4096] + - [17, 12354.0] + - - [1024, 4096, 1, 3141, 1024, 1024, 1024, 4096] + - [4, 12371.0] + - - [1024, 4096, 1, 3516, 1024, 1024, 1024, 4096] + - [29, 12360.0] + - - [4096, 1024, 1, 3147, 4096, 4096, 4096, 1024] + - [15, 12358.0] + - - [1024, 4096, 1, 3421, 1024, 1024, 1024, 4096] + - [15, 12359.0] + - - [4096, 1024, 1, 3944, 4096, 4096, 4096, 1024] + - [15, 12369.0] + - - [1024, 4096, 1, 3574, 1024, 1024, 1024, 4096] + - [15, 12365.0] + - - [1024, 4096, 1, 3977, 1024, 1024, 1024, 4096] + - [4, 12374.0] + - - [1024, 4096, 1, 2985, 1024, 1024, 1024, 4096] + - [17, 12349.0] + - - [1024, 4096, 1, 3427, 1024, 1024, 1024, 4096] + - [17, 12385.0] + - - [1024, 4096, 1, 3482, 1024, 1024, 1024, 4096] + - [23, 12363.0] + - - [1024, 4096, 1, 3332, 1024, 1024, 1024, 4096] + - [17, 12358.0] + - - [4096, 1024, 1, 3308, 4096, 4096, 4096, 1024] + - [17, 12356.0] + - - [1024, 4096, 1, 3513, 1024, 1024, 1024, 4096] + - [17, 12365.0] + - - [1024, 4096, 1, 3154, 1024, 1024, 1024, 4096] + - [4, 12355.0] + - - [1024, 4096, 1, 3955, 1024, 1024, 1024, 4096] + - [4, 12378.0] + - - [1024, 4096, 1, 2967, 1024, 1024, 1024, 4096] + - [17, 12350.0] + - - [1024, 33708, 1, 3942, 1024, 1024, 1024, 33708] + - [4, 12749.0] + - - [1024, 4096, 1, 3319, 1024, 1024, 1024, 4096] + - [17, 12354.0] + - - [4096, 1024, 1, 3860, 4096, 4096, 4096, 1024] + - [4, 12364.0] + - - [1024, 4096, 1, 3548, 1024, 1024, 1024, 4096] + - [17, 12370.0] + - - [4096, 1024, 1, 3977, 4096, 4096, 4096, 1024] + - [4, 12361.0] + - - [4096, 1024, 1, 3535, 4096, 4096, 4096, 1024] + - [17, 12363.0] + - - [1024, 4096, 1, 3541, 1024, 1024, 1024, 4096] + - [17, 12363.0] + - - [1024, 33708, 1, 3584, 1024, 1024, 1024, 33708] + - [29, 12749.0] + - - [1024, 4096, 1, 3168, 1024, 1024, 1024, 4096] + - [17, 12365.0] + - - [1024, 4096, 1, 3448, 1024, 1024, 1024, 4096] + - [17, 12381.0] + - - [4096, 1024, 1, 3343, 4096, 4096, 4096, 1024] + - [17, 12354.0] + - - [1024, 4096, 1, 3357, 1024, 1024, 1024, 4096] + - [17, 12357.0] + - - [4096, 1024, 1, 3510, 4096, 4096, 4096, 1024] + - [17, 12362.0] + - - [4096, 1024, 1, 3369, 4096, 4096, 4096, 1024] + - [17, 12355.0] + - - [4096, 1024, 1, 3379, 4096, 4096, 4096, 1024] + - [17, 12359.0] + - - [1024, 4096, 1, 3276, 1024, 1024, 1024, 4096] + - [17, 12358.0] + - - [1024, 4096, 1, 3363, 1024, 1024, 1024, 4096] + - [4, 12373.0] + - - [4096, 1024, 1, 3055, 4096, 4096, 4096, 1024] + - [15, 12352.0] + - - [1024, 4096, 1, 3524, 1024, 1024, 1024, 4096] + - [4, 12360.0] + - - [4096, 1024, 1, 3057, 4096, 4096, 4096, 1024] + - [17, 12351.0] + - - [1024, 33708, 1, 3720, 1024, 1024, 1024, 33708] + - [17, 12747.0] + - - [1024, 4096, 1, 3383, 1024, 1024, 1024, 4096] + - [17, 12356.0] + - - [1024, 4096, 1, 3522, 1024, 1024, 1024, 4096] + - [15, 12368.0] + - - [1024, 33708, 1, 3956, 1024, 1024, 1024, 33708] + - [29, 12753.0] + - - [1024, 4096, 1, 3481, 1024, 1024, 1024, 4096] + - [15, 12358.0] + - - [4096, 1024, 1, 3562, 4096, 4096, 4096, 1024] + - [17, 12362.0] + - - [4096, 1024, 1, 3299, 4096, 4096, 4096, 1024] + - [17, 12364.0] + - - [1024, 4096, 1, 3262, 1024, 1024, 1024, 4096] + - [15, 12352.0] + - - [1024, 33708, 1, 4026, 1024, 1024, 1024, 33708] + - [17, 12748.0] + - - [4096, 1024, 1, 3168, 4096, 4096, 4096, 1024] + - [17, 12359.0] + - - [1024, 4096, 1, 3999, 1024, 1024, 1024, 4096] + - [17, 12373.0] + - - [1024, 4096, 1, 3549, 1024, 1024, 1024, 4096] + - [17, 12363.0] + - - [4096, 1024, 1, 3375, 4096, 4096, 4096, 1024] + - [4, 12353.0] + - - [1024, 4096, 1, 3496, 1024, 1024, 1024, 4096] + - [4, 12362.0] + - - [1024, 4096, 1, 3190, 1024, 1024, 1024, 4096] + - [17, 12354.0] + - - [4096, 1024, 1, 3273, 4096, 4096, 4096, 1024] + - [17, 12357.0] + - - [1024, 4096, 1, 3406, 1024, 1024, 1024, 4096] + - [15, 12358.0] + - - [4096, 1024, 1, 4005, 4096, 4096, 4096, 1024] + - [17, 12373.0] + - - [4096, 1024, 1, 3555, 4096, 4096, 4096, 1024] + - [17, 12362.0] + - - [4096, 1024, 1, 2505, 4096, 4096, 4096, 1024] + - [17, 12363.0] + - - [1024, 4096, 1, 3460, 1024, 1024, 1024, 4096] + - [17, 12365.0] + - - [1024, 4096, 1, 3579, 1024, 1024, 1024, 4096] + - [29, 12378.0] + - - [1024, 33708, 1, 4030, 1024, 1024, 1024, 33708] + - [17, 12747.0] + - - [1024, 4096, 1, 3510, 1024, 1024, 1024, 4096] + - [4, 12361.0] + - - [1024, 4096, 1, 3282, 1024, 1024, 1024, 4096] + - [29, 12367.0] + - - [1024, 4096, 1, 3377, 1024, 1024, 1024, 4096] + - [17, 12361.0] + - - [1024, 4096, 1, 2935, 1024, 1024, 1024, 4096] + - [4, 12349.0] + - - [1024, 4096, 1, 3498, 1024, 1024, 1024, 4096] + - [4, 12376.0] + - - [1024, 4096, 1, 3593, 1024, 1024, 1024, 4096] + - [11, 12369.0] + - - [4096, 1024, 1, 3226, 4096, 4096, 4096, 1024] + - [4, 12352.0] + - - [1024, 4096, 1, 2499, 1024, 1024, 1024, 4096] + - [17, 12347.0] + - - [1024, 4096, 1, 3296, 1024, 1024, 1024, 4096] + - [15, 12362.0] + - - [1024, 4096, 1, 3455, 1024, 1024, 1024, 4096] + - [4, 12358.0] + - - [1024, 4096, 1, 3399, 1024, 1024, 1024, 4096] + - [17, 12362.0] + - - [1024, 4096, 1, 3205, 1024, 1024, 1024, 4096] + - [4, 12369.0] + - - [4096, 1024, 1, 4026, 4096, 4096, 4096, 1024] + - [17, 12369.0] + - - [1024, 4096, 1, 3484, 1024, 1024, 1024, 4096] + - [17, 12364.0] + - - [4096, 1024, 1, 3302, 4096, 4096, 4096, 1024] + - [17, 12362.0] + - - [1024, 4096, 1, 3485, 1024, 1024, 1024, 4096] + - [17, 12369.0] + - - [1024, 4096, 1, 3126, 1024, 1024, 1024, 4096] + - [23, 12353.0] + - - [1024, 4096, 1, 4050, 1024, 1024, 1024, 4096] + - [17, 12373.0] + - - [4096, 1024, 1, 3235, 4096, 4096, 4096, 1024] + - [17, 12356.0] + - - [1024, 33708, 1, 3955, 1024, 1024, 1024, 33708] + - [17, 12749.0] + - - [1024, 4096, 1, 3342, 1024, 1024, 1024, 4096] + - [15, 12355.0] + - - [1024, 4096, 1, 3397, 1024, 1024, 1024, 4096] + - [1, 12364.0] + - - [4096, 1024, 1, 3491, 4096, 4096, 4096, 1024] + - [1, 12355.0] + - - [1024, 4096, 1, 3503, 1024, 1024, 1024, 4096] + - [17, 12357.0] + - - [1024, 4096, 1, 3140, 1024, 1024, 1024, 4096] + - [4, 12364.0] + - - [4096, 1024, 1, 3121, 4096, 4096, 4096, 1024] + - [17, 12350.0] + - - [4096, 1024, 1, 3276, 4096, 4096, 4096, 1024] + - [17, 12356.0] + - - [1024, 4096, 1, 3321, 1024, 1024, 1024, 4096] + - [17, 12354.0] + - - [1024, 4096, 1, 3870, 1024, 1024, 1024, 4096] + - [4, 12363.0] + - - [4096, 1024, 1, 3475, 4096, 4096, 4096, 1024] + - [17, 12360.0] + - - [1024, 4096, 1, 2984, 1024, 1024, 1024, 4096] + - [15, 12355.0] + - - [4096, 1024, 1, 3363, 4096, 4096, 4096, 1024] + - [17, 12355.0] + - - [1024, 4096, 1, 3582, 1024, 1024, 1024, 4096] + - [17, 12363.0] + - - [4096, 1024, 1, 3509, 4096, 4096, 4096, 1024] + - [17, 12357.0] + - - [1024, 4096, 1, 3426, 1024, 1024, 1024, 4096] + - [4, 12357.0] + - - [4096, 1024, 1, 3136, 4096, 4096, 4096, 1024] + - [17, 12361.0] + - - [1024, 4096, 1, 3232, 1024, 1024, 1024, 4096] + - [17, 12361.0] + - - [4096, 1024, 1, 3103, 4096, 4096, 4096, 1024] + - [29, 12347.0] + - - [1024, 4096, 1, 3335, 1024, 1024, 1024, 4096] + - [17, 12356.0] + - - [1024, 4096, 1, 3900, 1024, 1024, 1024, 4096] + - [17, 12368.0] + - - [4096, 1024, 1, 3512, 4096, 4096, 4096, 1024] + - [17, 12362.0] + - - [4096, 1024, 1, 3222, 4096, 4096, 4096, 1024] + - [17, 12356.0] + - - [1024, 4096, 1, 3165, 1024, 1024, 1024, 4096] + - [4, 12360.0] + - - [4096, 1024, 1, 3408, 4096, 4096, 4096, 1024] + - [17, 12364.0] + - - [4096, 1024, 1, 3751, 4096, 4096, 4096, 1024] + - [1, 12360.0] + - - [1024, 4096, 1, 3318, 1024, 1024, 1024, 4096] + - [4, 12354.0] + - - [4096, 1024, 1, 3442, 4096, 4096, 4096, 1024] + - [29, 12357.0] + - - [1024, 4096, 1, 3413, 1024, 1024, 1024, 4096] + - [17, 12355.0] + - - [4096, 1024, 1, 3524, 4096, 4096, 4096, 1024] + - [17, 12369.0] + - - [1024, 4096, 1, 3976, 1024, 1024, 1024, 4096] + - [17, 12368.0] + - - [1024, 4096, 1, 3475, 1024, 1024, 1024, 4096] + - [17, 12360.0] + - - [1024, 4096, 1, 3534, 1024, 1024, 1024, 4096] + - [15, 12364.0] + - - [4096, 1024, 1, 3301, 4096, 4096, 4096, 1024] + - [17, 12358.0] + - - [4096, 1024, 1, 3248, 4096, 4096, 4096, 1024] + - [1, 12366.0] + - - [1024, 4096, 1, 2977, 1024, 1024, 1024, 4096] + - [4, 12354.0] + - - [4096, 1024, 1, 3346, 4096, 4096, 4096, 1024] + - [17, 12365.0] + - - [1024, 4096, 1, 3451, 1024, 1024, 1024, 4096] + - [15, 12360.0] + - - [1024, 4096, 1, 3257, 1024, 1024, 1024, 4096] + - [17, 12355.0] + - - [1024, 4096, 1, 3356, 1024, 1024, 1024, 4096] + - [29, 12363.0] + - - [4096, 1024, 1, 3348, 4096, 4096, 4096, 1024] + - [29, 12357.0] + - - [4096, 1024, 1, 3335, 4096, 4096, 4096, 1024] + - [17, 12356.0] + - - [4096, 1024, 1, 3505, 4096, 4096, 4096, 1024] + - [17, 12361.0] + - - [1024, 4096, 1, 3490, 1024, 1024, 1024, 4096] + - [17, 12364.0] + - - [4096, 1024, 1, 3447, 4096, 4096, 4096, 1024] + - [17, 12360.0] + - - [1024, 4096, 1, 3267, 1024, 1024, 1024, 4096] + - [15, 12361.0] + - - [4096, 1024, 1, 3230, 4096, 4096, 4096, 1024] + - [29, 12352.0] + - - [4096, 1024, 1, 3455, 4096, 4096, 4096, 1024] + - [23, 12369.0] + - - [1024, 4096, 1, 3925, 1024, 1024, 1024, 4096] + - [4, 12370.0] + - - [1024, 4096, 1, 3362, 1024, 1024, 1024, 4096] + - [17, 12358.0] + - - [4096, 1024, 1, 3969, 4096, 4096, 4096, 1024] + - [4, 12372.0] + - - [4096, 1024, 1, 3527, 4096, 4096, 4096, 1024] + - [17, 12363.0] + - - [1024, 4096, 1, 3585, 1024, 1024, 1024, 4096] + - [17, 12364.0] + - - [4096, 1024, 1, 3063, 4096, 4096, 4096, 1024] + - [15, 12347.0] + - - [4096, 1024, 1, 3435, 4096, 4096, 4096, 1024] + - [17, 12362.0] + - - [4096, 1024, 1, 3366, 4096, 4096, 4096, 1024] + - [4, 12368.0] + - - [4096, 1024, 1, 3581, 4096, 4096, 4096, 1024] + - [23, 12355.0] + - - [1024, 33708, 1, 3906, 1024, 1024, 1024, 33708] + - [4, 12750.0] + - - [1024, 4096, 1, 3464, 1024, 1024, 1024, 4096] + - [15, 12358.0] + - - [1024, 4096, 1, 3440, 1024, 1024, 1024, 4096] + - [4, 12369.0] + - - [4096, 1024, 1, 3143, 4096, 4096, 4096, 1024] + - [4, 12354.0] + - - [1024, 4096, 1, 3349, 1024, 1024, 1024, 4096] + - [17, 12359.0] + - - [4096, 1024, 1, 3416, 4096, 4096, 4096, 1024] + - [15, 12360.0] + - - [4096, 1024, 1, 3365, 4096, 4096, 4096, 1024] + - [17, 12363.0] + - - [1024, 4096, 1, 3470, 1024, 1024, 1024, 4096] + - [15, 12361.0] + - - [4096, 1024, 1, 3287, 4096, 4096, 4096, 1024] + - [17, 12358.0] + - - [1024, 4096, 1, 3441, 1024, 1024, 1024, 4096] + - [17, 12366.0] + - - [4096, 1024, 1, 3224, 4096, 4096, 4096, 1024] + - [15, 12357.0] + - - [1024, 4096, 1, 3387, 1024, 1024, 1024, 4096] + - [17, 12361.0] + - - [1024, 4096, 1, 3547, 1024, 1024, 1024, 4096] + - [4, 12363.0] + - - [4096, 1024, 1, 3478, 4096, 4096, 4096, 1024] + - [17, 12362.0] + - - [4096, 1024, 1, 3548, 4096, 4096, 4096, 1024] + - [17, 12364.0] + - - [1024, 33708, 1, 4020, 1024, 1024, 1024, 33708] + - [17, 12750.0] + - - [4096, 1024, 1, 3320, 4096, 4096, 4096, 1024] + - [17, 12359.0] + - - [1024, 4096, 1, 3906, 1024, 1024, 1024, 4096] + - [17, 12378.0] + - - [4096, 1024, 1, 3796, 4096, 4096, 4096, 1024] + - [17, 12373.0] + - - [1024, 4096, 1, 3306, 1024, 1024, 1024, 4096] + - [17, 12356.0] + - - [1024, 4096, 1, 3401, 1024, 1024, 1024, 4096] + - [4, 12363.0] + - - [1024, 4096, 1, 3215, 1024, 1024, 1024, 4096] + - [4, 12358.0] + - - [4096, 1024, 1, 4012, 4096, 4096, 4096, 1024] + - [17, 12372.0] + - - [1024, 4096, 1, 2765, 1024, 1024, 1024, 4096] + - [15, 12340.0] + - - [4096, 1024, 1, 3554, 4096, 4096, 4096, 1024] + - [17, 12360.0] + - - [4096, 1024, 1, 3423, 4096, 4096, 4096, 1024] + - [15, 12362.0] + - - [1024, 4096, 1, 3562, 1024, 1024, 1024, 4096] + - [17, 12362.0] + - - [1024, 4096, 1, 3489, 1024, 1024, 1024, 4096] + - [17, 12361.0] + - - [4096, 1024, 1, 3358, 4096, 4096, 4096, 1024] + - [17, 12363.0] + - - [4096, 1024, 1, 3270, 4096, 4096, 4096, 1024] + - [29, 12378.0] + - - [1024, 4096, 1, 3293, 1024, 1024, 1024, 4096] + - [15, 12356.0] + - - [1024, 4096, 1, 3376, 1024, 1024, 1024, 4096] + - [15, 12364.0] + - - [4096, 1024, 1, 3245, 4096, 4096, 4096, 1024] + - [17, 12357.0] + - - [4096, 1024, 1, 3541, 4096, 4096, 4096, 1024] + - [17, 12363.0] + - - [4096, 1024, 1, 3443, 4096, 4096, 4096, 1024] + - [17, 12366.0] + - - [4096, 1024, 1, 3438, 4096, 4096, 4096, 1024] + - [15, 12361.0] + - - [4096, 1024, 1, 3244, 4096, 4096, 4096, 1024] + - [17, 12355.0] + - - [1024, 4096, 1, 3365, 1024, 1024, 1024, 4096] + - [17, 12362.0] + - - [1024, 4096, 1, 3299, 1024, 1024, 1024, 4096] + - [29, 12360.0] + - - [1024, 4096, 1, 3471, 1024, 1024, 1024, 4096] + - [15, 12358.0] + - - [1024, 4096, 1, 3398, 1024, 1024, 1024, 4096] + - [17, 12361.0] + - - [4096, 1024, 1, 3162, 4096, 4096, 4096, 1024] + - [17, 12351.0] + - - [1024, 4096, 1, 4005, 1024, 1024, 1024, 4096] + - [4, 12376.0] + - - [4096, 1024, 1, 3579, 4096, 4096, 4096, 1024] + - [17, 12365.0] + - - [1024, 4096, 1, 3121, 1024, 1024, 1024, 4096] + - [15, 12350.0] + - - [4096, 1024, 1, 3441, 4096, 4096, 4096, 1024] + - [17, 12365.0] + - - [4096, 1024, 1, 3422, 4096, 4096, 4096, 1024] + - [17, 12363.0] + - - [4096, 1024, 1, 3444, 4096, 4096, 4096, 1024] + - [23, 12368.0] + - - [1024, 4096, 1, 3337, 1024, 1024, 1024, 4096] + - [17, 12359.0] + - - [4096, 1024, 1, 3550, 4096, 4096, 4096, 1024] + - [17, 12362.0] + - - [1024, 4096, 1, 3477, 1024, 1024, 1024, 4096] + - [17, 12366.0] + - - [4096, 1024, 1, 3490, 4096, 4096, 4096, 1024] + - [17, 12365.0] + - - [4096, 1024, 1, 3585, 4096, 4096, 4096, 1024] + - [17, 12369.0] + - - [1024, 4096, 1, 3143, 1024, 1024, 1024, 4096] + - [17, 12355.0] + - - [1024, 33708, 1, 3876, 1024, 1024, 1024, 33708] + - [29, 12750.0] + - - [1024, 4096, 1, 3320, 1024, 1024, 1024, 4096] + - [15, 12364.0] + - - [1024, 4096, 1, 3423, 1024, 1024, 1024, 4096] + - [17, 12376.0] + - - [1024, 4096, 1, 3894, 1024, 1024, 1024, 4096] + - [15, 12368.0] + - - [4096, 1024, 1, 3410, 4096, 4096, 4096, 1024] + - [17, 12353.0] + - - [1024, 4096, 1, 3561, 1024, 1024, 1024, 4096] + - [17, 12365.0] + - - [4096, 1024, 1, 3492, 4096, 4096, 4096, 1024] + - [29, 12356.0] + - - [36548, 1024, 1, 3712, 36548, 36548, 36548, 1024] + - [17, 12741.0] + - - [4096, 2048, 1, 128, 4096, 4096, 4096, 2048] + - [0, 11541.0] + - - [4096, 3072, 1, 128, 4096, 4096, 4096, 3072] + - [11, 11855.0] + - - [768, 3072, 1, 4096, 768, 768, 768, 3072] + - [3, 11512.0] + - - [768, 30522, 1, 1280, 768, 768, 768, 30522] + - [17, 12674.0] + - - [768, 30522, 1, 320, 768, 768, 768, 30522] + - [17, 12439.0] + - - [768, 30522, 1, 640, 768, 768, 768, 30522] + - [29, 12609.0] + - - [256, 512, 36, 98, 256, 256, 256, 512] + - [14, 10968.0] + - - [256, 256, 64, 56, 256, 256, 256, 256] + - [0, 10203.0] + - - [512, 486, 36, 800, 512, 512, 512, 486] + - [1, 11764.0] + - - [512, 512, 36, 1568, 512, 512, 512, 512] + - [29, 12600.0] + - - [256, 384, 36, 4096, 256, 256, 256, 384] + - [29, 12279.0] + - - [128, 256, 64, 32, 128, 128, 128, 256] + - [14, 5202.0] + - - [128, 256, 64, 9, 128, 128, 128, 256] + - [18, 1774.0] + - - [256, 512, 36, 784, 256, 256, 256, 512] + - [4, 11907.0] + - - [256, 324, 36, 32, 256, 256, 256, 324] + - [0, 6253.0] + - - [512, 512, 36, 33, 512, 512, 512, 512] + - [0, 7732.0] + - - [192, 384, 64, 128, 192, 192, 192, 384] + - [0, 8379.0] + - - [512, 512, 64, 72, 512, 512, 512, 512] + - [14, 11617.0] + - - [512, 512, 36, 128, 512, 512, 512, 512] + - [18, 11792.0] + - - [192, 384, 64, 2304, 192, 192, 192, 384] + - [29, 9083.0] + - - [384, 256, 64, 450, 384, 384, 384, 256] + - [29, 11787.0] + - - [384, 256, 64, 2304, 384, 384, 384, 256] + - [17, 12148.0] + - - [512, 512, 64, 144, 512, 512, 512, 512] + - [18, 11913.0] + - - [256, 256, 36, 6272, 256, 256, 256, 256] + - [3, 11519.0] + - - [256, 384, 64, 2304, 256, 256, 256, 384] + - [4, 12149.0] + - - [512, 512, 36, 66, 512, 512, 512, 512] + - [0, 10731.0] + - - [128, 256, 64, 800, 128, 128, 128, 256] + - [28, 11457.0] + - - [192, 256, 36, 512, 192, 192, 192, 256] + - [14, 8415.0] + - - [256, 512, 64, 200, 256, 256, 256, 512] + - [15, 11975.0] + - - [256, 512, 64, 25, 256, 256, 256, 512] + - [20, 5650.0] + - - [128, 256, 36, 1568, 128, 128, 128, 256] + - [30, 10822.0] + - - [128, 256, 64, 288, 128, 128, 128, 256] + - [25, 11256.0] + - - [256, 384, 64, 1152, 256, 256, 256, 384] + - [4, 12070.0] + - - [160, 320, 64, 288, 160, 160, 160, 320] + - [25, 7168.0] + - - [128, 256, 36, 128, 128, 128, 128, 256] + - [5, 9508.0] + - - [512, 512, 36, 16, 512, 512, 512, 512] + - [8, 3821.0] + - - [384, 256, 36, 800, 384, 384, 384, 256] + - [15, 12055.0] + - - [192, 384, 36, 4096, 192, 192, 192, 384] + - [17, 9173.0] + - - [256, 384, 64, 576, 256, 256, 256, 384] + - [17, 11897.0] + - - [512, 512, 64, 14, 512, 512, 512, 512] + - [0, 3928.0] + - - [512, 512, 36, 8, 512, 512, 512, 512] + - [0, 1989.0] + - - [512, 486, 64, 128, 512, 512, 512, 486] + - [8, 11144.0] + - - [256, 256, 36, 128, 256, 256, 256, 256] + - [25, 10724.0] + - - [256, 256, 36, 32, 256, 256, 256, 256] + - [14, 6198.0] + - - [192, 256, 64, 288, 192, 192, 192, 256] + - [5, 8668.0] + - - [256, 256, 36, 16, 256, 256, 256, 256] + - [3, 3288.0] + - - [128, 256, 36, 3200, 128, 128, 128, 256] + - [30, 10882.0] + - - [160, 320, 64, 512, 160, 160, 160, 320] + - [14, 7437.0] + - - [160, 320, 36, 512, 160, 160, 160, 320] + - [0, 7332.0] + - - [256, 512, 36, 4, 256, 256, 256, 512] + - [27, 953.0] + - - [256, 324, 64, 1568, 256, 256, 256, 324] + - [29, 10182.0] + - - [256, 256, 36, 3200, 256, 256, 256, 256] + - [3, 11474.0] + - - [256, 256, 36, 210, 256, 256, 256, 256] + - [14, 10794.0] + - - [192, 384, 64, 576, 192, 192, 192, 384] + - [1, 8883.0] + - - [512, 512, 64, 800, 512, 512, 512, 512] + - [17, 12522.0] + - - [256, 256, 64, 1152, 256, 256, 256, 256] + - [29, 12180.0] + - - [512, 486, 64, 512, 512, 512, 512, 486] + - [1, 11719.0] + - - [256, 512, 64, 1600, 256, 256, 256, 512] + - [17, 12475.0] + - - [512, 512, 64, 9, 512, 512, 512, 512] + - [0, 2489.0] + - - [256, 512, 36, 1568, 256, 256, 256, 512] + - [29, 12029.0] + - - [128, 256, 64, 3200, 128, 128, 128, 256] + - [28, 11672.0] + - - [256, 512, 64, 4, 256, 256, 256, 512] + - [0, 1084.0] + - - [256, 256, 64, 450, 256, 256, 256, 256] + - [9, 11908.0] + - - [256, 256, 64, 72, 256, 256, 256, 256] + - [25, 10824.0] + - - [128, 256, 36, 3136, 128, 128, 128, 256] + - [36, 10874.0] + - - [160, 320, 64, 242, 160, 160, 160, 320] + - [0, 7097.0] + - - [512, 512, 36, 512, 512, 512, 512, 512] + - [4, 12407.0] + - - [512, 512, 36, 256, 512, 512, 512, 512] + - [15, 12161.0] + - - [512, 512, 36, 1024, 512, 512, 512, 512] + - [4, 12552.0] + - - [256, 256, 36, 4096, 256, 256, 256, 256] + - [3, 11497.0] + - - [256, 256, 64, 896, 256, 256, 256, 256] + - [4, 12106.0] + - - [128, 256, 64, 242, 128, 128, 128, 256] + - [14, 10999.0] + - - [192, 384, 36, 1024, 192, 192, 192, 384] + - [18, 9007.0] + - - [128, 256, 64, 100, 128, 128, 128, 256] + - [31, 10073.0] + - - [384, 256, 64, 1152, 384, 384, 384, 256] + - [17, 12072.0] + - - [192, 384, 36, 128, 192, 192, 192, 384] + - [0, 7778.0] + - - [128, 256, 64, 1568, 128, 128, 128, 256] + - [3, 11549.0] + - - [128, 256, 64, 72, 128, 128, 128, 256] + - [25, 9298.0] + - - [256, 256, 36, 12544, 256, 256, 256, 256] + - [28, 11535.0] + - - [256, 256, 36, 105, 256, 256, 256, 256] + - [25, 10128.0] + - - [128, 256, 36, 392, 128, 128, 128, 256] + - [5, 10331.0] + - - [384, 256, 36, 1024, 384, 384, 384, 256] + - [17, 12053.0] + - - [128, 256, 64, 1152, 128, 128, 128, 256] + - [28, 11490.0] + - - [256, 324, 64, 32, 256, 256, 256, 324] + - [0, 7119.0] + - - [256, 384, 36, 800, 256, 256, 256, 384] + - [15, 12075.0] + - - [512, 512, 64, 4, 512, 512, 512, 512] + - [0, 1058.0] + - - [192, 320, 36, 128, 192, 192, 192, 320] + - [8, 7693.0] + - - [192, 384, 64, 242, 192, 192, 192, 384] + - [25, 8684.0] + - - [256, 486, 64, 32, 256, 256, 256, 486] + - [0, 7194.0] + - - [512, 512, 64, 64, 512, 512, 512, 512] + - [0, 11467.0] + - - [128, 256, 36, 512, 128, 128, 128, 256] + - [18, 10537.0] + - - [512, 512, 64, 576, 512, 512, 512, 512] + - [17, 12470.0] + - - [256, 256, 64, 9, 256, 256, 256, 256] + - [0, 2226.0] + - - [128, 256, 36, 12544, 128, 128, 128, 256] + - [6, 11059.0] + - - [256, 512, 36, 3136, 256, 256, 256, 512] + - [17, 12103.0] + - - [144, 288, 36, 512, 144, 144, 144, 288] + - [25, 5909.0] + - - [384, 384, 36, 800, 384, 384, 384, 384] + - [4, 11852.0] + - - [512, 512, 64, 1600, 512, 512, 512, 512] + - [29, 12580.0] + - - [512, 512, 36, 4, 512, 512, 512, 512] + - [0, 1050.0] + - - [192, 384, 64, 450, 192, 192, 192, 384] + - [9, 8800.0] + - - [256, 256, 36, 1024, 256, 256, 256, 256] + - [28, 11316.0] + - - [256, 512, 64, 400, 256, 256, 256, 512] + - [1, 12164.0] + - - [128, 256, 36, 6272, 128, 128, 128, 256] + - [6, 10996.0] + - - [256, 256, 36, 512, 256, 256, 256, 256] + - [25, 11195.0] + - - [256, 256, 64, 112, 256, 256, 256, 256] + - [25, 11265.0] + - - [512, 512, 64, 18, 512, 512, 512, 512] + - [1, 4800.0] + - - [256, 256, 64, 18, 256, 256, 256, 256] + - [0, 3888.0] + - - [256, 256, 64, 1568, 256, 256, 256, 256] + - [4, 12248.0] + - - [384, 256, 36, 4096, 384, 384, 384, 256] + - [29, 12282.0] + - - [256, 512, 64, 800, 256, 256, 256, 512] + - [4, 12376.0] + - - [256, 384, 36, 2048, 256, 256, 256, 384] + - [17, 12193.0] + - - [384, 384, 64, 2304, 384, 384, 384, 384] + - [17, 12628.0] + - - [160, 320, 64, 128, 160, 160, 160, 320] + - [20, 6831.0] + - - [512, 512, 36, 528, 512, 512, 512, 512] + - [29, 12401.0] + - - [160, 320, 36, 128, 160, 160, 160, 320] + - [25, 6514.0] + - - [256, 512, 36, 49, 256, 256, 256, 512] + - [25, 8990.0] + - - [384, 384, 64, 450, 384, 384, 384, 384] + - [29, 12307.0] + - - [256, 256, 64, 3200, 256, 256, 256, 256] + - [4, 12353.0] + - - [512, 512, 64, 8, 512, 512, 512, 512] + - [0, 2170.0] + - - [512, 512, 64, 288, 512, 512, 512, 512] + - [29, 12287.0] + - - [384, 384, 36, 1024, 384, 384, 384, 384] + - [29, 11903.0] + - - [128, 256, 36, 16, 128, 128, 128, 256] + - [2, 2483.0] + - - [256, 256, 64, 288, 256, 256, 256, 256] + - [1, 11820.0] + - - [256, 384, 36, 1024, 256, 256, 256, 384] + - [4, 12073.0] + - - [256, 324, 36, 3200, 256, 256, 256, 324] + - [29, 10303.0] + - - [192, 384, 64, 512, 192, 192, 192, 384] + - [26, 8844.0] + - - [128, 256, 64, 1600, 128, 128, 128, 256] + - [3, 11552.0] + - - [512, 512, 36, 32, 512, 512, 512, 512] + - [0, 8166.0] + - - [512, 512, 36, 3136, 512, 512, 512, 512] + - [4, 12644.0] + - - [128, 256, 64, 6400, 128, 128, 128, 256] + - [28, 11758.0] + - - [256, 256, 36, 2048, 256, 256, 256, 256] + - [16, 11463.0] + - - [256, 256, 64, 6400, 256, 256, 256, 256] + - [17, 12380.0] + - - [256, 256, 36, 1680, 256, 256, 256, 256] + - [28, 11415.0] + - - [192, 384, 36, 2048, 192, 192, 192, 384] + - [29, 9096.0] + - - [256, 256, 64, 144, 256, 256, 256, 256] + - [14, 11260.0] + - - [384, 384, 36, 4096, 384, 384, 384, 384] + - [4, 12061.0] + - - [160, 320, 64, 1152, 160, 160, 160, 320] + - [3, 7501.0] + - - [384, 256, 36, 2048, 384, 384, 384, 256] + - [4, 12197.0] + - - [256, 512, 36, 392, 256, 256, 256, 512] + - [14, 11765.0] + - - [256, 512, 64, 50, 256, 256, 256, 512] + - [24, 9629.0] + - - [384, 384, 36, 2048, 384, 384, 384, 384] + - [29, 12018.0] + - - [256, 384, 64, 450, 256, 256, 256, 384] + - [1, 11782.0] + - - [192, 320, 64, 128, 192, 192, 192, 320] + - [25, 8240.0] + - - [128, 256, 36, 32, 128, 128, 128, 256] + - [27, 5272.0] + - - [512, 512, 64, 256, 512, 512, 512, 512] + - [4, 12251.0] + - - [256, 512, 64, 32, 256, 256, 256, 512] + - [0, 7781.0] + - - [384, 384, 64, 576, 384, 384, 384, 384] + - [17, 12430.0] + - - [512, 486, 36, 288, 512, 512, 512, 486] + - [30, 11294.0] + - - [144, 288, 64, 242, 144, 144, 144, 288] + - [31, 5755.0] + - - [384, 256, 64, 576, 384, 384, 384, 256] + - [29, 11904.0] + - - [512, 512, 36, 64, 512, 512, 512, 512] + - [14, 11098.0] + - - [448, 384, 64, 128, 448, 448, 448, 384] + - [20, 10180.0] + - - [144, 288, 64, 288, 144, 144, 144, 288] + - [3, 5862.0] + - - [512, 512, 64, 224, 512, 512, 512, 512] + - [1, 12202.0] + - - [384, 384, 64, 1152, 384, 384, 384, 384] + - [4, 12560.0] + - - [448, 384, 36, 128, 448, 448, 448, 384] + - [31, 9867.0] + - - [256, 486, 36, 128, 256, 256, 256, 486] + - [14, 10692.0] + - - [256, 256, 36, 800, 256, 256, 256, 256] + - [14, 11337.0] + - - [192, 384, 36, 800, 192, 192, 192, 384] + - [36, 8935.0] + - - [256, 256, 36, 256, 256, 256, 256, 256] + - [25, 11066.0] + - - [192, 384, 64, 1152, 192, 192, 192, 384] + - [17, 9005.0] + - - [128, 256, 64, 200, 128, 128, 128, 256] + - [25, 10951.0] + - - [512, 512, 64, 28, 512, 512, 512, 512] + - [1, 7144.0] + - - [144, 288, 64, 1152, 144, 144, 144, 288] + - [16, 6074.0] + - - [256, 256, 64, 576, 256, 256, 256, 256] + - [18, 11978.0] + - - [256, 256, 64, 2304, 256, 256, 256, 256] + - [4, 12304.0] + - - [192, 384, 36, 512, 192, 192, 192, 384] + - [30, 8824.0] + - - [256, 512, 36, 32, 256, 256, 256, 512] + - [20, 6693.0] + - - [512, 512, 64, 128, 512, 512, 512, 512] + - [4, 11942.0] + - - [512, 512, 64, 32, 512, 512, 512, 512] + - [0, 8996.0] + - - [128, 256, 36, 196, 128, 128, 128, 256] + - [5, 9780.0] + - - [196, 528, 32, 32, 196, 196, 196, 528] + - [8, 3984.0] + - - [196, 512, 32, 24, 196, 196, 196, 512] + - [14, 3339.0] + - - [1225, 192, 32, 32, 1225, 1225, 1225, 192] + - [25, 8044.0] + - - [1001, 1536, 1, 32, 1001, 1001, 1001, 1536] + - [25, 5360.0] + - - [196, 480, 32, 64, 196, 196, 196, 480] + - [31, 5928.0] + - - [289, 1024, 32, 384, 289, 289, 289, 1024] + - [26, 9157.0] + - - [784, 192, 32, 96, 784, 784, 784, 192] + - [14, 9814.0] + - - [50176, 256, 1, 128, 50176, 50176, 50176, 256] + - [30, 11800.0] + - - [289, 1024, 32, 256, 289, 289, 289, 1024] + - [15, 9049.0] + - - [289, 1024, 32, 192, 289, 289, 289, 1024] + - [12, 8907.0] + - - [12544, 512, 1, 256, 12544, 12544, 12544, 512] + - [26, 11940.0] + - - [1225, 1728, 1, 192, 1225, 1225, 1225, 1728] + - [8, 10258.0] + - - [196, 480, 32, 96, 196, 196, 196, 480] + - [0, 6918.0] + - - [196, 512, 32, 144, 196, 196, 196, 512] + - [31, 8205.0] + - - [289, 768, 32, 128, 289, 289, 289, 768] + - [18, 8648.0] + - - [5329, 576, 1, 96, 5329, 5329, 5329, 576] + - [14, 10615.0] + - - [196, 528, 32, 128, 196, 196, 196, 528] + - [14, 7481.0] + - - [5329, 448, 1, 64, 5329, 5329, 5329, 448] + - [0, 8967.0] + - - [784, 256, 32, 64, 784, 784, 784, 256] + - [14, 9617.0] + - - [784, 192, 32, 32, 784, 784, 784, 192] + - [20, 7638.0] + - - [21609, 288, 1, 32, 21609, 21609, 21609, 288] + - [31, 7398.0] + - - [784, 256, 32, 32, 784, 784, 784, 256] + - [14, 8003.0] + - - [5041, 720, 1, 192, 5041, 5041, 5041, 720] + - [0, 10552.0] + - - [196, 512, 32, 128, 196, 196, 196, 512] + - [31, 8063.0] + - - [289, 768, 32, 160, 289, 289, 289, 768] + - [30, 8823.0] + - - [1001, 4096, 1, 512, 1001, 1001, 1001, 4096] + - [1, 11712.0] + - - [1225, 192, 32, 64, 1225, 1225, 1225, 192] + - [25, 10368.0] + - - [784, 192, 32, 16, 784, 784, 784, 192] + - [0, 4660.0] + - - [3136, 1024, 1, 2048, 3136, 3136, 3136, 1024] + - [26, 12099.0] + - - [784, 256, 32, 128, 784, 784, 784, 256] + - [20, 10099.0] + - - [196, 512, 32, 32, 196, 196, 196, 512] + - [33, 4114.0] + - - [1225, 384, 32, 96, 1225, 1225, 1225, 384] + - [0, 11089.0] + - - [5041, 576, 1, 96, 5041, 5041, 5041, 576] + - [25, 10503.0] + - - [5329, 160, 32, 64, 5329, 5329, 5329, 160] + - [0, 8100.0] + - - [1225, 288, 32, 48, 1225, 1225, 1225, 288] + - [31, 8198.0] + - - [4096, 9216, 1, 512, 4096, 4096, 4096, 9216] + - [17, 12651.0] + - - [196, 480, 32, 192, 196, 196, 196, 480] + - [8, 7977.0] + - - [3136, 1024, 1, 512, 3136, 3136, 3136, 1024] + - [5, 11823.0] + - - [784, 192, 32, 64, 784, 784, 784, 192] + - [25, 9568.0] + - - [289, 1024, 32, 128, 289, 289, 289, 1024] + - [14, 8739.0] + - - [289, 768, 32, 192, 289, 289, 289, 768] + - [5, 8911.0] + - - [196, 512, 32, 112, 196, 196, 196, 512] + - [31, 7912.0] + - - [1001, 2048, 1, 32, 1001, 1001, 1001, 2048] + - [20, 6063.0] + - - [1225, 288, 32, 64, 1225, 1225, 1225, 288] + - [25, 9125.0] + - - [1225, 384, 32, 192, 1225, 1225, 1225, 384] + - [1, 11356.0] + - - [50176, 256, 1, 512, 50176, 50176, 50176, 256] + - [29, 12309.0] + - - [196, 512, 32, 160, 196, 196, 196, 512] + - [25, 8365.0] + - - [4096, 4096, 1, 512, 4096, 4096, 4096, 4096] + - [17, 12481.0] + - - [1225, 256, 32, 64, 1225, 1225, 1225, 256] + - [25, 10457.0] + - - [196, 480, 32, 16, 196, 196, 196, 480] + - [25, 2230.0] + - - [1225, 256, 32, 48, 1225, 1225, 1225, 256] + - [0, 9818.0] + - - [1225, 1200, 1, 64, 1225, 1225, 1225, 1200] + - [25, 7599.0] + - - [1225, 384, 32, 64, 1225, 1225, 1225, 384] + - [0, 10659.0] + - - [12544, 512, 1, 1024, 12544, 12544, 12544, 512] + - [26, 12334.0] + - - [196, 512, 32, 64, 196, 196, 196, 512] + - [8, 6324.0] + - - [196, 528, 32, 256, 196, 196, 196, 528] + - [25, 7876.0] + - - [196, 528, 32, 160, 196, 196, 196, 528] + - [20, 7708.0] + - - [1225, 192, 32, 48, 1225, 1225, 1225, 192] + - [20, 9603.0] + - - [1001, 2048, 1, 64, 1001, 1001, 1001, 2048] + - [31, 8553.0] + - - [289, 768, 128, 128, 289, 289, 289, 768] + - [0, 8941.0] + - - [1225, 192, 128, 64, 1225, 1225, 1225, 192] + - [0, 7830.0] + - - [1225, 288, 128, 48, 1225, 1225, 1225, 288] + - [13, 5445.0] + - - [289, 768, 128, 192, 289, 289, 289, 768] + - [15, 9113.0] + - - [289, 768, 128, 160, 289, 289, 289, 768] + - [25, 9022.0] + - - [1225, 256, 128, 48, 1225, 1225, 1225, 256] + - [0, 7280.0] + - - [1225, 192, 128, 48, 1225, 1225, 1225, 192] + - [7, 5535.0] + - - [1225, 288, 128, 64, 1225, 1225, 1225, 288] + - [3, 6933.0] + - - [1225, 256, 128, 64, 1225, 1225, 1225, 256] + - [3, 6231.0] + - - [1001, 2048, 1, 128, 1001, 1001, 1001, 2048] + - [14, 9828.0] + - - [1225, 192, 128, 32, 1225, 1225, 1225, 192] + - [0, 4204.0] + - - [1001, 1536, 1, 64, 1001, 1001, 1001, 1536] + - [14, 7225.0] + - - [1024, 4096, 1, 64, 1024, 1024, 1024, 4096] + - [0, 10421.0] + - - [1024, 4096, 1, 6336, 1024, 1024, 1024, 4096] + - [4, 12394.0] + - - [512, 33708, 1, 3780, 512, 512, 512, 33708] + - [4, 12704.0] + - - [512, 33708, 1, 3968, 512, 512, 512, 33708] + - [29, 12703.0] + - - [512, 33708, 1, 4030, 512, 512, 512, 33708] + - [4, 12704.0] + - - [196, 256, 64, 1024, 196, 196, 196, 256] + - [11, 9202.0] + - - [196, 1024, 64, 256, 196, 196, 196, 1024] + - [18, 9120.0] + - - [289, 768, 64, 128, 289, 289, 289, 768] + - [14, 8828.0] + - - [289, 768, 64, 160, 289, 289, 289, 768] + - [5, 8933.0] + - - [289, 768, 64, 192, 289, 289, 289, 768] + - [18, 9023.0] + - - [784, 128, 64, 512, 784, 784, 784, 128] + - [26, 10478.0] + - - [784, 512, 64, 128, 784, 784, 784, 512] + - [5, 10503.0] + - - [1225, 192, 64, 32, 1225, 1225, 1225, 192] + - [14, 8137.0] + - - [1225, 192, 64, 48, 1225, 1225, 1225, 192] + - [25, 10005.0] + - - [1225, 192, 64, 64, 1225, 1225, 1225, 192] + - [14, 10680.0] + - - [1225, 256, 64, 48, 1225, 1225, 1225, 256] + - [0, 10084.0] + - - [1225, 256, 64, 64, 1225, 1225, 1225, 256] + - [0, 10505.0] + - - [1225, 288, 64, 48, 1225, 1225, 1225, 288] + - [0, 8167.0] + - - [1225, 288, 64, 64, 1225, 1225, 1225, 288] + - [25, 8888.0] + - - [3136, 256, 64, 64, 3136, 3136, 3136, 256] + - [0, 9628.0] + - - [256, 44505, 1, 8976, 256, 256, 256, 44505] + - [17, 12673.0] + - - [512, 33708, 1, 3796, 512, 512, 512, 33708] + - [17, 12703.0] + - - [512, 33708, 1, 3822, 512, 512, 512, 33708] + - [17, 12704.0] + - - [512, 33708, 1, 3840, 512, 512, 512, 33708] + - [29, 12706.0] + - - [512, 33708, 1, 3859, 512, 512, 512, 33708] + - [29, 12709.0] + - - [512, 33708, 1, 3870, 512, 512, 512, 33708] + - [4, 12706.0] + - - [512, 33708, 1, 3876, 512, 512, 512, 33708] + - [17, 12706.0] + - - [512, 33708, 1, 3906, 512, 512, 512, 33708] + - [17, 12708.0] + - - [512, 33708, 1, 3910, 512, 512, 512, 33708] + - [29, 12705.0] + - - [512, 33708, 1, 3925, 512, 512, 512, 33708] + - [17, 12709.0] + - - [512, 33708, 1, 3942, 512, 512, 512, 33708] + - [4, 12709.0] + - - [512, 33708, 1, 3944, 512, 512, 512, 33708] + - [29, 12704.0] + - - [512, 33708, 1, 3955, 512, 512, 512, 33708] + - [29, 12706.0] + - - [512, 33708, 1, 3969, 512, 512, 512, 33708] + - [29, 12709.0] + - - [512, 33708, 1, 3976, 512, 512, 512, 33708] + - [4, 12707.0] + - - [512, 33708, 1, 3977, 512, 512, 512, 33708] + - [4, 12708.0] + - - [512, 33708, 1, 3978, 512, 512, 512, 33708] + - [29, 12708.0] + - - [512, 33708, 1, 3990, 512, 512, 512, 33708] + - [29, 12707.0] + - - [512, 33708, 1, 3995, 512, 512, 512, 33708] + - [29, 12710.0] + - - [512, 33708, 1, 3996, 512, 512, 512, 33708] + - [17, 12716.0] + - - [512, 33708, 1, 3999, 512, 512, 512, 33708] + - [17, 12708.0] + - - [512, 33708, 1, 4005, 512, 512, 512, 33708] + - [29, 12707.0] + - - [512, 33708, 1, 4012, 512, 512, 512, 33708] + - [17, 12709.0] + - - [512, 33708, 1, 4020, 512, 512, 512, 33708] + - [4, 12708.0] + - - [512, 33708, 1, 4026, 512, 512, 512, 33708] + - [29, 12707.0] + - - [512, 33708, 1, 4032, 512, 512, 512, 33708] + - [29, 12708.0] + - - [1024, 3072, 1, 2048, 1024, 1024, 1024, 3072] + - [1, 11929.0] + - - [1024, 3072, 1, 3072, 1024, 1024, 1024, 3072] + - [1, 11993.0] + - - [1024, 30522, 1, 20, 1024, 1024, 1024, 30522] + - [0, 5014.0] + - - [1024, 30522, 1, 80, 1024, 1024, 1024, 30522] + - [20, 10582.0] + - - [1024, 30522, 1, 120, 1024, 1024, 1024, 30522] + - [8, 11768.0] + - - [1024, 4096, 1, 3840, 1024, 1024, 1024, 4096] + - [4, 12376.0] + - - [1024, 4096, 1, 3968, 1024, 1024, 1024, 4096] + - [4, 12376.0] + - - [1024, 4096, 1, 7200, 1024, 1024, 1024, 4096] + - [4, 12402.0] + - - [1024, 4096, 1, 8160, 1024, 1024, 1024, 4096] + - [4, 12408.0] + - - [1024, 4096, 1, 9520, 1024, 1024, 1024, 4096] + - [29, 12407.0] + - - [1024, 4096, 1, 10200, 1024, 1024, 1024, 4096] + - [4, 12406.0] + - - [1024, 42720, 1, 3968, 1024, 1024, 1024, 42720] + - [4, 12798.0] + - - [1024, 42720, 1, 7200, 1024, 1024, 1024, 42720] + - [29, 12795.0] + - - [1024, 42720, 1, 9520, 1024, 1024, 1024, 42720] + - [4, 12800.0] + - - [4096, 1024, 1, 3840, 4096, 4096, 4096, 1024] + - [17, 12373.0] + - - [4096, 1024, 1, 3968, 4096, 4096, 4096, 1024] + - [17, 12378.0] + - - [4096, 1024, 1, 7200, 4096, 4096, 4096, 1024] + - [17, 12394.0] + - - [4096, 1024, 1, 8160, 4096, 4096, 4096, 1024] + - [29, 12403.0] + - - [4096, 1024, 1, 9520, 4096, 4096, 4096, 1024] + - [29, 12408.0] + - - [4096, 1024, 1, 10200, 4096, 4096, 4096, 1024] + - [29, 12406.0] + - - [5760, 5760, 1, 5760, 5760, 5760, 5760, 5760] + - [17, 12749.0] + - - [7744, 7744, 1, 7744, 7744, 7744, 7744, 7744] + - [17, 12582.0] + - - [1152, 1152, 1, 384, 1152, 1152, 1152, 1152] + - [14, 10111.0] + - - [1536, 1536, 1, 384, 1536, 1536, 1536, 1536] + - [0, 11185.0] + - - [1920, 1920, 1, 384, 1920, 1920, 1920, 1920] + - [25, 11561.0] + - - [2304, 2304, 1, 384, 2304, 2304, 2304, 2304] + - [1, 11726.0] + - - [2688, 2688, 1, 384, 2688, 2688, 2688, 2688] + - [26, 11954.0] + - - [3072, 3072, 1, 384, 3072, 3072, 3072, 3072] + - [11, 12382.0] + - - [3456, 3456, 1, 384, 3456, 3456, 3456, 3456] + - [29, 12382.0] + - - [3840, 3840, 1, 384, 3840, 3840, 3840, 3840] + - [4, 12586.0] + - - [4224, 4224, 1, 384, 4224, 4224, 4224, 4224] + - [29, 12509.0] + - - [4608, 4608, 1, 384, 4608, 4608, 4608, 4608] + - [4, 12588.0] + - - [4992, 4992, 1, 384, 4992, 4992, 4992, 4992] + - [26, 12511.0] + - - [5376, 5376, 1, 384, 5376, 5376, 5376, 5376] + - [4, 12562.0] + - - [5760, 5760, 1, 384, 5760, 5760, 5760, 5760] + - [4, 12603.0] + - - [6144, 6144, 1, 384, 6144, 6144, 6144, 6144] + - [17, 12621.0] + - - [6528, 6528, 1, 384, 6528, 6528, 6528, 6528] + - [17, 12633.0] + - - [6912, 6912, 1, 384, 6912, 6912, 6912, 6912] + - [29, 12695.0] + - - [7296, 7296, 1, 384, 7296, 7296, 7296, 7296] + - [17, 12688.0] + - - [7680, 7680, 1, 384, 7680, 7680, 7680, 7680] + - [4, 12722.0] + - - [1536, 768, 1, 384, 1536, 1536, 1536, 768] + - [18, 10258.0] + - - [1920, 960, 1, 384, 1920, 1920, 1920, 960] + - [25, 10869.0] + - - [2304, 1152, 1, 384, 2304, 2304, 2304, 1152] + - [14, 11159.0] + - - [2688, 1344, 1, 384, 2688, 2688, 2688, 1344] + - [25, 11393.0] + - - [3072, 1536, 1, 384, 3072, 3072, 3072, 1536] + - [25, 11778.0] + - - [3456, 1728, 1, 384, 3456, 3456, 3456, 1728] + - [25, 11773.0] + - - [3840, 1920, 1, 384, 3840, 3840, 3840, 1920] + - [4, 12133.0] + - - [4224, 2112, 1, 384, 4224, 4224, 4224, 2112] + - [0, 11895.0] + - - [4608, 2304, 1, 384, 4608, 4608, 4608, 2304] + - [4, 12295.0] + - - [4992, 2496, 1, 384, 4992, 4992, 4992, 2496] + - [1, 12109.0] + - - [5376, 2688, 1, 384, 5376, 5376, 5376, 2688] + - [4, 12370.0] + - - [5760, 2880, 1, 384, 5760, 5760, 5760, 2880] + - [4, 12244.0] + - - [6144, 3072, 1, 384, 6144, 6144, 6144, 3072] + - [23, 12532.0] + - - [6528, 3264, 1, 384, 6528, 6528, 6528, 3264] + - [4, 12255.0] + - - [6912, 3456, 1, 384, 6912, 6912, 6912, 3456] + - [29, 12618.0] + - - [7296, 3648, 1, 384, 7296, 7296, 7296, 3648] + - [17, 12360.0] + - - [7680, 3840, 1, 384, 7680, 7680, 7680, 3840] + - [17, 12652.0] + - - [768, 1536, 1, 384, 768, 768, 768, 1536] + - [30, 10291.0] + - - [1152, 2304, 1, 384, 1152, 1152, 1152, 2304] + - [0, 11198.0] + - - [1536, 3072, 1, 384, 1536, 1536, 1536, 3072] + - [14, 11781.0] + - - [1920, 3840, 1, 384, 1920, 1920, 1920, 3840] + - [17, 12151.0] + - - [2304, 4608, 1, 384, 2304, 2304, 2304, 4608] + - [4, 12303.0] + - - [2688, 5376, 1, 384, 2688, 2688, 2688, 5376] + - [4, 12358.0] + - - [3072, 6144, 1, 384, 3072, 3072, 3072, 6144] + - [4, 12515.0] + - - [3456, 6912, 1, 384, 3456, 3456, 3456, 6912] + - [17, 12630.0] + - - [3840, 7680, 1, 384, 3840, 3840, 3840, 7680] + - [4, 12657.0] + - - [4224, 8448, 1, 384, 4224, 4224, 4224, 8448] + - [29, 12683.0] + - - [4608, 9216, 1, 384, 4608, 4608, 4608, 9216] + - [4, 12664.0] + - - [4992, 9984, 1, 384, 4992, 4992, 4992, 9984] + - [29, 12662.0] + - - [5376, 10752, 1, 384, 5376, 5376, 5376, 10752] + - [17, 12682.0] + - - [5760, 11520, 1, 384, 5760, 5760, 5760, 11520] + - [4, 12706.0] + - - [6144, 12288, 1, 384, 6144, 6144, 6144, 12288] + - [17, 12704.0] + - - [6528, 13056, 1, 384, 6528, 6528, 6528, 13056] + - [17, 12716.0] + - - [6912, 13824, 1, 384, 6912, 6912, 6912, 13824] + - [4, 12729.0] + - - [7296, 14592, 1, 384, 7296, 7296, 7296, 14592] + - [17, 12751.0] + - - [7680, 15360, 1, 384, 7680, 7680, 7680, 15360] + - [29, 12753.0] + - - [2048, 2048, 1, 1024, 2048, 2048, 2048, 2048] + - [15, 12197.0] + - - [256, 10240, 1, 8976, 256, 256, 256, 10240] + - [19, 12467.0] + - - [256, 10496, 1, 8976, 256, 256, 256, 10496] + - [16, 11671.0] + - - [256, 11008, 1, 8976, 256, 256, 256, 11008] + - [5, 11925.0] + - - [256, 11264, 1, 8976, 256, 256, 256, 11264] + - [17, 12186.0] + - - [256, 11520, 1, 8976, 256, 256, 256, 11520] + - [17, 12446.0] + - - [256, 11776, 1, 8976, 256, 256, 256, 11776] + - [28, 11688.0] + - - [256, 12544, 1, 8976, 256, 256, 256, 12544] + - [19, 12289.0] + - - [256, 12800, 1, 8976, 256, 256, 256, 12800] + - [19, 12527.0] + - - [256, 13312, 1, 8976, 256, 256, 256, 13312] + - [28, 11963.0] + - - [256, 13568, 1, 8976, 256, 256, 256, 13568] + - [17, 12100.0] + - - [256, 14336, 1, 8976, 256, 256, 256, 14336] + - [28, 11794.0] + - - [256, 14848, 1, 8976, 256, 256, 256, 14848] + - [19, 12177.0] + - - [256, 15104, 1, 8976, 256, 256, 256, 15104] + - [19, 12368.0] + - - [256, 15872, 1, 8976, 256, 256, 256, 15872] + - [17, 12036.0] + - - [256, 16128, 1, 8976, 256, 256, 256, 16128] + - [17, 12220.0] + - - [256, 17152, 1, 8976, 256, 256, 256, 17152] + - [4, 12090.0] + - - [256, 17408, 1, 8976, 256, 256, 256, 17408] + - [29, 12265.0] + - - [256, 18688, 1, 8976, 256, 256, 256, 18688] + - [17, 12308.0] + - - [256, 19968, 1, 8976, 256, 256, 256, 19968] + - [4, 12339.0] + - - [256, 20480, 1, 8976, 256, 256, 256, 20480] + - [29, 12641.0] + - - [256, 20992, 1, 8976, 256, 256, 256, 20992] + - [4, 12231.0] + - - [256, 21248, 1, 8976, 256, 256, 256, 21248] + - [4, 12375.0] + - - [256, 22016, 1, 8976, 256, 256, 256, 22016] + - [17, 12130.0] + - - [256, 26112, 1, 8976, 256, 256, 256, 26112] + - [17, 12352.0] + - - [256, 32512, 1, 8976, 256, 256, 256, 32512] + - [4, 12453.0] + - - [256, 33536, 1, 8976, 256, 256, 256, 33536] + - [17, 12381.0] + - - [256, 4864, 1, 8976, 256, 256, 256, 4864] + - [19, 11619.0] + - - [256, 5120, 1, 8976, 256, 256, 256, 5120] + - [6, 12194.0] + - - [256, 5632, 1, 8976, 256, 256, 256, 5632] + - [3, 11525.0] + - - [256, 5888, 1, 8976, 256, 256, 256, 5888] + - [18, 11278.0] + - - [256, 6144, 1, 8976, 256, 256, 256, 6144] + - [5, 11744.0] + - - [256, 7168, 1, 8976, 256, 256, 256, 7168] + - [19, 11586.0] + - - [256, 8192, 1, 8976, 256, 256, 256, 8192] + - [3, 11763.0] + - - [256, 8960, 1, 8976, 256, 256, 256, 8960] + - [29, 12339.0] + - - [256, 9728, 1, 8976, 256, 256, 256, 9728] + - [19, 11878.0] + - - [256, 9984, 1, 8976, 256, 256, 256, 9984] + - [19, 12174.0] + - - [3200, 2048, 1, 1024, 3200, 3200, 3200, 2048] + - [15, 12559.0] + - - [4096, 4096, 1, 1024, 4096, 4096, 4096, 4096] + - [17, 12569.0] + - - [512, 3280, 1, 1600, 512, 512, 512, 3280] + - [25, 10919.0] + - - [512, 3280, 1, 200, 512, 512, 512, 3280] + - [14, 10190.0] + - - [768, 2048, 1, 256, 768, 768, 768, 2048] + - [18, 10918.0] + - - [1600, 1024, 1, 960, 1600, 1600, 1600, 1024] + - [0, 10609.0] + - - [2048, 2048, 1, 960, 2048, 2048, 2048, 2048] + - [1, 12226.0] + - - [1024, 3072, 1, 1024, 1024, 1024, 1024, 3072] + - [1, 11800.0] + - - [1024, 3072, 1, 512, 1024, 1024, 1024, 3072] + - [1, 11676.0] + - - [1024, 4096, 1, 2048, 1024, 1024, 1024, 4096] + - [4, 12347.0] + - - [1024, 30528, 1, 2048, 1024, 1024, 1024, 30528] + - [17, 12734.0] + - - [1024, 4096, 1, 4096, 1024, 1024, 1024, 4096] + - [4, 12379.0] + - - [1024, 30528, 1, 4096, 1024, 1024, 1024, 30528] + - [17, 12754.0] + - - [9216, 128, 1, 128, 9216, 9216, 9216, 128] + - [14, 9020.0] + - - [9600, 128, 1, 128, 9600, 9600, 9600, 128] + - [5, 9781.0] + - - [9984, 128, 1, 128, 9984, 9984, 9984, 128] + - [0, 10023.0] + - - [10368, 128, 1, 128, 10368, 10368, 10368, 128] + - [25, 9501.0] + - - [10752, 128, 1, 128, 10752, 10752, 10752, 128] + - [8, 9690.0] + - - [11136, 128, 1, 128, 11136, 11136, 11136, 128] + - [14, 10003.0] + - - [11520, 128, 1, 128, 11520, 11520, 11520, 128] + - [3, 9790.0] + - - [11904, 128, 1, 128, 11904, 11904, 11904, 128] + - [5, 10002.0] + - - [12288, 128, 1, 128, 12288, 12288, 12288, 128] + - [0, 10107.0] + - - [12672, 128, 1, 128, 12672, 12672, 12672, 128] + - [5, 10475.0] + - - [13056, 128, 1, 128, 13056, 13056, 13056, 128] + - [0, 10043.0] + - - [13440, 128, 1, 128, 13440, 13440, 13440, 128] + - [25, 10261.0] + - - [13824, 128, 1, 128, 13824, 13824, 13824, 128] + - [0, 10390.0] + - - [14208, 128, 1, 128, 14208, 14208, 14208, 128] + - [12, 10201.0] + - - [14592, 128, 1, 128, 14592, 14592, 14592, 128] + - [25, 10323.0] + - - [14976, 128, 1, 128, 14976, 14976, 14976, 128] + - [5, 10677.0] + - - [15360, 128, 1, 128, 15360, 15360, 15360, 128] + - [25, 10574.0] + - - [15744, 128, 1, 128, 15744, 15744, 15744, 128] + - [25, 10435.0] + - - [16128, 128, 1, 128, 16128, 16128, 16128, 128] + - [14, 10570.0] + - - [16512, 128, 1, 128, 16512, 16512, 16512, 128] + - [14, 10761.0] + - - [16896, 128, 1, 128, 16896, 16896, 16896, 128] + - [25, 10430.0] + - - [17280, 128, 1, 128, 17280, 17280, 17280, 128] + - [5, 10740.0] + - - [17664, 128, 1, 128, 17664, 17664, 17664, 128] + - [0, 10783.0] + - - [18048, 128, 1, 128, 18048, 18048, 18048, 128] + - [14, 10531.0] + - - [18432, 128, 1, 128, 18432, 18432, 18432, 128] + - [25, 10619.0] + - - [18816, 128, 1, 128, 18816, 18816, 18816, 128] + - [25, 10848.0] + - - [19200, 128, 1, 128, 19200, 19200, 19200, 128] + - [25, 10923.0] + - - [19584, 128, 1, 128, 19584, 19584, 19584, 128] + - [5, 10804.0] + - - [19968, 128, 1, 128, 19968, 19968, 19968, 128] + - [25, 10804.0] + - - [20352, 128, 1, 128, 20352, 20352, 20352, 128] + - [5, 11145.0] + - - [20736, 128, 1, 128, 20736, 20736, 20736, 128] + - [14, 10670.0] + - - [21120, 128, 1, 128, 21120, 21120, 21120, 128] + - [25, 10895.0] + - - [21504, 128, 1, 128, 21504, 21504, 21504, 128] + - [0, 10921.0] + - - [21888, 128, 1, 128, 21888, 21888, 21888, 128] + - [36, 10802.0] + - - [22272, 128, 1, 128, 22272, 22272, 22272, 128] + - [25, 10880.0] + - - [22656, 128, 1, 128, 22656, 22656, 22656, 128] + - [5, 11161.0] + - - [23040, 128, 1, 128, 23040, 23040, 23040, 128] + - [31, 10980.0] + - - [9216, 128, 1, 256, 9216, 9216, 9216, 128] + - [24, 10066.0] + - - [9600, 128, 1, 256, 9600, 9600, 9600, 128] + - [12, 10599.0] + - - [9984, 128, 1, 256, 9984, 9984, 9984, 128] + - [5, 10833.0] + - - [10368, 128, 1, 256, 10368, 10368, 10368, 128] + - [25, 10063.0] + - - [10752, 128, 1, 256, 10752, 10752, 10752, 128] + - [25, 10350.0] + - - [11136, 128, 1, 256, 11136, 11136, 11136, 128] + - [14, 10663.0] + - - [11520, 128, 1, 256, 11520, 11520, 11520, 128] + - [3, 10823.0] + - - [11904, 128, 1, 256, 11904, 11904, 11904, 128] + - [5, 10663.0] + - - [12288, 128, 1, 256, 12288, 12288, 12288, 128] + - [24, 10841.0] + - - [12672, 128, 1, 256, 12672, 12672, 12672, 128] + - [5, 11321.0] + - - [13056, 128, 1, 256, 13056, 13056, 13056, 128] + - [14, 10512.0] + - - [13440, 128, 1, 256, 13440, 13440, 13440, 128] + - [14, 10794.0] + - - [13824, 128, 1, 256, 13824, 13824, 13824, 128] + - [0, 10963.0] + - - [14208, 128, 1, 256, 14208, 14208, 14208, 128] + - [5, 10767.0] + - - [14592, 128, 1, 256, 14592, 14592, 14592, 128] + - [18, 10952.0] + - - [14976, 128, 1, 256, 14976, 14976, 14976, 128] + - [5, 11276.0] + - - [15360, 128, 1, 256, 15360, 15360, 15360, 128] + - [5, 11418.0] + - - [15744, 128, 1, 256, 15744, 15744, 15744, 128] + - [0, 10775.0] + - - [16128, 128, 1, 256, 16128, 16128, 16128, 128] + - [25, 11052.0] + - - [16512, 128, 1, 256, 16512, 16512, 16512, 128] + - [25, 11301.0] + - - [16896, 128, 1, 256, 16896, 16896, 16896, 128] + - [5, 11029.0] + - - [17280, 128, 1, 256, 17280, 17280, 17280, 128] + - [5, 11284.0] + - - [17664, 128, 1, 256, 17664, 17664, 17664, 128] + - [18, 11403.0] + - - [18048, 128, 1, 256, 18048, 18048, 18048, 128] + - [25, 10843.0] + - - [18432, 128, 1, 256, 18432, 18432, 18432, 128] + - [14, 11054.0] + - - [18816, 128, 1, 256, 18816, 18816, 18816, 128] + - [25, 11235.0] + - - [19200, 128, 1, 256, 19200, 19200, 19200, 128] + - [0, 11377.0] + - - [19584, 128, 1, 256, 19584, 19584, 19584, 128] + - [5, 11282.0] + - - [19968, 128, 1, 256, 19968, 19968, 19968, 128] + - [12, 11399.0] + - - [20352, 128, 1, 256, 20352, 20352, 20352, 128] + - [5, 11680.0] + - - [20736, 128, 1, 256, 20736, 20736, 20736, 128] + - [25, 11070.0] + - - [21120, 128, 1, 256, 21120, 21120, 21120, 128] + - [25, 11235.0] + - - [21504, 128, 1, 256, 21504, 21504, 21504, 128] + - [25, 11238.0] + - - [21888, 128, 1, 256, 21888, 21888, 21888, 128] + - [24, 11238.0] + - - [22272, 128, 1, 256, 22272, 22272, 22272, 128] + - [12, 11368.0] + - - [22656, 128, 1, 256, 22656, 22656, 22656, 128] + - [5, 11614.0] + - - [23040, 128, 1, 256, 23040, 23040, 23040, 128] + - [5, 11683.0] + - - [8064, 8064, 1, 384, 8064, 8064, 8064, 8064] + - [4, 12706.0] + - - [8448, 8448, 1, 384, 8448, 8448, 8448, 8448] + - [17, 12722.0] + - - [8832, 8832, 1, 384, 8832, 8832, 8832, 8832] + - [17, 12706.0] + - - [9216, 9216, 1, 384, 9216, 9216, 9216, 9216] + - [17, 12712.0] + - - [9600, 9600, 1, 384, 9600, 9600, 9600, 9600] + - [29, 12723.0] + - - [9984, 9984, 1, 384, 9984, 9984, 9984, 9984] + - [4, 12719.0] + - - [10368, 10368, 1, 384, 10368, 10368, 10368, 10368] + - [17, 12734.0] + - - [10752, 10752, 1, 384, 10752, 10752, 10752, 10752] + - [17, 12748.0] + - - [11136, 11136, 1, 384, 11136, 11136, 11136, 11136] + - [17, 12743.0] + - - [11520, 11520, 1, 384, 11520, 11520, 11520, 11520] + - [17, 12754.0] + - - [11904, 11904, 1, 384, 11904, 11904, 11904, 11904] + - [17, 12751.0] + - - [12288, 12288, 1, 384, 12288, 12288, 12288, 12288] + - [17, 12761.0] + - - [12672, 12672, 1, 384, 12672, 12672, 12672, 12672] + - [17, 12755.0] + - - [13056, 13056, 1, 384, 13056, 13056, 13056, 13056] + - [17, 12749.0] + - - [13440, 13440, 1, 384, 13440, 13440, 13440, 13440] + - [17, 12754.0] + - - [13824, 13824, 1, 384, 13824, 13824, 13824, 13824] + - [17, 12754.0] + - - [14208, 14208, 1, 384, 14208, 14208, 14208, 14208] + - [17, 12762.0] + - - [14592, 14592, 1, 384, 14592, 14592, 14592, 14592] + - [4, 12758.0] + - - [14976, 14976, 1, 384, 14976, 14976, 14976, 14976] + - [17, 12760.0] + - - [15360, 15360, 1, 384, 15360, 15360, 15360, 15360] + - [17, 12766.0] + - - [15744, 15744, 1, 384, 15744, 15744, 15744, 15744] + - [17, 12767.0] + - - [16128, 16128, 1, 384, 16128, 16128, 16128, 16128] + - [29, 12765.0] + - - [16512, 16512, 1, 384, 16512, 16512, 16512, 16512] + - [4, 12767.0] + - - [16896, 16896, 1, 384, 16896, 16896, 16896, 16896] + - [29, 12768.0] + - - [17280, 17280, 1, 384, 17280, 17280, 17280, 17280] + - [4, 12770.0] + - - [17664, 17664, 1, 384, 17664, 17664, 17664, 17664] + - [23, 12759.0] + - - [18048, 18048, 1, 384, 18048, 18048, 18048, 18048] + - [29, 12767.0] + - - [18432, 18432, 1, 384, 18432, 18432, 18432, 18432] + - [17, 12767.0] + - - [18816, 18816, 1, 384, 18816, 18816, 18816, 18816] + - [29, 12770.0] + - - [19200, 19200, 1, 384, 19200, 19200, 19200, 19200] + - [4, 12768.0] + - - [19584, 19584, 1, 384, 19584, 19584, 19584, 19584] + - [17, 12770.0] + - - [19968, 19968, 1, 384, 19968, 19968, 19968, 19968] + - [17, 12777.0] + - - [20352, 20352, 1, 384, 20352, 20352, 20352, 20352] + - [17, 12775.0] + - - [20736, 20736, 1, 384, 20736, 20736, 20736, 20736] + - [17, 12765.0] + - - [21120, 21120, 1, 384, 21120, 21120, 21120, 21120] + - [17, 12770.0] + - - [21504, 21504, 1, 384, 21504, 21504, 21504, 21504] + - [23, 12769.0] + - - [21888, 21888, 1, 384, 21888, 21888, 21888, 21888] + - [29, 12767.0] + - - [22272, 22272, 1, 384, 22272, 22272, 22272, 22272] + - [17, 12772.0] + - - [22656, 22656, 1, 384, 22656, 22656, 22656, 22656] + - [17, 12773.0] + - - [23040, 23040, 1, 384, 23040, 23040, 23040, 23040] + - [29, 12773.0] + - - [1152, 1152, 1, 1152, 1152, 1152, 1152, 1152] + - [0, 10461.0] + - - [1536, 1536, 1, 1536, 1536, 1536, 1536, 1536] + - [25, 11407.0] + - - [1920, 1920, 1, 1920, 1920, 1920, 1920, 1920] + - [28, 11798.0] + - - [2304, 2304, 1, 2304, 2304, 2304, 2304, 2304] + - [4, 12056.0] + - - [2688, 2688, 1, 2688, 2688, 2688, 2688, 2688] + - [4, 12184.0] + - - [3072, 3072, 1, 3072, 3072, 3072, 3072, 3072] + - [4, 12641.0] + - - [3456, 3456, 1, 3456, 3456, 3456, 3456, 3456] + - [17, 12583.0] + - - [3840, 3840, 1, 3840, 3840, 3840, 3840, 3840] + - [17, 12785.0] + - - [4224, 4224, 1, 4224, 4224, 4224, 4224, 4224] + - [4, 12680.0] + - - [4608, 4608, 1, 4608, 4608, 4608, 4608, 4608] + - [17, 12777.0] + - - [4992, 4992, 1, 4992, 4992, 4992, 4992, 4992] + - [29, 12677.0] + - - [5376, 5376, 1, 5376, 5376, 5376, 5376, 5376] + - [4, 12727.0] + - - [6144, 6144, 1, 6144, 6144, 6144, 6144, 6144] + - [29, 12758.0] + - - [6528, 6528, 1, 6528, 6528, 6528, 6528, 6528] + - [17, 12758.0] + - - [6912, 6912, 1, 6912, 6912, 6912, 6912, 6912] + - [17, 12829.0] + - - [7296, 7296, 1, 7296, 7296, 7296, 7296, 7296] + - [17, 12801.0] + - - [7680, 7680, 1, 7680, 7680, 7680, 7680, 7680] + - [17, 12844.0] + - - [8064, 4032, 1, 384, 8064, 8064, 8064, 4032] + - [4, 12437.0] + - - [8448, 4224, 1, 384, 8448, 8448, 8448, 4224] + - [4, 12665.0] + - - [8832, 4416, 1, 384, 8832, 8832, 8832, 4416] + - [4, 12472.0] + - - [9216, 4608, 1, 384, 9216, 9216, 9216, 4608] + - [4, 12660.0] + - - [9600, 4800, 1, 384, 9600, 9600, 9600, 4800] + - [4, 12490.0] + - - [9984, 4992, 1, 384, 9984, 9984, 9984, 4992] + - [29, 12651.0] + - - [10368, 5184, 1, 384, 10368, 10368, 10368, 5184] + - [4, 12493.0] + - - [10752, 5376, 1, 384, 10752, 10752, 10752, 5376] + - [29, 12679.0] + - - [11136, 5568, 1, 384, 11136, 11136, 11136, 5568] + - [4, 12536.0] + - - [11520, 5760, 1, 384, 11520, 11520, 11520, 5760] + - [4, 12701.0] + - - [11904, 5952, 1, 384, 11904, 11904, 11904, 5952] + - [4, 12573.0] + - - [12288, 6144, 1, 384, 12288, 12288, 12288, 6144] + - [17, 12717.0] + - - [12672, 6336, 1, 384, 12672, 12672, 12672, 6336] + - [4, 12581.0] + - - [13056, 6528, 1, 384, 13056, 13056, 13056, 6528] + - [29, 12709.0] + - - [13440, 6720, 1, 384, 13440, 13440, 13440, 6720] + - [4, 12593.0] + - - [13824, 6912, 1, 384, 13824, 13824, 13824, 6912] + - [4, 12731.0] + - - [14208, 7104, 1, 384, 14208, 14208, 14208, 7104] + - [11, 12618.0] + - - [14592, 7296, 1, 384, 14592, 14592, 14592, 7296] + - [17, 12748.0] + - - [14976, 7488, 1, 384, 14976, 14976, 14976, 7488] + - [4, 12615.0] + - - [15360, 7680, 1, 384, 15360, 15360, 15360, 7680] + - [4, 12750.0] + - - [15744, 7872, 1, 384, 15744, 15744, 15744, 7872] + - [4, 12628.0] + - - [16128, 8064, 1, 384, 16128, 16128, 16128, 8064] + - [29, 12753.0] + - - [16512, 8256, 1, 384, 16512, 16512, 16512, 8256] + - [17, 12641.0] + - - [16896, 8448, 1, 384, 16896, 16896, 16896, 8448] + - [17, 12749.0] + - - [17280, 8640, 1, 384, 17280, 17280, 17280, 8640] + - [4, 12664.0] + - - [17664, 8832, 1, 384, 17664, 17664, 17664, 8832] + - [17, 12746.0] + - - [18048, 9024, 1, 384, 18048, 18048, 18048, 9024] + - [4, 12655.0] + - - [18432, 9216, 1, 384, 18432, 18432, 18432, 9216] + - [4, 12748.0] + - - [18816, 9408, 1, 384, 18816, 18816, 18816, 9408] + - [4, 12676.0] + - - [19200, 9600, 1, 384, 19200, 19200, 19200, 9600] + - [17, 12758.0] + - - [19584, 9792, 1, 384, 19584, 19584, 19584, 9792] + - [17, 12668.0] + - - [19968, 9984, 1, 384, 19968, 19968, 19968, 9984] + - [29, 12755.0] + - - [20352, 10176, 1, 384, 20352, 20352, 20352, 10176] + - [4, 12678.0] + - - [20736, 10368, 1, 384, 20736, 20736, 20736, 10368] + - [17, 12756.0] + - - [21120, 10560, 1, 384, 21120, 21120, 21120, 10560] + - [17, 12680.0] + - - [21504, 10752, 1, 384, 21504, 21504, 21504, 10752] + - [17, 12766.0] + - - [21888, 10944, 1, 384, 21888, 21888, 21888, 10944] + - [17, 12681.0] + - - [22272, 11136, 1, 384, 22272, 22272, 22272, 11136] + - [17, 12764.0] + - - [22656, 11328, 1, 384, 22656, 22656, 22656, 11328] + - [17, 12689.0] + - - [23040, 11520, 1, 384, 23040, 23040, 23040, 11520] + - [29, 12769.0] + - - [8064, 16128, 1, 384, 8064, 8064, 8064, 16128] + - [29, 12760.0] + - - [8448, 16896, 1, 384, 8448, 8448, 8448, 16896] + - [17, 12750.0] + - - [8832, 17664, 1, 384, 8832, 8832, 8832, 17664] + - [4, 12753.0] + - - [9216, 18432, 1, 384, 9216, 9216, 9216, 18432] + - [17, 12756.0] + - - [9600, 19200, 1, 384, 9600, 9600, 9600, 19200] + - [17, 12759.0] + - - [9984, 19968, 1, 384, 9984, 9984, 9984, 19968] + - [29, 12757.0] + - - [10368, 20736, 1, 384, 10368, 10368, 10368, 20736] + - [17, 12761.0] + - - [10752, 21504, 1, 384, 10752, 10752, 10752, 21504] + - [17, 12762.0] + - - [11136, 22272, 1, 384, 11136, 11136, 11136, 22272] + - [29, 12772.0] + - - [11520, 23040, 1, 384, 11520, 11520, 11520, 23040] + - [4, 12768.0] + - - [11904, 23808, 1, 384, 11904, 11904, 11904, 23808] + - [17, 12774.0] + - - [12288, 24576, 1, 384, 12288, 12288, 12288, 24576] + - [17, 12769.0] + - - [12672, 25344, 1, 384, 12672, 12672, 12672, 25344] + - [17, 12769.0] + - - [13056, 26112, 1, 384, 13056, 13056, 13056, 26112] + - [29, 12770.0] + - - [13440, 26880, 1, 384, 13440, 13440, 13440, 26880] + - [4, 12778.0] + - - [13824, 27648, 1, 384, 13824, 13824, 13824, 27648] + - [17, 12769.0] + - - [14208, 28416, 1, 384, 14208, 14208, 14208, 28416] + - [17, 12765.0] + - - [14592, 29184, 1, 384, 14592, 14592, 14592, 29184] + - [17, 12769.0] + - - [14976, 29952, 1, 384, 14976, 14976, 14976, 29952] + - [29, 12774.0] + - - [15360, 30720, 1, 384, 15360, 15360, 15360, 30720] + - [17, 12772.0] + - - [15744, 31488, 1, 384, 15744, 15744, 15744, 31488] + - [17, 12775.0] + - - [16128, 32256, 1, 384, 16128, 16128, 16128, 32256] + - [4, 12768.0] + - - [16512, 33024, 1, 384, 16512, 16512, 16512, 33024] + - [17, 12774.0] + - - [16896, 33792, 1, 384, 16896, 16896, 16896, 33792] + - [35, 12771.0] + - - [17280, 34560, 1, 384, 17280, 17280, 17280, 34560] + - [17, 12766.0] + - - [17664, 35328, 1, 384, 17664, 17664, 17664, 35328] + - [17, 12767.0] + - - [18048, 36096, 1, 384, 18048, 18048, 18048, 36096] + - [29, 12769.0] + - - [18432, 36864, 1, 384, 18432, 18432, 18432, 36864] + - [17, 12768.0] + - - [18816, 37632, 1, 384, 18816, 18816, 18816, 37632] + - [29, 12769.0] + - - [19200, 38400, 1, 384, 19200, 19200, 19200, 38400] + - [17, 12767.0] + - - [19584, 39168, 1, 384, 19584, 19584, 19584, 39168] + - [17, 12768.0] + - - [19968, 39936, 1, 384, 19968, 19968, 19968, 39936] + - [35, 12766.0] + - - [20352, 40704, 1, 384, 20352, 20352, 20352, 40704] + - [17, 12770.0] + - - [20736, 41472, 1, 384, 20736, 20736, 20736, 41472] + - [17, 12765.0] + - - [21120, 42240, 1, 384, 21120, 21120, 21120, 42240] + - [17, 12767.0] + - - [21504, 43008, 1, 384, 21504, 21504, 21504, 43008] + - [17, 12769.0] + - - [21888, 43776, 1, 384, 21888, 21888, 21888, 43776] + - [17, 12771.0] + - - [22272, 44544, 1, 384, 22272, 22272, 22272, 44544] + - [17, 12772.0] + - - [22656, 45312, 1, 384, 22656, 22656, 22656, 45312] + - [29, 12763.0] + - - [23040, 46080, 1, 384, 23040, 23040, 23040, 46080] + - [29, 12764.0] + - - [1152, 1536, 1, 384, 1152, 1152, 1152, 1536] + - [14, 11139.0] + - - [1920, 1536, 1, 384, 1920, 1920, 1920, 1536] + - [30, 11986.0] + - - [2304, 1536, 1, 384, 2304, 2304, 2304, 1536] + - [15, 11856.0] + - - [2688, 1536, 1, 384, 2688, 2688, 2688, 1536] + - [15, 11814.0] + - - [3456, 1536, 1, 384, 3456, 3456, 3456, 1536] + - [26, 11779.0] + - - [3840, 1536, 1, 384, 3840, 3840, 3840, 1536] + - [1, 12203.0] + - - [4224, 1536, 1, 384, 4224, 4224, 4224, 1536] + - [26, 12239.0] + - - [4608, 1536, 1, 384, 4608, 4608, 4608, 1536] + - [17, 12160.0] + - - [4992, 1536, 1, 384, 4992, 4992, 4992, 1536] + - [17, 12136.0] + - - [5376, 1536, 1, 384, 5376, 5376, 5376, 1536] + - [29, 12087.0] + - - [5760, 1536, 1, 384, 5760, 5760, 5760, 1536] + - [4, 12446.0] + - - [6144, 1536, 1, 384, 6144, 6144, 6144, 1536] + - [17, 12394.0] + - - [6528, 1536, 1, 384, 6528, 6528, 6528, 1536] + - [29, 12356.0] + - - [6912, 1536, 1, 384, 6912, 6912, 6912, 1536] + - [17, 12311.0] + - - [7296, 1536, 1, 384, 7296, 7296, 7296, 1536] + - [11, 12247.0] + - - [7680, 1536, 1, 384, 7680, 7680, 7680, 1536] + - [17, 12532.0] + - - [8064, 1536, 1, 384, 8064, 8064, 8064, 1536] + - [29, 12493.0] + - - [8448, 1536, 1, 384, 8448, 8448, 8448, 1536] + - [17, 12439.0] + - - [8832, 1536, 1, 384, 8832, 8832, 8832, 1536] + - [29, 12416.0] + - - [9216, 1536, 1, 384, 9216, 9216, 9216, 1536] + - [17, 12379.0] + - - [9600, 1536, 1, 384, 9600, 9600, 9600, 1536] + - [11, 12584.0] + - - [9984, 1536, 1, 384, 9984, 9984, 9984, 1536] + - [29, 12552.0] + - - [10368, 1536, 1, 384, 10368, 10368, 10368, 1536] + - [29, 12518.0] + - - [10752, 1536, 1, 384, 10752, 10752, 10752, 1536] + - [4, 12478.0] + - - [11136, 1536, 1, 384, 11136, 11136, 11136, 1536] + - [17, 12451.0] + - - [11520, 1536, 1, 384, 11520, 11520, 11520, 1536] + - [17, 12596.0] + - - [11904, 1536, 1, 384, 11904, 11904, 11904, 1536] + - [29, 12591.0] + - - [12288, 1536, 1, 384, 12288, 12288, 12288, 1536] + - [17, 12546.0] + - - [12672, 1536, 1, 384, 12672, 12672, 12672, 1536] + - [17, 12487.0] + - - [13056, 1536, 1, 384, 13056, 13056, 13056, 1536] + - [17, 12475.0] + - - [13440, 1536, 1, 384, 13440, 13440, 13440, 1536] + - [17, 12622.0] + - - [13824, 1536, 1, 384, 13824, 13824, 13824, 1536] + - [4, 12577.0] + - - [14208, 1536, 1, 384, 14208, 14208, 14208, 1536] + - [17, 12560.0] + - - [14592, 1536, 1, 384, 14592, 14592, 14592, 1536] + - [4, 12528.0] + - - [14976, 1536, 1, 384, 14976, 14976, 14976, 1536] + - [17, 12511.0] + - - [15360, 1536, 1, 384, 15360, 15360, 15360, 1536] + - [4, 12629.0] + - - [15744, 1536, 1, 384, 15744, 15744, 15744, 1536] + - [29, 12611.0] + - - [16128, 1536, 1, 384, 16128, 16128, 16128, 1536] + - [29, 12583.0] + - - [16512, 1536, 1, 384, 16512, 16512, 16512, 1536] + - [17, 12562.0] + - - [16896, 1536, 1, 384, 16896, 16896, 16896, 1536] + - [17, 12540.0] + - - [17280, 1536, 1, 384, 17280, 17280, 17280, 1536] + - [11, 12658.0] + - - [17664, 1536, 1, 384, 17664, 17664, 17664, 1536] + - [29, 12621.0] + - - [18048, 1536, 1, 384, 18048, 18048, 18048, 1536] + - [4, 12611.0] + - - [18432, 1536, 1, 384, 18432, 18432, 18432, 1536] + - [4, 12588.0] + - - [18816, 1536, 1, 384, 18816, 18816, 18816, 1536] + - [29, 12572.0] + - - [19200, 1536, 1, 384, 19200, 19200, 19200, 1536] + - [4, 12668.0] + - - [19584, 1536, 1, 384, 19584, 19584, 19584, 1536] + - [17, 12653.0] + - - [19968, 1536, 1, 384, 19968, 19968, 19968, 1536] + - [4, 12620.0] + - - [20352, 1536, 1, 384, 20352, 20352, 20352, 1536] + - [17, 12608.0] + - - [20736, 1536, 1, 384, 20736, 20736, 20736, 1536] + - [17, 12592.0] + - - [21120, 1536, 1, 384, 21120, 21120, 21120, 1536] + - [29, 12682.0] + - - [21504, 1536, 1, 384, 21504, 21504, 21504, 1536] + - [4, 12648.0] + - - [21888, 1536, 1, 384, 21888, 21888, 21888, 1536] + - [17, 12642.0] + - - [22272, 1536, 1, 384, 22272, 22272, 22272, 1536] + - [17, 12611.0] + - - [22656, 1536, 1, 384, 22656, 22656, 22656, 1536] + - [17, 12609.0] + - - [23040, 1536, 1, 384, 23040, 23040, 23040, 1536] + - [29, 12681.0] + - - [768, 1920, 1, 384, 768, 768, 768, 1920] + - [25, 11116.0] + - - [1152, 1920, 1, 384, 1152, 1152, 1152, 1920] + - [5, 11478.0] + - - [1536, 1920, 1, 384, 1536, 1536, 1536, 1920] + - [30, 11956.0] + - - [2304, 1920, 1, 384, 2304, 2304, 2304, 1920] + - [25, 11831.0] + - - [2688, 1920, 1, 384, 2688, 2688, 2688, 1920] + - [15, 12098.0] + - - [3072, 1920, 1, 384, 3072, 3072, 3072, 1920] + - [17, 12279.0] + - - [3456, 1920, 1, 384, 3456, 3456, 3456, 1920] + - [1, 11956.0] + - - [4224, 1920, 1, 384, 4224, 4224, 4224, 1920] + - [26, 12307.0] + - - [4608, 1920, 1, 384, 4608, 4608, 4608, 1920] + - [17, 12446.0] + - - [4992, 1920, 1, 384, 4992, 4992, 4992, 1920] + - [17, 12206.0] + - - [5376, 1920, 1, 384, 5376, 5376, 5376, 1920] + - [4, 12309.0] + - - [5760, 1920, 1, 384, 5760, 5760, 5760, 1920] + - [29, 12438.0] + - - [6144, 1920, 1, 384, 6144, 6144, 6144, 1920] + - [17, 12527.0] + - - [6528, 1920, 1, 384, 6528, 6528, 6528, 1920] + - [17, 12346.0] + - - [6912, 1920, 1, 384, 6912, 6912, 6912, 1920] + - [1, 12381.0] + - - [7296, 1920, 1, 384, 7296, 7296, 7296, 1920] + - [17, 12520.0] + - - [7680, 1920, 1, 384, 7680, 7680, 7680, 1920] + - [4, 12581.0] + - - [8064, 1920, 1, 384, 8064, 8064, 8064, 1920] + - [29, 12381.0] + - - [8448, 1920, 1, 384, 8448, 8448, 8448, 1920] + - [17, 12499.0] + - - [8832, 1920, 1, 384, 8832, 8832, 8832, 1920] + - [17, 12570.0] + - - [9216, 1920, 1, 384, 9216, 9216, 9216, 1920] + - [23, 12597.0] + - - [9600, 1920, 1, 384, 9600, 9600, 9600, 1920] + - [17, 12480.0] + - - [9984, 1920, 1, 384, 9984, 9984, 9984, 1920] + - [29, 12493.0] + - - [10368, 1920, 1, 384, 10368, 10368, 10368, 1920] + - [17, 12602.0] + - - [10752, 1920, 1, 384, 10752, 10752, 10752, 1920] + - [17, 12600.0] + - - [11136, 1920, 1, 384, 11136, 11136, 11136, 1920] + - [17, 12513.0] + - - [11520, 1920, 1, 384, 11520, 11520, 11520, 1920] + - [35, 12533.0] + - - [11904, 1920, 1, 384, 11904, 11904, 11904, 1920] + - [17, 12603.0] + - - [12288, 1920, 1, 384, 12288, 12288, 12288, 1920] + - [4, 12622.0] + - - [12672, 1920, 1, 384, 12672, 12672, 12672, 1920] + - [4, 12541.0] + - - [13056, 1920, 1, 384, 13056, 13056, 13056, 1920] + - [17, 12562.0] + - - [13440, 1920, 1, 384, 13440, 13440, 13440, 1920] + - [17, 12616.0] + - - [13824, 1920, 1, 384, 13824, 13824, 13824, 1920] + - [4, 12647.0] + - - [14208, 1920, 1, 384, 14208, 14208, 14208, 1920] + - [17, 12562.0] + - - [14592, 1920, 1, 384, 14592, 14592, 14592, 1920] + - [4, 12590.0] + - - [14976, 1920, 1, 384, 14976, 14976, 14976, 1920] + - [17, 12634.0] + - - [15360, 1920, 1, 384, 15360, 15360, 15360, 1920] + - [4, 12657.0] + - - [15744, 1920, 1, 384, 15744, 15744, 15744, 1920] + - [4, 12581.0] + - - [16128, 1920, 1, 384, 16128, 16128, 16128, 1920] + - [4, 12605.0] + - - [16512, 1920, 1, 384, 16512, 16512, 16512, 1920] + - [4, 12650.0] + - - [16896, 1920, 1, 384, 16896, 16896, 16896, 1920] + - [4, 12669.0] + - - [17280, 1920, 1, 384, 17280, 17280, 17280, 1920] + - [17, 12601.0] + - - [17664, 1920, 1, 384, 17664, 17664, 17664, 1920] + - [4, 12630.0] + - - [18048, 1920, 1, 384, 18048, 18048, 18048, 1920] + - [17, 12662.0] + - - [18432, 1920, 1, 384, 18432, 18432, 18432, 1920] + - [4, 12678.0] + - - [18816, 1920, 1, 384, 18816, 18816, 18816, 1920] + - [17, 12620.0] + - - [19200, 1920, 1, 384, 19200, 19200, 19200, 1920] + - [17, 12643.0] + - - [19584, 1920, 1, 384, 19584, 19584, 19584, 1920] + - [17, 12671.0] + - - [19968, 1920, 1, 384, 19968, 19968, 19968, 1920] + - [17, 12690.0] + - - [20352, 1920, 1, 384, 20352, 20352, 20352, 1920] + - [17, 12631.0] + - - [20736, 1920, 1, 384, 20736, 20736, 20736, 1920] + - [29, 12650.0] + - - [21120, 1920, 1, 384, 21120, 21120, 21120, 1920] + - [29, 12683.0] + - - [21504, 1920, 1, 384, 21504, 21504, 21504, 1920] + - [17, 12695.0] + - - [21888, 1920, 1, 384, 21888, 21888, 21888, 1920] + - [29, 12642.0] + - - [22272, 1920, 1, 384, 22272, 22272, 22272, 1920] + - [17, 12659.0] + - - [22656, 1920, 1, 384, 22656, 22656, 22656, 1920] + - [17, 12690.0] + - - [23040, 1920, 1, 384, 23040, 23040, 23040, 1920] + - [17, 12701.0] + - - [768, 2304, 1, 384, 768, 768, 768, 2304] + - [14, 11157.0] + - - [1536, 2304, 1, 384, 1536, 1536, 1536, 2304] + - [18, 11856.0] + - - [1920, 2304, 1, 384, 1920, 1920, 1920, 2304] + - [25, 11834.0] + - - [2688, 2304, 1, 384, 2688, 2688, 2688, 2304] + - [26, 12190.0] + - - [3072, 2304, 1, 384, 3072, 3072, 3072, 2304] + - [4, 12156.0] + - - [3456, 2304, 1, 384, 3456, 3456, 3456, 2304] + - [29, 12122.0] + - - [3840, 2304, 1, 384, 3840, 3840, 3840, 2304] + - [17, 12425.0] + - - [4224, 2304, 1, 384, 4224, 4224, 4224, 2304] + - [26, 12371.0] + - - [4992, 2304, 1, 384, 4992, 4992, 4992, 2304] + - [35, 12238.0] + - - [5376, 2304, 1, 384, 5376, 5376, 5376, 2304] + - [17, 12471.0] + - - [5760, 2304, 1, 384, 5760, 5760, 5760, 2304] + - [17, 12436.0] + - - [6144, 2304, 1, 384, 6144, 6144, 6144, 2304] + - [4, 12371.0] + - - [6528, 2304, 1, 384, 6528, 6528, 6528, 2304] + - [17, 12568.0] + - - [6912, 2304, 1, 384, 6912, 6912, 6912, 2304] + - [4, 12507.0] + - - [7296, 2304, 1, 384, 7296, 7296, 7296, 2304] + - [32, 12437.0] + - - [7680, 2304, 1, 384, 7680, 7680, 7680, 2304] + - [17, 12619.0] + - - [8064, 2304, 1, 384, 8064, 8064, 8064, 2304] + - [4, 12573.0] + - - [8448, 2304, 1, 384, 8448, 8448, 8448, 2304] + - [29, 12520.0] + - - [8832, 2304, 1, 384, 8832, 8832, 8832, 2304] + - [4, 12465.0] + - - [9216, 2304, 1, 384, 9216, 9216, 9216, 2304] + - [17, 12581.0] + - - [9600, 2304, 1, 384, 9600, 9600, 9600, 2304] + - [4, 12552.0] + - - [9984, 2304, 1, 384, 9984, 9984, 9984, 2304] + - [17, 12522.0] + - - [10368, 2304, 1, 384, 10368, 10368, 10368, 2304] + - [17, 12626.0] + - - [10752, 2304, 1, 384, 10752, 10752, 10752, 2304] + - [11, 12584.0] + - - [11136, 2304, 1, 384, 11136, 11136, 11136, 2304] + - [17, 12547.0] + - - [11520, 2304, 1, 384, 11520, 11520, 11520, 2304] + - [4, 12657.0] + - - [11904, 2304, 1, 384, 11904, 11904, 11904, 2304] + - [29, 12622.0] + - - [12288, 2304, 1, 384, 12288, 12288, 12288, 2304] + - [4, 12572.0] + - - [12672, 2304, 1, 384, 12672, 12672, 12672, 2304] + - [17, 12557.0] + - - [13056, 2304, 1, 384, 13056, 13056, 13056, 2304] + - [17, 12640.0] + - - [13440, 2304, 1, 384, 13440, 13440, 13440, 2304] + - [4, 12618.0] + - - [13824, 2304, 1, 384, 13824, 13824, 13824, 2304] + - [4, 12580.0] + - - [14208, 2304, 1, 384, 14208, 14208, 14208, 2304] + - [17, 12671.0] + - - [14592, 2304, 1, 384, 14592, 14592, 14592, 2304] + - [29, 12629.0] + - - [14976, 2304, 1, 384, 14976, 14976, 14976, 2304] + - [29, 12617.0] + - - [15360, 2304, 1, 384, 15360, 15360, 15360, 2304] + - [17, 12683.0] + - - [15744, 2304, 1, 384, 15744, 15744, 15744, 2304] + - [4, 12671.0] + - - [16128, 2304, 1, 384, 16128, 16128, 16128, 2304] + - [17, 12635.0] + - - [16512, 2304, 1, 384, 16512, 16512, 16512, 2304] + - [17, 12624.0] + - - [16896, 2304, 1, 384, 16896, 16896, 16896, 2304] + - [17, 12681.0] + - - [17280, 2304, 1, 384, 17280, 17280, 17280, 2304] + - [29, 12662.0] + - - [17664, 2304, 1, 384, 17664, 17664, 17664, 2304] + - [11, 12628.0] + - - [18048, 2304, 1, 384, 18048, 18048, 18048, 2304] + - [4, 12694.0] + - - [18432, 2304, 1, 384, 18432, 18432, 18432, 2304] + - [4, 12665.0] + - - [18816, 2304, 1, 384, 18816, 18816, 18816, 2304] + - [29, 12652.0] + - - [19200, 2304, 1, 384, 19200, 19200, 19200, 2304] + - [29, 12699.0] + - - [19584, 2304, 1, 384, 19584, 19584, 19584, 2304] + - [29, 12684.0] + - - [19968, 2304, 1, 384, 19968, 19968, 19968, 2304] + - [4, 12662.0] + - - [20352, 2304, 1, 384, 20352, 20352, 20352, 2304] + - [17, 12649.0] + - - [20736, 2304, 1, 384, 20736, 20736, 20736, 2304] + - [4, 12693.0] + - - [21120, 2304, 1, 384, 21120, 21120, 21120, 2304] + - [17, 12683.0] + - - [21504, 2304, 1, 384, 21504, 21504, 21504, 2304] + - [4, 12657.0] + - - [21888, 2304, 1, 384, 21888, 21888, 21888, 2304] + - [17, 12711.0] + - - [22272, 2304, 1, 384, 22272, 22272, 22272, 2304] + - [17, 12685.0] + - - [22656, 2304, 1, 384, 22656, 22656, 22656, 2304] + - [29, 12673.0] + - - [23040, 2304, 1, 384, 23040, 23040, 23040, 2304] + - [4, 12710.0] + - - [256, 32768, 1, 1, 256, 256, 256, 32768] + - [0, 261.0] + - - [289, 128, 64, 768, 289, 289, 289, 128] + - [18, 8780.0] + - - [289, 160, 64, 768, 289, 289, 289, 160] + - [25, 7414.0] + - - [289, 192, 64, 768, 289, 289, 289, 192] + - [25, 8895.0] + - - [3136, 256, 32, 64, 3136, 3136, 3136, 256] + - [0, 11317.0] + - - [784, 512, 32, 128, 784, 784, 784, 512] + - [18, 10358.0] + - - [784, 128, 32, 512, 784, 784, 784, 128] + - [25, 10093.0] + - - [196, 1024, 32, 256, 196, 196, 196, 1024] + - [30, 8978.0] + - - [1444, 128, 120, 256, 1444, 1444, 1444, 128] + - [29, 11452.0] + - - [1444, 128, 18, 256, 1444, 1444, 1444, 128] + - [5, 10825.0] + - - [1444, 128, 19, 256, 1444, 1444, 1444, 128] + - [25, 10863.0] + - - [1444, 256, 120, 256, 1444, 1444, 1444, 256] + - [17, 11702.0] + - - [1444, 256, 18, 256, 1444, 1444, 1444, 256] + - [30, 11128.0] + - - [1444, 256, 19, 256, 1444, 1444, 1444, 256] + - [1, 11300.0] + - - [361, 512, 120, 256, 361, 361, 361, 512] + - [4, 11289.0] + - - [361, 512, 18, 256, 361, 361, 361, 512] + - [14, 10588.0] + - - [361, 512, 19, 256, 361, 361, 361, 512] + - [3, 10505.0] + - - [7680, 8192, 1, 8192, 7680, 7680, 7680, 8192] + - [4, 12841.0] + - - [3840, 4096, 1, 4096, 3840, 3840, 3840, 4096] + - [29, 12788.0] + - - [1920, 2048, 1, 2048, 1920, 1920, 1920, 2048] + - [15, 12491.0] + - - [8192, 7680, 1, 8192, 8192, 8192, 8192, 7680] + - [17, 12850.0] + - - [4096, 3840, 1, 4096, 4096, 4096, 4096, 3840] + - [17, 12790.0] + - - [2048, 1920, 1, 2048, 2048, 2048, 2048, 1920] + - [15, 12472.0] + - - [8192, 8192, 1, 8192, 8192, 8192, 8192, 8192] + - [17, 12839.0] + - - [4096, 4096, 1, 4096, 4096, 4096, 4096, 4096] + - [4, 12608.0] + - - [2048, 2048, 1, 2048, 2048, 2048, 2048, 2048] + - [15, 12319.0] + - - [1024, 4096, 1, 512, 1024, 1024, 1024, 4096] + - [17, 12062.0] + - - [1024, 30522, 1, 77, 1024, 1024, 1024, 30522] + - [0, 11476.0] + - - [4096, 1024, 1, 512, 4096, 4096, 4096, 1024] + - [1, 11969.0] + - - [1024, 4096, 1, 1280, 1024, 1024, 1024, 4096] + - [4, 12290.0] + - - [1024, 30522, 1, 200, 1024, 1024, 1024, 30522] + - [21, 12275.0] + - - [4096, 1024, 1, 1280, 4096, 4096, 4096, 1024] + - [15, 12270.0] + - - [1024, 4096, 1, 4992, 1024, 1024, 1024, 4096] + - [4, 12388.0] + - - [1024, 30522, 1, 780, 1024, 1024, 1024, 30522] + - [4, 12637.0] + - - [4096, 1024, 1, 4992, 4096, 4096, 4096, 1024] + - [17, 12386.0] + - - [1024, 30522, 1, 308, 1024, 1024, 1024, 30522] + - [17, 12447.0] + - - [1024, 4096, 1, 5120, 1024, 1024, 1024, 4096] + - [17, 12387.0] + - - [1024, 30522, 1, 800, 1024, 1024, 1024, 30522] + - [17, 12663.0] + - - [4096, 1024, 1, 5120, 4096, 4096, 4096, 1024] + - [17, 12387.0] + - - [1024, 4096, 1, 5248, 1024, 1024, 1024, 4096] + - [4, 12389.0] + - - [1024, 30522, 1, 820, 1024, 1024, 1024, 30522] + - [17, 12650.0] + - - [4096, 1024, 1, 5248, 4096, 4096, 4096, 1024] + - [17, 12392.0] + - - [1024, 4096, 1, 2560, 1024, 1024, 1024, 4096] + - [15, 12341.0] + - - [1024, 30522, 1, 385, 1024, 1024, 1024, 30522] + - [17, 12517.0] + - - [4096, 1024, 1, 2560, 4096, 4096, 4096, 1024] + - [15, 12341.0] + - - [1024, 30522, 1, 462, 1024, 1024, 1024, 30522] + - [17, 12559.0] + - - [1024, 4096, 1, 1024, 1024, 1024, 1024, 4096] + - [1, 12206.0] + - - [1024, 30522, 1, 160, 1024, 1024, 1024, 30522] + - [18, 12071.0] + - - [4096, 1024, 1, 1024, 4096, 4096, 4096, 1024] + - [15, 12205.0] + - - [1024, 4096, 1, 1152, 1024, 1024, 1024, 4096] + - [23, 12251.0] + - - [1024, 30522, 1, 180, 1024, 1024, 1024, 30522] + - [17, 12127.0] + - - [4096, 1024, 1, 1152, 4096, 4096, 4096, 1024] + - [1, 12227.0] + - - [1024, 4096, 1, 8192, 1024, 1024, 1024, 4096] + - [17, 12405.0] + - - [1024, 4096, 1, 9600, 1024, 1024, 1024, 4096] + - [29, 12405.0] + - - [1024, 33712, 1, 8192, 1024, 1024, 1024, 33712] + - [29, 12759.0] + - - [1024, 33712, 1, 9600, 1024, 1024, 1024, 33712] + - [17, 12761.0] + - - [4096, 1024, 1, 8192, 4096, 4096, 4096, 1024] + - [17, 12402.0] + - - [4096, 1024, 1, 9600, 4096, 4096, 4096, 1024] + - [17, 12406.0] + - - [1024, 4096, 1, 10064, 1024, 1024, 1024, 4096] + - [4, 12408.0] + - - [1024, 4096, 1, 10080, 1024, 1024, 1024, 4096] + - [29, 12408.0] + - - [1024, 4096, 1, 6528, 1024, 1024, 1024, 4096] + - [17, 12398.0] + - - [1024, 4096, 1, 7104, 1024, 1024, 1024, 4096] + - [4, 12399.0] + - - [1024, 4096, 1, 8064, 1024, 1024, 1024, 4096] + - [29, 12400.0] + - - [1024, 4096, 1, 9216, 1024, 1024, 1024, 4096] + - [29, 12403.0] + - - [1024, 42720, 1, 10080, 1024, 1024, 1024, 42720] + - [4, 12798.0] + - - [1024, 42720, 1, 6528, 1024, 1024, 1024, 42720] + - [29, 12799.0] + - - [1024, 42720, 1, 7104, 1024, 1024, 1024, 42720] + - [17, 12794.0] + - - [4096, 1024, 1, 10064, 4096, 4096, 4096, 1024] + - [17, 12407.0] + - - [4096, 1024, 1, 10080, 4096, 4096, 4096, 1024] + - [17, 12407.0] + - - [4096, 1024, 1, 6528, 4096, 4096, 4096, 1024] + - [17, 12397.0] + - - [4096, 1024, 1, 7104, 4096, 4096, 4096, 1024] + - [17, 12400.0] + - - [4096, 1024, 1, 8064, 4096, 4096, 4096, 1024] + - [17, 12402.0] + - - [4096, 1024, 1, 9216, 4096, 4096, 4096, 1024] + - [4, 12402.0] + - - [1024, 1600, 1, 1, 1024, 1024, 1024, 1600] + - [20, 211.0] + - - [2048, 960, 1, 1, 2048, 2048, 2048, 960] + - [20, 236.0] + - - [2048, 2048, 1, 2, 2048, 2048, 2048, 2048] + - [8, 525.0] + - - [2048, 30592, 1, 1024, 2048, 2048, 2048, 30592] + - [4, 12770.0] + - - [2048, 6144, 1, 1024, 2048, 2048, 2048, 6144] + - [17, 12507.0] + - - [2048, 8192, 1, 1024, 2048, 2048, 2048, 8192] + - [4, 12552.0] + - - [8192, 2048, 1, 1024, 8192, 8192, 8192, 2048] + - [17, 12546.0] + - - [1024, 30592, 1, 8192, 1024, 1024, 1024, 30592] + - [17, 12791.0] + - - [1024, 3072, 1, 8192, 1024, 1024, 1024, 3072] + - [6, 12045.0] + - - [1024, 30592, 1, 2048, 1024, 1024, 1024, 30592] + - [4, 12769.0] + - - [1024, 30592, 1, 4096, 1024, 1024, 1024, 30592] + - [17, 12784.0] + - - [1024, 3072, 1, 4096, 1024, 1024, 1024, 3072] + - [6, 12000.0] + - - [2560, 1920, 1, 2048, 2560, 2560, 2560, 1920] + - [17, 12549.0] + - - [2560, 2560, 1, 2048, 2560, 2560, 2560, 2560] + - [17, 12654.0] + - - [2560, 2560, 1, 4, 2560, 2560, 2560, 2560] + - [22, 1050.0] + - - [2560, 7680, 1, 2048, 2560, 2560, 2560, 7680] + - [17, 12793.0] + - - [640, 2560, 1, 2048, 640, 640, 640, 2560] + - [30, 12060.0] + - - [1536, 1536, 1, 4096, 1536, 1536, 1536, 1536] + - [25, 11506.0] + - - [1536, 4608, 1, 4096, 1536, 1536, 1536, 4608] + - [4, 12473.0] + - - [1536, 50304, 1, 4096, 1536, 1536, 1536, 50304] + - [29, 12838.0] + - - [1536, 6144, 1, 4096, 1536, 1536, 1536, 6144] + - [17, 12650.0] + - - [6144, 1536, 1, 4096, 6144, 6144, 6144, 1536] + - [17, 12650.0] + - - [1536, 1536, 1, 8192, 1536, 1536, 1536, 1536] + - [28, 11532.0] + - - [1536, 4608, 1, 8192, 1536, 1536, 1536, 4608] + - [17, 12488.0] + - - [1536, 50304, 1, 8192, 1536, 1536, 1536, 50304] + - [17, 12842.0] + - - [1536, 6144, 1, 8192, 1536, 1536, 1536, 6144] + - [17, 12668.0] + - - [6144, 1536, 1, 8192, 6144, 6144, 6144, 1536] + - [17, 12667.0] + - - [1024, 3072, 1, 16384, 1024, 1024, 1024, 3072] + - [6, 12071.0] + - - [1024, 4096, 1, 16384, 1024, 1024, 1024, 4096] + - [17, 12417.0] + - - [1024, 50304, 1, 16384, 1024, 1024, 1024, 50304] + - [29, 12778.0] + - - [4096, 1024, 1, 16384, 4096, 4096, 4096, 1024] + - [17, 12416.0] + - - [1024, 50304, 1, 2048, 1024, 1024, 1024, 50304] + - [4, 12785.0] + - - [1024, 50304, 1, 4096, 1024, 1024, 1024, 50304] + - [4, 12781.0] + - - [1024, 50304, 1, 8192, 1024, 1024, 1024, 50304] + - [29, 12780.0] + - - [1024, 30528, 1, 8192, 1024, 1024, 1024, 30528] + - [17, 12755.0] + - - [256, 6912, 1, 1, 256, 256, 256, 6912] + - [3, 207.0] + - - [30528, 1024, 1, 640, 30528, 30528, 30528, 1024] + - [4, 12628.0] + - - [30528, 1024, 1, 1280, 30528, 30528, 30528, 1024] + - [17, 12695.0] + - - [4096, 1024, 1, 10240, 4096, 4096, 4096, 1024] + - [29, 12406.0] + - - [1024, 4096, 1, 10240, 1024, 1024, 1024, 4096] + - [4, 12407.0] + - - [30528, 1024, 1, 1600, 30528, 30528, 30528, 1024] + - [29, 12712.0] + - - [1024, 4096, 1, 10496, 1024, 1024, 1024, 4096] + - [29, 12407.0] + - - [30528, 1024, 1, 1640, 30528, 30528, 30528, 1024] + - [29, 12714.0] + - - [4096, 1024, 1, 10496, 4096, 4096, 4096, 1024] + - [29, 12408.0] + - - [30528, 1024, 1, 160, 30528, 30528, 30528, 1024] + - [21, 12180.0] + - - [1024, 4096, 1, 6144, 1024, 1024, 1024, 4096] + - [17, 12397.0] + - - [30528, 1024, 1, 240, 30528, 30528, 30528, 1024] + - [9, 12427.0] + - - [4096, 1024, 1, 6144, 4096, 4096, 4096, 1024] + - [29, 12393.0] + - - [3136, 128, 64, 256, 3136, 3136, 3136, 128] + - [29, 12056.0] + - - [784, 256, 64, 512, 784, 784, 784, 256] + - [15, 10908.0] + - - [3136, 256, 64, 128, 3136, 3136, 3136, 256] + - [1, 11898.0] + - - [3136, 256, 64, 256, 3136, 3136, 3136, 256] + - [23, 12269.0] + - - [196, 512, 64, 1024, 196, 196, 196, 512] + - [17, 9429.0] + - - [784, 512, 64, 256, 784, 784, 784, 512] + - [26, 10852.0] + - - [784, 512, 64, 512, 784, 784, 784, 512] + - [29, 11022.0] + - - [196, 1024, 64, 512, 196, 196, 196, 1024] + - [15, 9411.0] + - - [196, 1024, 64, 1024, 196, 196, 196, 1024] + - [17, 9545.0] + - - [3136, 128, 32, 256, 3136, 3136, 3136, 128] + - [30, 11894.0] + - - [784, 256, 32, 512, 784, 784, 784, 256] + - [15, 10556.0] + - - [3136, 256, 32, 128, 3136, 3136, 3136, 256] + - [5, 11751.0] + - - [3136, 256, 32, 256, 3136, 3136, 3136, 256] + - [1, 12114.0] + - - [196, 512, 32, 1024, 196, 196, 196, 512] + - [29, 9224.0] + - - [784, 512, 32, 256, 784, 784, 784, 512] + - [1, 10722.0] + - - [784, 512, 32, 512, 784, 784, 784, 512] + - [26, 10931.0] + - - [196, 1024, 32, 512, 196, 196, 196, 1024] + - [1, 9271.0] + - - [196, 1024, 32, 1024, 196, 196, 196, 1024] + - [1, 9436.0] + - - [1024, 4096, 1, 10224, 1024, 1024, 1024, 4096] + - [17, 12408.0] + - - [4096, 1024, 1, 10224, 4096, 4096, 4096, 1024] + - [17, 12408.0] + - - [1024, 3072, 1, 10224, 1024, 1024, 1024, 3072] + - [6, 12060.0] + - - [1024, 3072, 1, 10240, 1024, 1024, 1024, 3072] + - [6, 12056.0] + - - [4096, 1024, 1, 10192, 4096, 4096, 4096, 1024] + - [17, 12410.0] + - - [1024, 3072, 1, 10192, 1024, 1024, 1024, 3072] + - [6, 12057.0] + - - [1024, 4096, 1, 10192, 1024, 1024, 1024, 4096] + - [29, 12409.0] + - - [1024, 3072, 1, 10200, 1024, 1024, 1024, 3072] + - [6, 12053.0] + - - [4096, 1024, 1, 10208, 4096, 4096, 4096, 1024] + - [17, 12404.0] + - - [1024, 3072, 1, 10208, 1024, 1024, 1024, 3072] + - [6, 12061.0] + - - [1024, 4096, 1, 10208, 1024, 1024, 1024, 4096] + - [17, 12407.0] + - - [1024, 2048, 1, 10224, 1024, 1024, 1024, 2048] + - [3, 11767.0] + - - [1024, 2048, 1, 10240, 1024, 1024, 1024, 2048] + - [16, 11769.0] + - - [1024, 2048, 1, 10192, 1024, 1024, 1024, 2048] + - [3, 11770.0] + - - [1024, 3072, 1, 10080, 1024, 1024, 1024, 3072] + - [6, 12057.0] + - - [100352, 256, 1, 512, 100352, 100352, 100352, 256] + - [4, 12582.0] + - - [12544, 1024, 1, 2048, 12544, 12544, 12544, 1024] + - [29, 12508.0] + - - [12544, 147, 1, 64, 12544, 12544, 12544, 147] + - [14, 6751.0] + - - [200704, 256, 1, 512, 200704, 200704, 200704, 256] + - [35, 12710.0] + - - [25088, 512, 1, 1024, 25088, 25088, 25088, 512] + - [29, 12461.0] + - - [3136, 576, 1, 64, 3136, 3136, 3136, 576] + - [20, 8141.0] + - - [50176, 512, 1, 1024, 50176, 50176, 50176, 512] + - [29, 12680.0] + - - [6272, 1024, 1, 2048, 6272, 6272, 6272, 1024] + - [17, 12395.0] + - - [196, 1024, 128, 512, 196, 196, 196, 1024] + - [1, 9563.0] + - - [196, 1024, 256, 512, 196, 196, 196, 1024] + - [1, 9631.0] + - - [3136, 256, 128, 128, 3136, 3136, 3136, 256] + - [29, 12015.0] + - - [3136, 256, 256, 128, 3136, 3136, 3136, 256] + - [17, 12098.0] + - - [784, 512, 128, 256, 784, 784, 784, 512] + - [1, 10945.0] + - - [784, 512, 256, 256, 784, 784, 784, 512] + - [1, 11000.0] + - - [30528, 1024, 1, 2560, 30528, 30528, 30528, 1024] + - [17, 12735.0] + - - [1024, 4096, 1, 12288, 1024, 1024, 1024, 4096] + - [17, 12406.0] + - - [30528, 1024, 1, 1920, 30528, 30528, 30528, 1024] + - [29, 12721.0] + - - [4096, 1024, 1, 12288, 4096, 4096, 4096, 1024] + - [17, 12410.0] + - - [25600, 128, 25, 128, 25600, 25600, 25600, 128] + - [16, 10673.0] + - - [12544, 128, 36, 128, 12544, 12544, 12544, 128] + - [1, 11167.0] + - - [9216, 128, 49, 128, 9216, 9216, 9216, 128] + - [21, 11078.0] + - - [6400, 128, 64, 128, 6400, 6400, 6400, 128] + - [16, 11195.0] + - - [6400, 256, 25, 256, 6400, 6400, 6400, 256] + - [11, 12550.0] + - - [4096, 256, 36, 256, 4096, 4096, 4096, 256] + - [4, 12510.0] + - - [2304, 256, 49, 256, 2304, 2304, 2304, 256] + - [17, 12427.0] + - - [2304, 256, 64, 256, 2304, 2304, 2304, 256] + - [29, 12510.0] + - - [2304, 512, 25, 512, 2304, 2304, 2304, 512] + - [29, 12697.0] + - - [1024, 512, 36, 512, 1024, 1024, 1024, 512] + - [17, 12557.0] + - - [1024, 512, 49, 512, 1024, 1024, 1024, 512] + - [17, 12593.0] + - - [1024, 512, 64, 512, 1024, 1024, 1024, 512] + - [29, 12644.0] + - - [3072, 768, 1, 2048, 3072, 3072, 3072, 768] + - [14, 11442.0] + - - [768, 3072, 1, 2048, 768, 768, 768, 3072] + - [3, 11461.0] + - - [3072, 768, 1, 4608, 3072, 3072, 3072, 768] + - [16, 11528.0] + - - [768, 3072, 1, 4608, 768, 768, 768, 3072] + - [28, 11505.0] + - - [4096, 1024, 1, 4608, 4096, 4096, 4096, 1024] + - [29, 12381.0] + - - [1024, 4096, 1, 4608, 1024, 1024, 1024, 4096] + - [17, 12386.0] + - - [4880, 256, 49, 256, 4880, 4880, 4880, 256] + - [29, 12267.0] + - - [3128, 256, 64, 256, 3128, 3128, 3128, 256] + - [17, 12214.0] + - - [4680, 256, 49, 256, 4680, 4680, 4680, 256] + - [29, 12357.0] + - - [5280, 256, 36, 256, 5280, 5280, 5280, 256] + - [11, 12285.0] + - - [2640, 256, 64, 256, 2640, 2640, 2640, 256] + - [4, 12249.0] + - - [5304, 256, 49, 256, 5304, 5304, 5304, 256] + - [29, 12367.0] + - - [4524, 256, 49, 256, 4524, 4524, 4524, 256] + - [29, 12266.0] + - - [2760, 256, 64, 256, 2760, 2760, 2760, 256] + - [29, 12199.0] + - - [6440, 256, 36, 256, 6440, 6440, 6440, 256] + - [29, 12358.0] + - - [5704, 256, 36, 256, 5704, 5704, 5704, 256] + - [4, 12349.0] + - - [2666, 256, 64, 256, 2666, 2666, 2666, 256] + - [29, 12258.0] + - - [2128, 256, 64, 256, 2128, 2128, 2128, 256] + - [29, 12179.0] + - - [1160, 256, 49, 256, 1160, 1160, 1160, 256] + - [29, 11038.0] + - - [4056, 256, 49, 256, 4056, 4056, 4056, 256] + - [4, 12379.0] + - - [6144, 256, 36, 256, 6144, 6144, 6144, 256] + - [4, 12594.0] + - - [950, 2048, 2, 512, 950, 950, 950, 2048] + - [5, 11054.0] + - - [6336, 256, 36, 256, 6336, 6336, 6336, 256] + - [1, 12402.0] + - - [13600, 512, 2, 128, 13600, 13600, 13600, 512] + - [5, 11811.0] + - - [15200, 512, 2, 128, 15200, 15200, 15200, 512] + - [1, 11950.0] + - - [15200, 128, 2, 512, 15200, 15200, 15200, 128] + - [18, 11843.0] + - - [13600, 128, 2, 512, 13600, 13600, 13600, 128] + - [30, 11587.0] + - - [5632, 256, 36, 256, 5632, 5632, 5632, 256] + - [4, 12568.0] + - - [12288, 128, 2, 512, 12288, 12288, 12288, 128] + - [12, 11536.0] + - - [12880, 128, 2, 512, 12880, 12880, 12880, 128] + - [25, 11207.0] + - - [3220, 1024, 2, 256, 3220, 3220, 3220, 1024] + - [5, 11541.0] + - - [11408, 128, 2, 512, 11408, 11408, 11408, 128] + - [18, 11918.0] + - - [782, 128, 64, 128, 782, 782, 782, 128] + - [25, 10081.0] + - - [13824, 512, 2, 128, 13824, 13824, 13824, 512] + - [35, 11886.0] + - - [13824, 128, 2, 512, 13824, 13824, 13824, 128] + - [26, 11902.0] + - - [10560, 128, 2, 512, 10560, 10560, 10560, 128] + - [20, 11191.0] + - - [10752, 128, 2, 512, 10752, 10752, 10752, 128] + - [25, 11590.0] + - - [13600, 512, 2, 256, 13600, 13600, 13600, 512] + - [9, 12253.0] + - - [15200, 512, 2, 256, 15200, 15200, 15200, 512] + - [21, 12202.0] + - - [850, 2048, 2, 512, 850, 850, 850, 2048] + - [14, 10912.0] + - - [768, 2048, 2, 512, 768, 768, 768, 2048] + - [1, 11681.0] + - - [12880, 512, 2, 128, 12880, 12880, 12880, 512] + - [9, 11742.0] + - - [11616, 128, 2, 512, 11616, 11616, 11616, 128] + - [20, 11092.0] + - - [14208, 512, 2, 128, 14208, 14208, 14208, 512] + - [9, 11956.0] + - - [11408, 512, 2, 128, 11408, 11408, 11408, 512] + - [26, 11768.0] + - - [805, 2048, 2, 512, 805, 805, 805, 2048] + - [14, 10363.0] + - - [6912, 256, 36, 256, 6912, 6912, 6912, 256] + - [4, 12596.0] + - - [713, 2048, 2, 512, 713, 713, 713, 2048] + - [5, 10688.0] + - - [13824, 512, 2, 256, 13824, 13824, 13824, 512] + - [17, 12225.0] + - - [11616, 512, 2, 128, 11616, 11616, 11616, 512] + - [0, 11735.0] + - - [12288, 512, 2, 128, 12288, 12288, 12288, 512] + - [18, 11829.0] + - - [14208, 128, 2, 512, 14208, 14208, 14208, 128] + - [25, 11431.0] + - - [11968, 128, 2, 512, 11968, 11968, 11968, 128] + - [25, 11427.0] + - - [864, 2048, 2, 512, 864, 864, 864, 2048] + - [25, 11109.0] + - - [10560, 512, 2, 128, 10560, 10560, 10560, 512] + - [8, 11585.0] + - - [672, 2048, 2, 512, 672, 672, 672, 2048] + - [5, 10196.0] + - - [660, 2048, 2, 512, 660, 660, 660, 2048] + - [5, 9988.0] + - - [9408, 128, 2, 512, 9408, 9408, 9408, 128] + - [0, 11239.0] + - - [10752, 512, 2, 128, 10752, 10752, 10752, 512] + - [36, 11835.0] + - - [726, 2048, 2, 512, 726, 726, 726, 2048] + - [1, 10952.0] + - - [11968, 512, 2, 128, 11968, 11968, 11968, 512] + - [12, 11673.0] + - - [1240, 256, 49, 256, 1240, 1240, 1240, 256] + - [29, 11778.0] + - - [1024, 256, 1, 33536, 1024, 1024, 1024, 256] + - [45, 10576.0] + - - [1024, 1024, 1, 9520, 1024, 1024, 1024, 1024] + - [39, 11951.0] + - - [1024, 1024, 1, 10200, 1024, 1024, 1024, 1024] + - [37, 11969.0] + - - [1024, 256, 1, 21248, 1024, 1024, 1024, 256] + - [38, 10457.0] + - - [1024, 256, 1, 21504, 1024, 1024, 1024, 256] + - [38, 10467.0] + - - [1024, 256, 1, 22016, 1024, 1024, 1024, 256] + - [42, 10468.0] + - - [1024, 256, 1, 28672, 1024, 1024, 1024, 256] + - [42, 10538.0] + - - [256, 2560, 1, 8976, 256, 256, 256, 2560] + - [44, 11735.0] + - - [256, 2816, 1, 8976, 256, 256, 256, 2816] + - [44, 11606.0] + - - [256, 3328, 1, 8976, 256, 256, 256, 3328] + - [38, 11535.0] + - - [256, 3584, 1, 8976, 256, 256, 256, 3584] + - [40, 11412.0] + - - [256, 3840, 1, 8976, 256, 256, 256, 3840] + - [18, 12003.0] + - - [256, 4096, 1, 8976, 256, 256, 256, 4096] + - [39, 11845.0] + - - [256, 4352, 1, 8976, 256, 256, 256, 4352] + - [39, 11744.0] + - - [1024, 1024, 1, 32768, 1024, 1024, 1024, 1024] + - [39, 12262.0] + - - [1024, 512, 1, 32768, 1024, 1024, 1024, 512] + - [42, 11633.0] + - - [479, 1024, 1, 32768, 479, 479, 479, 1024] + - [38, 10880.0] + - - [512, 256, 1, 55296, 512, 512, 512, 256] + - [46, 9355.0] + - - [1024, 1024, 1, 8192, 1024, 1024, 1024, 1024] + - [37, 11866.0] + - - [1024, 1024, 1, 9600, 1024, 1024, 1024, 1024] + - [41, 11936.0] + - - [1024, 1024, 1, 10064, 1024, 1024, 1024, 1024] + - [37, 11955.0] + - - [1024, 1024, 1, 10080, 1024, 1024, 1024, 1024] + - [37, 11954.0] + - - [1024, 1024, 1, 9216, 1024, 1024, 1024, 1024] + - [39, 11936.0] + - - [480, 1024, 1, 32768, 480, 480, 480, 1024] + - [38, 10914.0] + - - [1024, 1024, 1, 16384, 1024, 1024, 1024, 1024] + - [43, 12105.0] + - - [1024, 1024, 1, 10240, 1024, 1024, 1024, 1024] + - [37, 11967.0] + - - [1024, 1024, 1, 10496, 1024, 1024, 1024, 1024] + - [39, 11960.0] + - - [1024, 1024, 1, 10224, 1024, 1024, 1024, 1024] + - [39, 11970.0] + - - [1024, 1024, 1, 10192, 1024, 1024, 1024, 1024] + - [37, 11966.0] + - - [1024, 1024, 1, 10208, 1024, 1024, 1024, 1024] + - [37, 11960.0] + - - [1024, 1024, 1, 10184, 1024, 1024, 1024, 1024] + - [37, 11961.0] + - - [1024, 1024, 1, 10120, 1024, 1024, 1024, 1024] + - [37, 11962.0] + - - [1024, 1024, 1, 10152, 1024, 1024, 1024, 1024] + - [37, 11969.0] + - - [1024, 1024, 1, 12288, 1024, 1024, 1024, 1024] + - [39, 12021.0] + - - [1024, 1024, 1, 512, 1024, 1024, 1024, 1024] + - [51, 10187.0] + - - [1024, 1024, 1, 200, 1024, 1024, 1024, 1024] + - [73, 9883.0] + - - [1024, 1024, 1, 4096, 1024, 1024, 1024, 1024] + - [92, 10663.0] + - - [1024, 1024, 1, 2048, 1024, 1024, 1024, 1024] + - [92, 10574.0] + - - [768, 768, 1, 16, 768, 768, 768, 768] + - [78, 2931.0] + - - [768, 768, 1, 320, 768, 768, 768, 768] + - [74, 9153.0] + - - [768, 768, 1, 4096, 768, 768, 768, 768] + - [82, 10477.0] + - - [768, 768, 1, 32, 768, 768, 768, 768] + - [103, 4309.0] + - - [768, 768, 1, 640, 768, 768, 768, 768] + - [93, 9805.0] + - - [768, 768, 1, 64, 768, 768, 768, 768] + - [49, 6312.0] + - - [768, 768, 1, 1280, 768, 768, 768, 768] + - [74, 10131.0] + - - [1024, 1024, 1, 3072, 1024, 1024, 1024, 1024] + - [92, 10637.0] + - - [1024, 1024, 1, 120, 1024, 1024, 1024, 1024] + - [86, 9145.0] + - - [1024, 1024, 1, 1, 1024, 1024, 1024, 1024] + - [65, 217.0] + - - [1024, 1024, 1, 20, 1024, 1024, 1024, 1024] + - [69, 3318.0] + - - [1024, 1024, 1, 4, 1024, 1024, 1024, 1024] + - [53, 874.0] + - - [1024, 1024, 1, 6, 1024, 1024, 1024, 1024] + - [69, 1300.0] + - - [1024, 1024, 1, 80, 1024, 1024, 1024, 1024] + - [49, 7626.0] + - - [128, 64, 512, 128, 128, 128, 128, 64] + - [92, 11176.0] + - - [512, 64, 64, 512, 512, 512, 512, 64] + - [92, 11317.0] + - - [64, 64, 768, 64, 64, 64, 64, 64] + - [71, 7737.0] + - - [1856, 448, 1, 3328, 1856, 1856, 1856, 448] + - [57, 9937.0] + - - [128, 6784, 1, 3328, 128, 128, 128, 6784] + - [79, 10519.0] + - - [2368, 448, 1, 128, 2368, 2368, 2368, 448] + - [49, 8783.0] + - - [256, 4288, 1, 3328, 256, 256, 256, 4288] + - [51, 11139.0] + - - [704, 1856, 1, 3328, 704, 704, 704, 1856] + - [91, 11204.0] + - - [448, 1024, 1, 1280, 448, 448, 448, 1024] + - [49, 9471.0] + - - [256, 1408, 1, 3328, 256, 256, 256, 1408] + - [92, 8023.0] + - - [704, 1856, 1, 1280, 704, 704, 704, 1856] + - [49, 11190.0] + - - [128, 5056, 1, 128, 128, 128, 128, 5056] + - [92, 8334.0] + - - [2368, 128, 1, 256, 2368, 2368, 2368, 128] + - [49, 7265.0] + - - [64, 5056, 1, 256, 64, 64, 64, 5056] + - [49, 7476.0] + - - [256, 2944, 1, 256, 256, 256, 256, 2944] + - [51, 9628.0] + - - [256, 1856, 1, 1280, 256, 256, 256, 1856] + - [92, 10211.0] + - - [128, 3584, 1, 1280, 128, 128, 128, 3584] + - [73, 9869.0] + - - [4288, 256, 1, 256, 4288, 4288, 4288, 256] + - [51, 10160.0] + - - [2944, 128, 1, 128, 2944, 2944, 2944, 128] + - [72, 6060.0] + - - [5888, 64, 1, 3328, 5888, 5888, 5888, 64] + - [77, 8346.0] + - - [2944, 256, 1, 3328, 2944, 2944, 2944, 256] + - [73, 10437.0] + - - [704, 1024, 1, 128, 704, 704, 704, 1024] + - [91, 8313.0] + - - [1408, 448, 1, 1280, 1408, 1408, 1408, 448] + - [96, 10541.0] + - - [1408, 704, 1, 3328, 1408, 1408, 1408, 704] + - [51, 10131.0] + - - [6784, 64, 1, 256, 6784, 6784, 6784, 64] + - [92, 7985.0] + - - [2944, 256, 1, 256, 2944, 2944, 2944, 256] + - [51, 9533.0] + - - [704, 1408, 1, 3328, 704, 704, 704, 1408] + - [49, 10089.0] + - - [2944, 256, 1, 128, 2944, 2944, 2944, 256] + - [92, 8691.0] + - - [448, 2944, 1, 128, 448, 448, 448, 2944] + - [49, 9691.0] + - - [2368, 128, 1, 3328, 2368, 2368, 2368, 128] + - [74, 9254.0] + - - [2944, 128, 1, 256, 2944, 2944, 2944, 128] + - [51, 7146.0] + - - [448, 1408, 1, 256, 448, 448, 448, 1408] + - [91, 9249.0] + - - [64, 5056, 1, 3328, 64, 64, 64, 5056] + - [71, 9522.0] + - - [1024, 448, 1, 128, 1024, 1024, 1024, 448] + - [49, 7232.0] + - - [256, 3584, 1, 3328, 256, 256, 256, 3584] + - [79, 11129.0] + - - [5056, 64, 1, 1280, 5056, 5056, 5056, 64] + - [91, 9212.0] + - - [1024, 704, 1, 256, 1024, 1024, 1024, 704] + - [51, 9182.0] + - - [128, 4288, 1, 128, 128, 128, 128, 4288] + - [92, 7458.0] + - - [3584, 256, 1, 128, 3584, 3584, 3584, 256] + - [51, 9233.0] + - - [4288, 128, 1, 1280, 4288, 4288, 4288, 128] + - [74, 9251.0] + - - [5888, 64, 1, 256, 5888, 5888, 5888, 64] + - [51, 6991.0] + - - [1856, 256, 1, 1280, 1856, 1856, 1856, 256] + - [71, 9753.0] + - - [64, 5888, 1, 3328, 64, 64, 64, 5888] + - [75, 8115.0] + - - [704, 1024, 1, 1280, 704, 704, 704, 1024] + - [51, 9748.0] + - - [448, 1856, 1, 128, 448, 448, 448, 1856] + - [91, 8460.0] + - - [1024, 704, 1, 1280, 1024, 1024, 1024, 704] + - [51, 9758.0] + - - [128, 5888, 1, 256, 128, 128, 128, 5888] + - [92, 9523.0] + - - [704, 704, 1, 3328, 704, 704, 704, 704] + - [98, 8624.0] + - - [704, 1408, 1, 1280, 704, 704, 704, 1408] + - [91, 10020.0] + - - [3584, 256, 1, 3328, 3584, 3584, 3584, 256] + - [98, 11138.0] + - - [704, 1856, 1, 128, 704, 704, 704, 1856] + - [49, 10286.0] + - - [128, 3584, 1, 3328, 128, 128, 128, 3584] + - [92, 10069.0] + - - [128, 2944, 1, 1280, 128, 128, 128, 2944] + - [73, 8170.0] + - - [3584, 128, 1, 256, 3584, 3584, 3584, 128] + - [51, 8377.0] + - - [448, 1408, 1, 3328, 448, 448, 448, 1408] + - [75, 10483.0] + - - [256, 3584, 1, 256, 256, 256, 256, 3584] + - [92, 10089.0] + - - [256, 2944, 1, 3328, 256, 256, 256, 2944] + - [51, 10445.0] + - - [448, 2368, 1, 128, 448, 448, 448, 2368] + - [49, 9456.0] + - - [1408, 704, 1, 256, 1408, 1408, 1408, 704] + - [73, 9619.0] + - - [448, 2944, 1, 3328, 448, 448, 448, 2944] + - [71, 10596.0] + - - [64, 5888, 1, 256, 64, 64, 64, 5888] + - [104, 6881.0] + - - [6784, 128, 1, 3328, 6784, 6784, 6784, 128] + - [79, 10526.0] + - - [704, 704, 1, 256, 704, 704, 704, 704] + - [91, 7120.0] + - - [448, 704, 1, 1280, 448, 448, 448, 704] + - [71, 9142.0] + - - [1024, 448, 1, 3328, 1024, 1024, 1024, 448] + - [55, 10089.0] + - - [1856, 704, 1, 1280, 1856, 1856, 1856, 704] + - [91, 11101.0] + - - [448, 1408, 1, 1280, 448, 448, 448, 1408] + - [95, 10254.0] + - - [1024, 1024, 1, 1280, 1024, 1024, 1024, 1024] + - [92, 10562.0] + - - [448, 1024, 1, 128, 448, 448, 448, 1024] + - [91, 7214.0] + - - [448, 2368, 1, 3328, 448, 448, 448, 2368] + - [91, 10410.0] + - - [5056, 64, 1, 128, 5056, 5056, 5056, 64] + - [85, 6109.0] + - - [704, 1024, 1, 256, 704, 704, 704, 1024] + - [71, 9029.0] + - - [128, 6784, 1, 1280, 128, 128, 128, 6784] + - [57, 10396.0] + - - [1856, 256, 1, 256, 1856, 1856, 1856, 256] + - [91, 8566.0] + - - [256, 4288, 1, 1280, 256, 256, 256, 4288] + - [73, 11034.0] + - - [256, 1856, 1, 128, 256, 256, 256, 1856] + - [51, 7564.0] + - - [448, 1408, 1, 128, 448, 448, 448, 1408] + - [91, 8290.0] + - - [6784, 128, 1, 256, 6784, 6784, 6784, 128] + - [68, 9533.0] + - - [704, 448, 1, 256, 704, 704, 704, 448] + - [104, 7261.0] + - - [704, 1408, 1, 128, 704, 704, 704, 1408] + - [49, 9102.0] + - - [2944, 448, 1, 128, 2944, 2944, 2944, 448] + - [49, 9647.0] + - - [128, 2944, 1, 128, 128, 128, 128, 2944] + - [50, 6216.0] + - - [1024, 704, 1, 3328, 1024, 1024, 1024, 704] + - [92, 9996.0] + - - [128, 4288, 1, 256, 128, 128, 128, 4288] + - [92, 7815.0] + - - [704, 448, 1, 3328, 704, 704, 704, 448] + - [91, 9347.0] + - - [1024, 1024, 1, 3328, 1024, 1024, 1024, 1024] + - [73, 10652.0] + - - [448, 2368, 1, 1280, 448, 448, 448, 2368] + - [91, 10390.0] + - - [64, 6784, 1, 3328, 64, 64, 64, 6784] + - [75, 9262.0] + - - [2944, 256, 1, 1280, 2944, 2944, 2944, 256] + - [92, 10157.0] + - - [256, 2368, 1, 128, 256, 256, 256, 2368] + - [92, 8134.0] + - - [1856, 704, 1, 256, 1856, 1856, 1856, 704] + - [49, 10735.0] + - - [1408, 448, 1, 3328, 1408, 1408, 1408, 448] + - [77, 10775.0] + - - [2368, 256, 1, 256, 2368, 2368, 2368, 256] + - [104, 8738.0] + - - [1856, 448, 1, 1280, 1856, 1856, 1856, 448] + - [79, 9683.0] + - - [128, 5888, 1, 128, 128, 128, 128, 5888] + - [92, 8660.0] + - - [1024, 1024, 1, 256, 1024, 1024, 1024, 1024] + - [51, 10046.0] + - - [704, 1856, 1, 256, 704, 704, 704, 1856] + - [49, 10755.0] + - - [128, 4288, 1, 3328, 128, 128, 128, 4288] + - [57, 9576.0] + - - [256, 2368, 1, 1280, 256, 256, 256, 2368] + - [93, 10175.0] + - - [2944, 448, 1, 256, 2944, 2944, 2944, 448] + - [71, 10091.0] + - - [1856, 448, 1, 128, 1856, 1856, 1856, 448] + - [71, 8460.0] + - - [2368, 128, 1, 1280, 2368, 2368, 2368, 128] + - [71, 8680.0] + - - [64, 6784, 1, 256, 64, 64, 64, 6784] + - [49, 7827.0] + - - [64, 5056, 1, 1280, 64, 64, 64, 5056] + - [91, 9184.0] + - - [2368, 256, 1, 1280, 2368, 2368, 2368, 256] + - [93, 10170.0] + - - [2368, 448, 1, 1280, 2368, 2368, 2368, 448] + - [73, 10518.0] + - - [128, 3584, 1, 256, 128, 128, 128, 3584] + - [51, 8585.0] + - - [704, 448, 1, 1280, 704, 704, 704, 448] + - [49, 9096.0] + - - [128, 5056, 1, 256, 128, 128, 128, 5056] + - [51, 9014.0] + - - [4288, 256, 1, 1280, 4288, 4288, 4288, 256] + - [51, 10779.0] + - - [4288, 128, 1, 3328, 4288, 4288, 4288, 128] + - [52, 9579.0] + - - [1408, 256, 1, 128, 1408, 1408, 1408, 256] + - [50, 5946.0] + - - [256, 1408, 1, 1280, 256, 256, 256, 1408] + - [92, 7825.0] + - - [128, 2368, 1, 256, 128, 128, 128, 2368] + - [92, 7211.0] + - - [6784, 64, 1, 3328, 6784, 6784, 6784, 64] + - [77, 9525.0] + - - [128, 2944, 1, 3328, 128, 128, 128, 2944] + - [55, 8302.0] + - - [2944, 448, 1, 3328, 2944, 2944, 2944, 448] + - [91, 10604.0] + - - [256, 4288, 1, 256, 256, 256, 256, 4288] + - [106, 10355.0] + - - [5888, 128, 1, 256, 5888, 5888, 5888, 128] + - [92, 9476.0] + - - [2368, 448, 1, 3328, 2368, 2368, 2368, 448] + - [51, 10736.0] + - - [5056, 64, 1, 256, 5056, 5056, 5056, 64] + - [91, 7449.0] + - - [1024, 704, 1, 128, 1024, 1024, 1024, 704] + - [51, 8404.0] + - - [128, 5056, 1, 3328, 128, 128, 128, 5056] + - [64, 11159.0] + - - [4288, 128, 1, 256, 4288, 4288, 4288, 128] + - [49, 7546.0] + - - [1408, 448, 1, 128, 1408, 1408, 1408, 448] + - [73, 8376.0] + - - [128, 5888, 1, 1280, 128, 128, 128, 5888] + - [51, 10282.0] + - - [704, 448, 1, 128, 704, 704, 704, 448] + - [50, 6154.0] + - - [3584, 256, 1, 256, 3584, 3584, 3584, 256] + - [98, 10089.0] + - - [128, 2944, 1, 256, 128, 128, 128, 2944] + - [51, 7146.0] + - - [128, 6784, 1, 128, 128, 128, 128, 6784] + - [49, 8752.0] + - - [448, 1856, 1, 256, 448, 448, 448, 1856] + - [49, 9050.0] + - - [3584, 128, 1, 3328, 3584, 3584, 3584, 128] + - [96, 10068.0] + - - [1024, 448, 1, 1280, 1024, 1024, 1024, 448] + - [55, 9761.0] + - - [5888, 128, 1, 3328, 5888, 5888, 5888, 128] + - [77, 10329.0] + - - [1408, 704, 1, 1280, 1408, 1408, 1408, 704] + - [92, 9940.0] + - - [448, 2944, 1, 256, 448, 448, 448, 2944] + - [49, 10182.0] + - - [448, 2368, 1, 256, 448, 448, 448, 2368] + - [49, 9948.0] + - - [128, 2368, 1, 3328, 128, 128, 128, 2368] + - [74, 9232.0] + - - [5056, 128, 1, 1280, 5056, 5056, 5056, 128] + - [93, 10767.0] + - - [5056, 64, 1, 3328, 5056, 5056, 5056, 64] + - [73, 9574.0] + - - [64, 5888, 1, 128, 64, 64, 64, 5888] + - [50, 6137.0] + - - [5056, 128, 1, 3328, 5056, 5056, 5056, 128] + - [52, 11185.0] + - - [448, 704, 1, 256, 448, 448, 448, 704] + - [71, 7340.0] + - - [2944, 128, 1, 3328, 2944, 2944, 2944, 128] + - [96, 8357.0] + - - [128, 5056, 1, 1280, 128, 128, 128, 5056] + - [74, 10764.0] + - - [704, 704, 1, 128, 704, 704, 704, 704] + - [104, 6706.0] + - - [64, 6784, 1, 1280, 64, 64, 64, 6784] + - [91, 9016.0] + - - [2368, 128, 1, 128, 2368, 2368, 2368, 128] + - [50, 6006.0] + - - [5056, 128, 1, 128, 5056, 5056, 5056, 128] + - [49, 8284.0] + - - [1024, 1024, 1, 1024, 1024, 1024, 1024, 1024] + - [51, 10504.0] + - - [448, 1024, 1, 3328, 448, 448, 448, 1024] + - [49, 9795.0] + - - [256, 2368, 1, 3328, 256, 256, 256, 2368] + - [93, 10538.0] + - - [256, 3584, 1, 128, 256, 256, 256, 3584] + - [92, 9321.0] + - - [4288, 256, 1, 128, 4288, 4288, 4288, 256] + - [49, 9559.0] + - - [2368, 256, 1, 128, 2368, 2368, 2368, 256] + - [49, 8100.0] + - - [256, 1856, 1, 256, 256, 256, 256, 1856] + - [109, 8286.0] + - - [256, 2944, 1, 128, 256, 256, 256, 2944] + - [73, 8722.0] + - - [1408, 256, 1, 3328, 1408, 1408, 1408, 256] + - [73, 8013.0] + - - [2368, 448, 1, 256, 2368, 2368, 2368, 448] + - [49, 9583.0] + - - [4288, 256, 1, 3328, 4288, 4288, 4288, 256] + - [51, 11062.0] + - - [1856, 704, 1, 128, 1856, 1856, 1856, 704] + - [104, 10087.0] + - - [4288, 128, 1, 128, 4288, 4288, 4288, 128] + - [49, 7380.0] + - - [1408, 448, 1, 256, 1408, 1408, 1408, 448] + - [51, 9533.0] + - - [6784, 64, 1, 1280, 6784, 6784, 6784, 64] + - [55, 9259.0] + - - [3584, 128, 1, 128, 3584, 3584, 3584, 128] + - [49, 6129.0] + - - [256, 2368, 1, 256, 256, 256, 256, 2368] + - [92, 9226.0] + - - [2944, 448, 1, 1280, 2944, 2944, 2944, 448] + - [91, 10571.0] + - - [448, 1856, 1, 1280, 448, 448, 448, 1856] + - [57, 9754.0] + - - [1856, 256, 1, 128, 1856, 1856, 1856, 256] + - [50, 6362.0] + - - [5056, 128, 1, 256, 5056, 5056, 5056, 128] + - [51, 9424.0] + - - [448, 1024, 1, 256, 448, 448, 448, 1024] + - [91, 8473.0] + - - [64, 6784, 1, 128, 64, 64, 64, 6784] + - [49, 6696.0] + - - [5888, 64, 1, 1280, 5888, 5888, 5888, 64] + - [51, 8104.0] + - - [128, 3584, 1, 128, 128, 128, 128, 3584] + - [92, 7161.0] + - - [1408, 256, 1, 256, 1408, 1408, 1408, 256] + - [49, 6825.0] + - - [128, 5888, 1, 3328, 128, 128, 128, 5888] + - [63, 10382.0] + - - [1408, 256, 1, 1280, 1408, 1408, 1408, 256] + - [73, 7812.0] + - - [64, 5056, 1, 128, 64, 64, 64, 5056] + - [50, 6200.0] + - - [5888, 64, 1, 128, 5888, 5888, 5888, 64] + - [92, 6090.0] + - - [448, 704, 1, 128, 448, 448, 448, 704] + - [50, 6098.0] + - - [1408, 704, 1, 128, 1408, 1408, 1408, 704] + - [51, 9089.0] + - - [2368, 256, 1, 3328, 2368, 2368, 2368, 256] + - [111, 10474.0] + - - [5888, 128, 1, 1280, 5888, 5888, 5888, 128] + - [92, 10285.0] + - - [256, 3584, 1, 1280, 256, 256, 256, 3584] + - [79, 10968.0] + - - [256, 1408, 1, 128, 256, 256, 256, 1408] + - [94, 5961.0] + - - [256, 4288, 1, 128, 256, 256, 256, 4288] + - [92, 9704.0] + - - [5888, 128, 1, 128, 5888, 5888, 5888, 128] + - [51, 8644.0] + - - [1856, 256, 1, 3328, 1856, 1856, 1856, 256] + - [51, 10320.0] + - - [64, 5888, 1, 1280, 64, 64, 64, 5888] + - [91, 7920.0] + - - [6784, 64, 1, 128, 6784, 6784, 6784, 64] + - [73, 6777.0] + - - [704, 704, 1, 1280, 704, 704, 704, 704] + - [92, 8338.0] + - - [128, 2368, 1, 1280, 128, 128, 128, 2368] + - [92, 8907.0] + - - [3584, 256, 1, 1280, 3584, 3584, 3584, 256] + - [98, 10972.0] + - - [3584, 128, 1, 1280, 3584, 3584, 3584, 128] + - [73, 9757.0] + - - [448, 1856, 1, 3328, 448, 448, 448, 1856] + - [57, 9994.0] + - - [1024, 448, 1, 256, 1024, 1024, 1024, 448] + - [86, 8461.0] + - - [2944, 128, 1, 1280, 2944, 2944, 2944, 128] + - [92, 8137.0] + - - [128, 2368, 1, 128, 128, 128, 128, 2368] + - [50, 5932.0] + - - [256, 2944, 1, 1280, 256, 256, 256, 2944] + - [51, 10293.0] + - - [704, 1024, 1, 3328, 704, 704, 704, 1024] + - [92, 9924.0] + - - [128, 6784, 1, 256, 128, 128, 128, 6784] + - [73, 9508.0] + - - [256, 1856, 1, 3328, 256, 256, 256, 1856] + - [55, 10441.0] + - - [6784, 128, 1, 128, 6784, 6784, 6784, 128] + - [73, 8752.0] + - - [704, 1408, 1, 256, 704, 704, 704, 1408] + - [91, 9455.0] + - - [256, 1408, 1, 256, 256, 256, 256, 1408] + - [51, 6886.0] + - - [448, 2944, 1, 1280, 448, 448, 448, 2944] + - [71, 10555.0] + - - [6784, 128, 1, 1280, 6784, 6784, 6784, 128] + - [110, 10370.0] + - - [1856, 448, 1, 256, 1856, 1856, 1856, 448] + - [71, 9043.0] + - - [128, 4288, 1, 1280, 128, 128, 128, 4288] + - [74, 9256.0] + - - [448, 704, 1, 3328, 448, 448, 448, 704] + - [49, 9338.0] + - - [1856, 704, 1, 3328, 1856, 1856, 1856, 704] + - [91, 11195.0] + - - [3136, 64, 128, 64, 3136, 3136, 3136, 64] + - [47, 6085.0] + - - [3136, 64, 128, 256, 3136, 3136, 3136, 64] + - [92, 8839.0] + - - [3136, 64, 256, 256, 3136, 3136, 3136, 64] + - [73, 8787.0] + - - [3136, 64, 256, 64, 3136, 3136, 3136, 64] + - [49, 5669.0] + - - [64, 1536, 64, 384, 64, 64, 64, 1536] + - [91, 9742.0] + - - [64, 1536, 64, 256, 64, 64, 64, 1536] + - [91, 9892.0] + - - [64, 92, 688, 92, 64, 64, 64, 92] + - [71, 7209.0] + - - [1024, 1024, 1, 3975, 1024, 1024, 1024, 1024] + - [92, 10665.0] + - - [64, 123, 528, 123, 64, 64, 64, 123] + - [91, 9222.0] + - - [64, 102, 624, 100, 64, 64, 64, 102] + - [71, 7825.0] + - - [64, 112, 576, 111, 64, 64, 64, 112] + - [91, 8468.0] + - - [64, 102, 624, 102, 64, 64, 64, 102] + - [49, 7960.0] + - - [64, 133, 480, 135, 64, 64, 64, 133] + - [71, 7612.0] + - - [1024, 1024, 1, 4026, 1024, 1024, 1024, 1024] + - [92, 10646.0] + - - [64, 160, 400, 159, 64, 64, 64, 160] + - [91, 9088.0] + - - [1024, 1024, 1, 3780, 1024, 1024, 1024, 1024] + - [92, 10650.0] + - - [64, 228, 272, 232, 64, 64, 64, 228] + - [49, 9838.0] + - - [1024, 1024, 1, 3822, 1024, 1024, 1024, 1024] + - [86, 10651.0] + - - [64, 77, 816, 77, 64, 64, 64, 77] + - [91, 6218.0] + - - [64, 159, 400, 159, 64, 64, 64, 159] + - [91, 8981.0] + - - [64, 135, 480, 134, 64, 64, 64, 135] + - [91, 7693.0] + - - [64, 99, 624, 99, 64, 64, 64, 99] + - [91, 7568.0] + - - [1024, 1024, 1, 3942, 1024, 1024, 1024, 1024] + - [73, 10674.0] + - - [1024, 1024, 1, 3861, 1024, 1024, 1024, 1024] + - [73, 10652.0] + - - [1024, 1024, 1, 4000, 1024, 1024, 1024, 1024] + - [92, 10669.0] + - - [1024, 1024, 1, 3870, 1024, 1024, 1024, 1024] + - [92, 10636.0] + - - [64, 65, 992, 65, 64, 64, 64, 65] + - [104, 5147.0] + - - [64, 133, 480, 133, 64, 64, 64, 133] + - [91, 7562.0] + - - [64, 232, 272, 232, 64, 64, 64, 232] + - [71, 10010.0] + - - [64, 148, 432, 148, 64, 64, 64, 148] + - [49, 8344.0] + - - [1024, 1024, 1, 4032, 1024, 1024, 1024, 1024] + - [73, 10698.0] + - - [1024, 1024, 1, 4012, 1024, 1024, 1024, 1024] + - [92, 10652.0] + - - [1024, 1024, 1, 3681, 1024, 1024, 1024, 1024] + - [92, 10647.0] + - - [1024, 1024, 1, 3927, 1024, 1024, 1024, 1024] + - [92, 10655.0] + - - [1024, 1024, 1, 3894, 1024, 1024, 1024, 1024] + - [92, 10656.0] + - - [64, 132, 480, 135, 64, 64, 64, 132] + - [91, 7444.0] + - - [64, 135, 480, 135, 64, 64, 64, 135] + - [49, 7699.0] + - - [1024, 1024, 1, 3876, 1024, 1024, 1024, 1024] + - [73, 10680.0] + - - [64, 84, 752, 85, 64, 64, 64, 84] + - [91, 6639.0] + - - [1024, 1024, 1, 4050, 1024, 1024, 1024, 1024] + - [92, 10693.0] + - - [64, 132, 480, 132, 64, 64, 64, 132] + - [49, 7476.0] + - - [64, 99, 624, 102, 64, 64, 64, 99] + - [71, 7797.0] + - - [64, 143, 432, 148, 64, 64, 64, 143] + - [49, 8195.0] + - - [1024, 1024, 1, 3584, 1024, 1024, 1024, 1024] + - [92, 10655.0] + - - [64, 162, 400, 162, 64, 64, 64, 162] + - [71, 9148.0] + - - [64, 148, 432, 147, 64, 64, 64, 148] + - [49, 8368.0] + - - [1024, 1024, 1, 3960, 1024, 1024, 1024, 1024] + - [73, 10687.0] + - - [64, 123, 528, 122, 64, 64, 64, 123] + - [91, 9193.0] + - - [64, 102, 624, 101, 64, 64, 64, 102] + - [71, 7912.0] + - - [1024, 1024, 1, 3978, 1024, 1024, 1024, 1024] + - [92, 10661.0] + - - [64, 160, 400, 160, 64, 64, 64, 160] + - [71, 9080.0] + - - [1024, 1024, 1, 3995, 1024, 1024, 1024, 1024] + - [106, 10662.0] + - - [64, 132, 480, 134, 64, 64, 64, 132] + - [49, 7448.0] + - - [64, 111, 576, 111, 64, 64, 64, 111] + - [91, 8480.0] + - - [64, 100, 624, 100, 64, 64, 64, 100] + - [91, 7877.0] + - - [1024, 1024, 1, 3977, 1024, 1024, 1024, 1024] + - [92, 10651.0] + - - [64, 112, 576, 112, 64, 64, 64, 112] + - [71, 8712.0] + - - [64, 159, 400, 162, 64, 64, 64, 159] + - [91, 8837.0] + - - [64, 122, 528, 122, 64, 64, 64, 122] + - [71, 9235.0] + - - [64, 228, 272, 228, 64, 64, 64, 228] + - [49, 9836.0] + - - [1024, 1024, 1, 3925, 1024, 1024, 1024, 1024] + - [92, 10659.0] + - - [64, 93, 688, 93, 64, 64, 64, 93] + - [91, 7429.0] + - - [1024, 1024, 1, 3956, 1024, 1024, 1024, 1024] + - [73, 10675.0] + - - [1024, 1024, 1, 3976, 1024, 1024, 1024, 1024] + - [92, 10679.0] + - - [64, 111, 576, 112, 64, 64, 64, 111] + - [49, 8465.0] + - - [64, 100, 624, 102, 64, 64, 64, 100] + - [71, 7828.0] + - - [1024, 1024, 1, 3955, 1024, 1024, 1024, 1024] + - [73, 10658.0] + - - [1024, 1024, 1, 4030, 1024, 1024, 1024, 1024] + - [73, 10656.0] + - - [1024, 1024, 1, 3906, 1024, 1024, 1024, 1024] + - [92, 10666.0] + - - [64, 101, 624, 102, 64, 64, 64, 101] + - [91, 7810.0] + - - [1024, 1024, 1, 3796, 1024, 1024, 1024, 1024] + - [92, 10655.0] + - - [1024, 1024, 1, 3859, 1024, 1024, 1024, 1024] + - [92, 10660.0] + - - [64, 71, 896, 71, 64, 64, 64, 71] + - [71, 5749.0] + - - [1024, 1024, 1, 3860, 1024, 1024, 1024, 1024] + - [92, 10661.0] + - - [1024, 1024, 1, 4005, 1024, 1024, 1024, 1024] + - [92, 10700.0] + - - [64, 84, 752, 84, 64, 64, 64, 84] + - [91, 6688.0] + - - [1024, 1024, 1, 3990, 1024, 1024, 1024, 1024] + - [92, 10719.0] + - - [64, 134, 480, 134, 64, 64, 64, 134] + - [71, 7632.0] + - - [64, 78, 816, 78, 64, 64, 64, 78] + - [71, 6322.0] + - - [1024, 1024, 1, 3999, 1024, 1024, 1024, 1024] + - [51, 10693.0] + - - [1024, 1024, 1, 4020, 1024, 1024, 1024, 1024] + - [92, 10660.0] + - - [1024, 1024, 1, 3939, 1024, 1024, 1024, 1024] + - [92, 10655.0] + - - [64, 77, 816, 78, 64, 64, 64, 77] + - [91, 6216.0] + - - [1024, 1024, 1, 4059, 1024, 1024, 1024, 1024] + - [73, 10662.0] + - - [1024, 1024, 1, 3944, 1024, 1024, 1024, 1024] + - [92, 10649.0] + - - [64, 193, 320, 193, 64, 64, 64, 193] + - [61, 8374.0] + - - [1024, 1024, 1, 3720, 1024, 1024, 1024, 1024] + - [73, 10693.0] + - - [1024, 1024, 1, 3910, 1024, 1024, 1024, 1024] + - [92, 10671.0] + - - [64, 143, 432, 143, 64, 64, 64, 143] + - [91, 8081.0] + - - [64, 92, 688, 93, 64, 64, 64, 92] + - [49, 7298.0] + - - [64, 101, 624, 101, 64, 64, 64, 101] + - [91, 7859.0] + - - [1024, 1024, 1, 3969, 1024, 1024, 1024, 1024] + - [63, 10660.0] + - - [1024, 1024, 1, 3948, 1024, 1024, 1024, 1024] + - [73, 10660.0] + - - [1024, 1024, 1, 3996, 1024, 1024, 1024, 1024] + - [92, 10648.0] + - - [1024, 1024, 1, 3900, 1024, 1024, 1024, 1024] + - [92, 10668.0] + - - [1024, 1024, 1, 3640, 1024, 1024, 1024, 1024] + - [73, 10671.0] + - - [64, 147, 432, 147, 64, 64, 64, 147] + - [71, 8382.0] + - - [1024, 1024, 1, 3751, 1024, 1024, 1024, 1024] + - [73, 10652.0] + - - [64, 177, 352, 177, 64, 64, 64, 177] + - [91, 9827.0] + - - [64, 85, 752, 85, 64, 64, 64, 85] + - [71, 6877.0] + - - [1024, 1024, 1, 3712, 1024, 1024, 1024, 1024] + - [73, 10659.0] + - - [1024, 1024, 1, 128, 1024, 1024, 1024, 1024] + - [91, 8960.0] + - - [64, 256, 192, 256, 64, 64, 64, 256] + - [71, 11225.0] + - - [64, 128, 384, 128, 64, 64, 64, 128] + - [49, 10807.0] + - - [64, 192, 36, 25088, 64, 64, 64, 192] + - [65, 8530.0] + - - [128, 128, 64, 25, 128, 128, 128, 128] + - [90, 3651.0] + - - [64, 192, 64, 3200, 64, 64, 64, 192] + - [61, 8990.0] + - - [64, 128, 64, 23104, 64, 64, 64, 128] + - [49, 7423.0] + - - [128, 128, 64, 1600, 128, 128, 128, 128] + - [66, 10372.0] + - - [80, 192, 64, 4608, 80, 80, 80, 192] + - [77, 7054.0] + - - [64, 128, 36, 30, 64, 64, 64, 128] + - [101, 2404.0] + - - [64, 128, 64, 11552, 64, 64, 64, 128] + - [84, 7404.0] + - - [128, 192, 64, 946, 128, 128, 128, 192] + - [51, 11073.0] + - - [64, 192, 64, 12800, 64, 64, 64, 192] + - [104, 8857.0] + - - [224, 224, 64, 128, 224, 224, 224, 224] + - [71, 8069.0] + - - [128, 128, 64, 3360, 128, 128, 128, 128] + - [96, 10397.0] + - - [128, 128, 64, 420, 128, 128, 128, 128] + - [92, 9932.0] + - - [64, 128, 64, 361, 64, 64, 64, 128] + - [91, 7906.0] + - - [64, 128, 36, 53824, 64, 64, 64, 128] + - [108, 7850.0] + - - [128, 160, 36, 512, 128, 128, 128, 160] + - [51, 8042.0] + - - [147, 64, 36, 18816, 147, 147, 147, 64] + - [75, 7310.0] + - - [96, 128, 64, 946, 96, 96, 96, 128] + - [92, 7670.0] + - - [128, 128, 64, 50, 128, 128, 128, 128] + - [49, 6242.0] + - - [160, 224, 36, 128, 160, 160, 160, 224] + - [49, 6802.0] + - - [192, 224, 64, 1152, 192, 192, 192, 224] + - [84, 9913.0] + - - [128, 128, 36, 784, 128, 128, 128, 128] + - [74, 9606.0] + - - [96, 128, 64, 288, 96, 96, 96, 128] + - [106, 7056.0] + - - [128, 128, 64, 400, 128, 128, 128, 128] + - [51, 10210.0] + - - [128, 128, 64, 800, 128, 128, 128, 128] + - [73, 10441.0] + - - [96, 128, 36, 512, 96, 96, 96, 128] + - [86, 6814.0] + - - [96, 128, 64, 800, 96, 96, 96, 128] + - [73, 7737.0] + - - [192, 224, 64, 128, 192, 192, 192, 224] + - [49, 9113.0] + - - [128, 128, 64, 288, 128, 128, 128, 128] + - [92, 10013.0] + - - [96, 208, 36, 512, 96, 96, 96, 208] + - [57, 6232.0] + - - [64, 128, 36, 1568, 64, 64, 64, 128] + - [91, 8454.0] + - - [192, 192, 36, 512, 192, 192, 192, 192] + - [49, 10386.0] + - - [128, 128, 36, 512, 128, 128, 128, 128] + - [93, 9587.0] + - - [96, 208, 64, 1152, 96, 96, 96, 208] + - [51, 6967.0] + - - [128, 192, 64, 3200, 128, 128, 128, 192] + - [55, 11247.0] + - - [160, 160, 64, 288, 160, 160, 160, 160] + - [49, 7224.0] + - - [128, 128, 36, 440, 128, 128, 128, 128] + - [74, 9465.0] + - - [96, 128, 36, 1568, 96, 96, 96, 128] + - [52, 7417.0] + - - [112, 224, 36, 2048, 112, 112, 112, 224] + - [88, 8180.0] + - - [128, 128, 36, 7040, 128, 128, 128, 128] + - [111, 10395.0] + - - [128, 128, 36, 1568, 128, 128, 128, 128] + - [74, 10018.0] + - - [160, 224, 64, 128, 160, 160, 160, 224] + - [104, 7513.0] + - - [192, 224, 36, 2592, 192, 192, 192, 224] + - [91, 9747.0] + - - [64, 128, 64, 2888, 64, 64, 64, 128] + - [87, 8330.0] + - - [64, 128, 36, 480, 64, 64, 64, 128] + - [71, 7085.0] + - - [147, 64, 64, 9702, 147, 147, 147, 64] + - [104, 7617.0] + - - [64, 192, 64, 3698, 64, 64, 64, 192] + - [84, 8918.0] + - - [73, 192, 64, 10439, 73, 73, 73, 192] + - [77, 6485.0] + - - [128, 128, 36, 880, 128, 128, 128, 128] + - [93, 9687.0] + - - [192, 224, 36, 128, 192, 192, 192, 224] + - [61, 8063.0] + - - [64, 128, 36, 12544, 64, 64, 64, 128] + - [65, 7850.0] + - - [160, 160, 36, 512, 160, 160, 160, 160] + - [91, 7066.0] + - - [128, 128, 36, 3136, 128, 128, 128, 128] + - [57, 10225.0] + - - [112, 224, 36, 512, 112, 112, 112, 224] + - [51, 7674.0] + - - [128, 128, 36, 49, 128, 128, 128, 128] + - [69, 4327.0] + - - [112, 224, 64, 1152, 112, 112, 112, 224] + - [63, 8763.0] + - - [128, 192, 36, 1568, 128, 128, 128, 192] + - [73, 10241.0] + - - [128, 192, 36, 512, 128, 128, 128, 192] + - [73, 9796.0] + - - [192, 192, 64, 288, 192, 192, 192, 192] + - [71, 10984.0] + - - [96, 208, 64, 242, 96, 96, 96, 208] + - [51, 6561.0] + - - [64, 128, 64, 5776, 64, 64, 64, 128] + - [104, 7397.0] + - - [128, 192, 64, 288, 128, 128, 128, 192] + - [51, 10539.0] + - - [96, 128, 36, 6272, 96, 96, 96, 128] + - [111, 7747.0] + - - [96, 128, 64, 3200, 96, 96, 96, 128] + - [55, 7855.0] + - - [128, 192, 64, 800, 128, 128, 128, 192] + - [92, 10978.0] + - - [64, 128, 64, 10, 64, 64, 64, 128] + - [97, 1618.0] + - - [96, 208, 64, 288, 96, 96, 96, 208] + - [73, 6694.0] + - - [64, 128, 64, 160, 64, 64, 64, 128] + - [49, 7219.0] + - - [128, 128, 64, 1568, 128, 128, 128, 128] + - [109, 10354.0] + - - [112, 224, 64, 242, 112, 112, 112, 224] + - [91, 7891.0] + - - [160, 192, 64, 288, 160, 160, 160, 192] + - [49, 8937.0] + - - [128, 160, 64, 288, 128, 128, 128, 160] + - [51, 8861.0] + - - [128, 128, 64, 210, 128, 128, 128, 128] + - [92, 9283.0] + - - [73, 192, 36, 23360, 73, 73, 73, 192] + - [96, 5968.0] + - - [160, 192, 36, 512, 160, 160, 160, 192] + - [49, 8320.0] + - - [64, 128, 64, 722, 64, 64, 64, 128] + - [95, 8312.0] + - - [112, 224, 64, 288, 112, 112, 112, 224] + - [91, 8240.0] + - - [64, 192, 36, 6272, 64, 64, 64, 192] + - [75, 8679.0] + - - [64, 128, 36, 6272, 64, 64, 64, 128] + - [99, 8310.0] + - - [128, 128, 36, 3200, 128, 128, 128, 128] + - [88, 10213.0] + - - [128, 128, 36, 392, 128, 128, 128, 128] + - [74, 8900.0] + - - [80, 192, 36, 10368, 80, 80, 80, 192] + - [77, 6513.0] + - - [224, 224, 36, 128, 224, 224, 224, 224] + - [49, 7631.0] + - - [64, 128, 36, 784, 64, 64, 64, 128] + - [71, 8090.0] + - - [128, 128, 64, 200, 128, 128, 128, 128] + - [51, 9691.0] + - - [5329, 64, 32, 80, 5329, 5329, 5329, 64] + - [51, 10668.0] + - - [64, 2048, 32, 384, 64, 64, 64, 2048] + - [91, 11155.0] + - - [289, 1792, 1, 320, 289, 289, 289, 1792] + - [49, 7218.0] + - - [1001, 1024, 1, 32, 1001, 1001, 1001, 1024] + - [70, 4955.0] + - - [784, 400, 1, 32, 784, 784, 784, 400] + - [60, 2987.0] + - - [64, 1536, 32, 256, 64, 64, 64, 1536] + - [104, 11222.0] + - - [289, 2592, 1, 384, 289, 289, 289, 2592] + - [49, 7973.0] + - - [64, 2048, 32, 448, 64, 64, 64, 2048] + - [61, 10613.0] + - - [289, 2016, 1, 256, 289, 289, 289, 2016] + - [49, 7610.0] + - - [64, 1536, 32, 384, 64, 64, 64, 1536] + - [71, 11268.0] + - - [64, 1280, 32, 320, 64, 64, 64, 1280] + - [49, 11087.0] + - - [289, 3456, 1, 384, 289, 289, 289, 3456] + - [49, 9260.0] + - - [64, 1280, 32, 384, 64, 64, 64, 1280] + - [71, 11072.0] + - - [729, 1600, 1, 192, 729, 729, 729, 1600] + - [106, 9339.0] + - - [289, 1344, 1, 192, 289, 289, 289, 1344] + - [49, 6396.0] + - - [64, 2048, 32, 320, 64, 64, 64, 2048] + - [71, 11357.0] + - - [64, 1280, 32, 448, 64, 64, 64, 1280] + - [49, 11164.0] + - - [64, 1280, 32, 192, 64, 64, 64, 1280] + - [49, 10654.0] + - - [289, 1792, 1, 256, 289, 289, 289, 1792] + - [71, 7448.0] + - - [64, 2048, 32, 192, 64, 64, 64, 2048] + - [49, 11132.0] + - - [5329, 64, 128, 80, 5329, 5329, 5329, 64] + - [47, 5886.0] + - - [64, 1280, 128, 448, 64, 64, 64, 1280] + - [71, 9664.0] + - - [64, 2048, 128, 192, 64, 64, 64, 2048] + - [61, 8299.0] + - - [64, 1280, 128, 384, 64, 64, 64, 1280] + - [61, 9391.0] + - - [64, 1280, 128, 320, 64, 64, 64, 1280] + - [104, 9507.0] + - - [64, 1280, 128, 192, 64, 64, 64, 1280] + - [61, 8134.0] + - - [256, 4096, 1, 6400, 256, 256, 256, 4096] + - [92, 10681.0] + - - [512, 2048, 1, 3427, 512, 512, 512, 2048] + - [92, 10631.0] + - - [512, 2048, 1, 3552, 512, 512, 512, 2048] + - [106, 10646.0] + - - [512, 2048, 1, 3840, 512, 512, 512, 2048] + - [92, 10670.0] + - - [2048, 512, 1, 3427, 2048, 2048, 2048, 512] + - [73, 10664.0] + - - [2048, 512, 1, 3452, 2048, 2048, 2048, 512] + - [92, 10626.0] + - - [2048, 512, 1, 3472, 2048, 2048, 2048, 512] + - [55, 10633.0] + - - [2048, 512, 1, 3475, 2048, 2048, 2048, 512] + - [106, 10625.0] + - - [64, 64, 496, 64, 64, 64, 64, 64] + - [105, 7128.0] + - - [64, 64, 496, 65, 64, 64, 64, 64] + - [91, 7419.0] + - - [64, 65, 496, 65, 64, 64, 64, 65] + - [49, 4924.0] + - - [64, 71, 448, 71, 64, 64, 64, 71] + - [49, 5298.0] + - - [64, 77, 408, 77, 64, 64, 64, 77] + - [71, 5721.0] + - - [64, 77, 408, 78, 64, 64, 64, 77] + - [49, 5804.0] + - - [64, 78, 408, 78, 64, 64, 64, 78] + - [49, 5773.0] + - - [64, 85, 376, 85, 64, 64, 64, 85] + - [71, 6071.0] + - - [64, 93, 344, 93, 64, 64, 64, 93] + - [91, 6677.0] + - - [64, 112, 288, 112, 64, 64, 64, 112] + - [49, 7881.0] + - - [64, 122, 264, 122, 64, 64, 64, 122] + - [91, 8311.0] + - - [64, 123, 264, 122, 64, 64, 64, 123] + - [49, 7998.0] + - - [64, 123, 264, 123, 64, 64, 64, 123] + - [49, 8375.0] + - - [64, 134, 240, 134, 64, 64, 64, 134] + - [91, 7105.0] + - - [64, 135, 240, 134, 64, 64, 64, 135] + - [49, 7154.0] + - - [64, 135, 240, 135, 64, 64, 64, 135] + - [49, 7200.0] + - - [64, 1280, 64, 192, 64, 64, 64, 1280] + - [71, 11333.0] + - - [64, 1280, 64, 320, 64, 64, 64, 1280] + - [49, 10719.0] + - - [64, 1280, 64, 384, 64, 64, 64, 1280] + - [71, 9894.0] + - - [64, 1280, 64, 448, 64, 64, 64, 1280] + - [49, 9790.0] + - - [64, 2048, 64, 192, 64, 64, 64, 2048] + - [61, 9048.0] + - - [64, 2048, 64, 320, 64, 64, 64, 2048] + - [61, 9510.0] + - - [64, 2048, 64, 384, 64, 64, 64, 2048] + - [49, 10092.0] + - - [64, 2048, 64, 448, 64, 64, 64, 2048] + - [71, 9991.0] + - - [3136, 64, 64, 64, 3136, 3136, 3136, 64] + - [61, 10425.0] + - - [3136, 64, 64, 256, 3136, 3136, 3136, 64] + - [55, 8927.0] + - - [5329, 64, 64, 80, 5329, 5329, 5329, 64] + - [47, 5999.0] + - - [257, 4096, 1, 1024, 257, 257, 257, 4096] + - [49, 8914.0] + - - [512, 2048, 1, 2790, 512, 512, 512, 2048] + - [92, 10653.0] + - - [512, 2048, 1, 2864, 512, 512, 512, 2048] + - [51, 10645.0] + - - [512, 2048, 1, 3092, 512, 512, 512, 2048] + - [92, 10636.0] + - - [512, 2048, 1, 3113, 512, 512, 512, 2048] + - [92, 10663.0] + - - [512, 2048, 1, 3137, 512, 512, 512, 2048] + - [92, 10628.0] + - - [512, 2048, 1, 3165, 512, 512, 512, 2048] + - [73, 10621.0] + - - [512, 2048, 1, 3166, 512, 512, 512, 2048] + - [63, 10632.0] + - - [512, 2048, 1, 3194, 512, 512, 512, 2048] + - [92, 10636.0] + - - [512, 2048, 1, 3219, 512, 512, 512, 2048] + - [92, 10681.0] + - - [512, 2048, 1, 3222, 512, 512, 512, 2048] + - [92, 10646.0] + - - [512, 2048, 1, 3234, 512, 512, 512, 2048] + - [92, 10630.0] + - - [512, 2048, 1, 3237, 512, 512, 512, 2048] + - [51, 10638.0] + - - [512, 2048, 1, 3242, 512, 512, 512, 2048] + - [92, 10633.0] + - - [512, 2048, 1, 3246, 512, 512, 512, 2048] + - [92, 10642.0] + - - [512, 2048, 1, 3249, 512, 512, 512, 2048] + - [86, 10604.0] + - - [512, 2048, 1, 3251, 512, 512, 512, 2048] + - [92, 10644.0] + - - [512, 2048, 1, 3257, 512, 512, 512, 2048] + - [92, 10645.0] + - - [512, 2048, 1, 3262, 512, 512, 512, 2048] + - [92, 10660.0] + - - [512, 2048, 1, 3268, 512, 512, 512, 2048] + - [63, 10667.0] + - - [512, 2048, 1, 3282, 512, 512, 512, 2048] + - [92, 10690.0] + - - [512, 2048, 1, 3286, 512, 512, 512, 2048] + - [73, 10646.0] + - - [512, 2048, 1, 3287, 512, 512, 512, 2048] + - [92, 10657.0] + - - [512, 2048, 1, 3293, 512, 512, 512, 2048] + - [106, 10642.0] + - - [512, 2048, 1, 3297, 512, 512, 512, 2048] + - [92, 10652.0] + - - [512, 2048, 1, 3307, 512, 512, 512, 2048] + - [92, 10627.0] + - - [512, 2048, 1, 3314, 512, 512, 512, 2048] + - [73, 10638.0] + - - [512, 2048, 1, 3315, 512, 512, 512, 2048] + - [92, 10638.0] + - - [512, 2048, 1, 3319, 512, 512, 512, 2048] + - [92, 10681.0] + - - [512, 2048, 1, 3322, 512, 512, 512, 2048] + - [92, 10651.0] + - - [512, 2048, 1, 3323, 512, 512, 512, 2048] + - [73, 10643.0] + - - [512, 2048, 1, 3324, 512, 512, 512, 2048] + - [92, 10653.0] + - - [512, 2048, 1, 3325, 512, 512, 512, 2048] + - [92, 10637.0] + - - [512, 2048, 1, 3327, 512, 512, 512, 2048] + - [73, 10677.0] + - - [512, 2048, 1, 3329, 512, 512, 512, 2048] + - [92, 10649.0] + - - [512, 2048, 1, 3332, 512, 512, 512, 2048] + - [92, 10670.0] + - - [512, 2048, 1, 3336, 512, 512, 512, 2048] + - [106, 10658.0] + - - [512, 2048, 1, 3339, 512, 512, 512, 2048] + - [106, 10635.0] + - - [512, 2048, 1, 3342, 512, 512, 512, 2048] + - [51, 10635.0] + - - [512, 2048, 1, 3344, 512, 512, 512, 2048] + - [63, 10648.0] + - - [512, 2048, 1, 3358, 512, 512, 512, 2048] + - [92, 10645.0] + - - [512, 2048, 1, 3360, 512, 512, 512, 2048] + - [86, 10626.0] + - - [512, 2048, 1, 3364, 512, 512, 512, 2048] + - [92, 10656.0] + - - [512, 2048, 1, 3365, 512, 512, 512, 2048] + - [92, 10640.0] + - - [512, 2048, 1, 3369, 512, 512, 512, 2048] + - [92, 10631.0] + - - [512, 2048, 1, 3371, 512, 512, 512, 2048] + - [51, 10645.0] + - - [512, 2048, 1, 3374, 512, 512, 512, 2048] + - [92, 10645.0] + - - [512, 2048, 1, 3376, 512, 512, 512, 2048] + - [73, 10645.0] + - - [512, 2048, 1, 3377, 512, 512, 512, 2048] + - [92, 10654.0] + - - [512, 2048, 1, 3378, 512, 512, 512, 2048] + - [92, 10675.0] + - - [512, 2048, 1, 3381, 512, 512, 512, 2048] + - [92, 10656.0] + - - [512, 2048, 1, 3382, 512, 512, 512, 2048] + - [73, 10654.0] + - - [512, 2048, 1, 3383, 512, 512, 512, 2048] + - [73, 10655.0] + - - [512, 2048, 1, 3384, 512, 512, 512, 2048] + - [92, 10647.0] + - - [512, 2048, 1, 3385, 512, 512, 512, 2048] + - [63, 10632.0] + - - [512, 2048, 1, 3386, 512, 512, 512, 2048] + - [51, 10635.0] + - - [512, 2048, 1, 3388, 512, 512, 512, 2048] + - [86, 10630.0] + - - [512, 2048, 1, 3390, 512, 512, 512, 2048] + - [92, 10675.0] + - - [512, 2048, 1, 3391, 512, 512, 512, 2048] + - [106, 10655.0] + - - [512, 2048, 1, 3396, 512, 512, 512, 2048] + - [92, 10644.0] + - - [512, 2048, 1, 3399, 512, 512, 512, 2048] + - [51, 10640.0] + - - [512, 2048, 1, 3402, 512, 512, 512, 2048] + - [92, 10633.0] + - - [512, 2048, 1, 3410, 512, 512, 512, 2048] + - [92, 10643.0] + - - [512, 2048, 1, 3412, 512, 512, 512, 2048] + - [92, 10637.0] + - - [512, 2048, 1, 3414, 512, 512, 512, 2048] + - [92, 10637.0] + - - [512, 2048, 1, 3415, 512, 512, 512, 2048] + - [92, 10645.0] + - - [512, 2048, 1, 3418, 512, 512, 512, 2048] + - [92, 10640.0] + - - [512, 2048, 1, 3420, 512, 512, 512, 2048] + - [92, 10628.0] + - - [512, 2048, 1, 3422, 512, 512, 512, 2048] + - [92, 10649.0] + - - [512, 2048, 1, 3425, 512, 512, 512, 2048] + - [92, 10633.0] + - - [512, 2048, 1, 3426, 512, 512, 512, 2048] + - [73, 10661.0] + - - [512, 2048, 1, 3428, 512, 512, 512, 2048] + - [92, 10645.0] + - - [512, 2048, 1, 3430, 512, 512, 512, 2048] + - [92, 10687.0] + - - [512, 2048, 1, 3431, 512, 512, 512, 2048] + - [92, 10645.0] + - - [512, 2048, 1, 3432, 512, 512, 512, 2048] + - [106, 10658.0] + - - [512, 2048, 1, 3438, 512, 512, 512, 2048] + - [92, 10652.0] + - - [512, 2048, 1, 3439, 512, 512, 512, 2048] + - [92, 10646.0] + - - [512, 2048, 1, 3440, 512, 512, 512, 2048] + - [92, 10674.0] + - - [512, 2048, 1, 3443, 512, 512, 512, 2048] + - [73, 10651.0] + - - [512, 2048, 1, 3445, 512, 512, 512, 2048] + - [92, 10651.0] + - - [512, 2048, 1, 3447, 512, 512, 512, 2048] + - [92, 10645.0] + - - [512, 2048, 1, 3448, 512, 512, 512, 2048] + - [92, 10668.0] + - - [512, 2048, 1, 3450, 512, 512, 512, 2048] + - [92, 10656.0] + - - [512, 2048, 1, 3451, 512, 512, 512, 2048] + - [73, 10647.0] + - - [512, 2048, 1, 3452, 512, 512, 512, 2048] + - [92, 10660.0] + - - [512, 2048, 1, 3453, 512, 512, 512, 2048] + - [92, 10652.0] + - - [512, 2048, 1, 3455, 512, 512, 512, 2048] + - [92, 10651.0] + - - [512, 2048, 1, 3456, 512, 512, 512, 2048] + - [92, 10675.0] + - - [512, 2048, 1, 3457, 512, 512, 512, 2048] + - [51, 10685.0] + - - [512, 2048, 1, 3458, 512, 512, 512, 2048] + - [92, 10709.0] + - - [512, 2048, 1, 3459, 512, 512, 512, 2048] + - [106, 10677.0] + - - [512, 2048, 1, 3460, 512, 512, 512, 2048] + - [92, 10638.0] + - - [512, 2048, 1, 3461, 512, 512, 512, 2048] + - [92, 10676.0] + - - [512, 2048, 1, 3462, 512, 512, 512, 2048] + - [92, 10631.0] + - - [512, 2048, 1, 3466, 512, 512, 512, 2048] + - [92, 10650.0] + - - [512, 2048, 1, 3467, 512, 512, 512, 2048] + - [73, 10665.0] + - - [512, 2048, 1, 3468, 512, 512, 512, 2048] + - [92, 10637.0] + - - [512, 2048, 1, 3470, 512, 512, 512, 2048] + - [63, 10625.0] + - - [512, 2048, 1, 3471, 512, 512, 512, 2048] + - [92, 10681.0] + - - [512, 2048, 1, 3472, 512, 512, 512, 2048] + - [92, 10672.0] + - - [512, 2048, 1, 3475, 512, 512, 512, 2048] + - [92, 10645.0] + - - [512, 2048, 1, 3476, 512, 512, 512, 2048] + - [92, 10666.0] + - - [512, 2048, 1, 3477, 512, 512, 512, 2048] + - [92, 10649.0] + - - [512, 2048, 1, 3478, 512, 512, 512, 2048] + - [92, 10650.0] + - - [512, 2048, 1, 3479, 512, 512, 512, 2048] + - [73, 10644.0] + - - [512, 2048, 1, 3480, 512, 512, 512, 2048] + - [92, 10649.0] + - - [512, 2048, 1, 3481, 512, 512, 512, 2048] + - [92, 10642.0] + - - [512, 2048, 1, 3483, 512, 512, 512, 2048] + - [92, 10678.0] + - - [512, 2048, 1, 3484, 512, 512, 512, 2048] + - [73, 10663.0] + - - [512, 2048, 1, 3487, 512, 512, 512, 2048] + - [63, 10629.0] + - - [512, 2048, 1, 3489, 512, 512, 512, 2048] + - [51, 10688.0] + - - [512, 2048, 1, 3490, 512, 512, 512, 2048] + - [92, 10663.0] + - - [512, 2048, 1, 3491, 512, 512, 512, 2048] + - [92, 10641.0] + - - [512, 2048, 1, 3493, 512, 512, 512, 2048] + - [73, 10677.0] + - - [512, 2048, 1, 3494, 512, 512, 512, 2048] + - [92, 10639.0] + - - [512, 2048, 1, 3495, 512, 512, 512, 2048] + - [73, 10624.0] + - - [512, 2048, 1, 3497, 512, 512, 512, 2048] + - [92, 10667.0] + - - [512, 2048, 1, 3498, 512, 512, 512, 2048] + - [92, 10643.0] + - - [512, 2048, 1, 3499, 512, 512, 512, 2048] + - [92, 10650.0] + - - [512, 2048, 1, 3501, 512, 512, 512, 2048] + - [92, 10646.0] + - - [512, 2048, 1, 3503, 512, 512, 512, 2048] + - [92, 10652.0] + - - [512, 2048, 1, 3507, 512, 512, 512, 2048] + - [92, 10645.0] + - - [512, 2048, 1, 3508, 512, 512, 512, 2048] + - [73, 10669.0] + - - [512, 2048, 1, 3509, 512, 512, 512, 2048] + - [92, 10650.0] + - - [512, 2048, 1, 3511, 512, 512, 512, 2048] + - [51, 10652.0] + - - [512, 2048, 1, 3514, 512, 512, 512, 2048] + - [92, 10679.0] + - - [512, 2048, 1, 3515, 512, 512, 512, 2048] + - [106, 10650.0] + - - [512, 2048, 1, 3517, 512, 512, 512, 2048] + - [92, 10643.0] + - - [512, 2048, 1, 3518, 512, 512, 512, 2048] + - [92, 10639.0] + - - [512, 2048, 1, 3519, 512, 512, 512, 2048] + - [92, 10641.0] + - - [512, 2048, 1, 3520, 512, 512, 512, 2048] + - [92, 10672.0] + - - [512, 2048, 1, 3523, 512, 512, 512, 2048] + - [51, 10640.0] + - - [512, 2048, 1, 3528, 512, 512, 512, 2048] + - [92, 10665.0] + - - [512, 2048, 1, 3529, 512, 512, 512, 2048] + - [92, 10629.0] + - - [512, 2048, 1, 3530, 512, 512, 512, 2048] + - [92, 10643.0] + - - [512, 2048, 1, 3532, 512, 512, 512, 2048] + - [92, 10684.0] + - - [512, 2048, 1, 3533, 512, 512, 512, 2048] + - [92, 10653.0] + - - [512, 2048, 1, 3534, 512, 512, 512, 2048] + - [92, 10644.0] + - - [512, 2048, 1, 3538, 512, 512, 512, 2048] + - [92, 10654.0] + - - [512, 2048, 1, 3539, 512, 512, 512, 2048] + - [92, 10661.0] + - - [512, 2048, 1, 3541, 512, 512, 512, 2048] + - [63, 10664.0] + - - [512, 2048, 1, 3547, 512, 512, 512, 2048] + - [73, 10641.0] + - - [512, 2048, 1, 3548, 512, 512, 512, 2048] + - [73, 10685.0] + - - [512, 2048, 1, 3564, 512, 512, 512, 2048] + - [92, 10630.0] + - - [512, 2048, 1, 3575, 512, 512, 512, 2048] + - [92, 10641.0] + - - [512, 2048, 1, 3598, 512, 512, 512, 2048] + - [92, 10668.0] + - - [512, 2048, 1, 3599, 512, 512, 512, 2048] + - [92, 10655.0] + - - [512, 2048, 1, 3608, 512, 512, 512, 2048] + - [92, 10643.0] + - - [512, 2048, 1, 3780, 512, 512, 512, 2048] + - [51, 10670.0] + - - [512, 2048, 1, 3796, 512, 512, 512, 2048] + - [63, 10638.0] + - - [512, 2048, 1, 3822, 512, 512, 512, 2048] + - [73, 10645.0] + - - [512, 2048, 1, 3859, 512, 512, 512, 2048] + - [92, 10668.0] + - - [512, 2048, 1, 3870, 512, 512, 512, 2048] + - [92, 10668.0] + - - [512, 2048, 1, 3876, 512, 512, 512, 2048] + - [92, 10677.0] + - - [512, 2048, 1, 3906, 512, 512, 512, 2048] + - [92, 10708.0] + - - [512, 2048, 1, 3910, 512, 512, 512, 2048] + - [73, 10690.0] + - - [512, 2048, 1, 3925, 512, 512, 512, 2048] + - [92, 10680.0] + - - [512, 2048, 1, 3942, 512, 512, 512, 2048] + - [51, 10662.0] + - - [512, 2048, 1, 3944, 512, 512, 512, 2048] + - [92, 10681.0] + - - [512, 2048, 1, 3955, 512, 512, 512, 2048] + - [92, 10651.0] + - - [512, 2048, 1, 3968, 512, 512, 512, 2048] + - [92, 10672.0] + - - [512, 2048, 1, 3969, 512, 512, 512, 2048] + - [92, 10670.0] + - - [512, 2048, 1, 3976, 512, 512, 512, 2048] + - [92, 10649.0] + - - [512, 2048, 1, 3977, 512, 512, 512, 2048] + - [92, 10656.0] + - - [512, 2048, 1, 3978, 512, 512, 512, 2048] + - [92, 10669.0] + - - [512, 2048, 1, 3990, 512, 512, 512, 2048] + - [92, 10677.0] + - - [512, 2048, 1, 3995, 512, 512, 512, 2048] + - [92, 10668.0] + - - [512, 2048, 1, 3996, 512, 512, 512, 2048] + - [73, 10678.0] + - - [512, 2048, 1, 3999, 512, 512, 512, 2048] + - [92, 10666.0] + - - [512, 2048, 1, 4005, 512, 512, 512, 2048] + - [92, 10669.0] + - - [512, 2048, 1, 4012, 512, 512, 512, 2048] + - [92, 10663.0] + - - [512, 2048, 1, 4020, 512, 512, 512, 2048] + - [92, 10667.0] + - - [512, 2048, 1, 4026, 512, 512, 512, 2048] + - [106, 10650.0] + - - [512, 2048, 1, 4030, 512, 512, 512, 2048] + - [92, 10653.0] + - - [512, 2048, 1, 4032, 512, 512, 512, 2048] + - [92, 10685.0] + - - [2048, 512, 1, 2790, 2048, 2048, 2048, 512] + - [106, 10629.0] + - - [2048, 512, 1, 2864, 2048, 2048, 2048, 512] + - [55, 10616.0] + - - [2048, 512, 1, 3092, 2048, 2048, 2048, 512] + - [73, 10642.0] + - - [2048, 512, 1, 3113, 2048, 2048, 2048, 512] + - [92, 10640.0] + - - [2048, 512, 1, 3137, 2048, 2048, 2048, 512] + - [92, 10658.0] + - - [2048, 512, 1, 3165, 2048, 2048, 2048, 512] + - [92, 10636.0] + - - [2048, 512, 1, 3166, 2048, 2048, 2048, 512] + - [92, 10626.0] + - - [2048, 512, 1, 3194, 2048, 2048, 2048, 512] + - [92, 10635.0] + - - [2048, 512, 1, 3219, 2048, 2048, 2048, 512] + - [73, 10630.0] + - - [2048, 512, 1, 3222, 2048, 2048, 2048, 512] + - [92, 10644.0] + - - [2048, 512, 1, 3234, 2048, 2048, 2048, 512] + - [106, 10645.0] + - - [2048, 512, 1, 3237, 2048, 2048, 2048, 512] + - [92, 10640.0] + - - [2048, 512, 1, 3242, 2048, 2048, 2048, 512] + - [92, 10638.0] + - - [2048, 512, 1, 3246, 2048, 2048, 2048, 512] + - [92, 10646.0] + - - [2048, 512, 1, 3249, 2048, 2048, 2048, 512] + - [73, 10636.0] + - - [2048, 512, 1, 3251, 2048, 2048, 2048, 512] + - [86, 10630.0] + - - [2048, 512, 1, 3257, 2048, 2048, 2048, 512] + - [92, 10665.0] + - - [2048, 512, 1, 3262, 2048, 2048, 2048, 512] + - [92, 10638.0] + - - [2048, 512, 1, 3268, 2048, 2048, 2048, 512] + - [92, 10640.0] + - - [2048, 512, 1, 3282, 2048, 2048, 2048, 512] + - [92, 10635.0] + - - [2048, 512, 1, 3286, 2048, 2048, 2048, 512] + - [73, 10642.0] + - - [2048, 512, 1, 3287, 2048, 2048, 2048, 512] + - [86, 10629.0] + - - [2048, 512, 1, 3293, 2048, 2048, 2048, 512] + - [92, 10671.0] + - - [2048, 512, 1, 3297, 2048, 2048, 2048, 512] + - [92, 10644.0] + - - [2048, 512, 1, 3307, 2048, 2048, 2048, 512] + - [92, 10640.0] + - - [2048, 512, 1, 3314, 2048, 2048, 2048, 512] + - [92, 10625.0] + - - [2048, 512, 1, 3315, 2048, 2048, 2048, 512] + - [73, 10639.0] + - - [2048, 512, 1, 3319, 2048, 2048, 2048, 512] + - [92, 10649.0] + - - [2048, 512, 1, 3322, 2048, 2048, 2048, 512] + - [92, 10638.0] + - - [2048, 512, 1, 3323, 2048, 2048, 2048, 512] + - [92, 10650.0] + - - [2048, 512, 1, 3324, 2048, 2048, 2048, 512] + - [92, 10643.0] + - - [2048, 512, 1, 3325, 2048, 2048, 2048, 512] + - [92, 10632.0] + - - [2048, 512, 1, 3327, 2048, 2048, 2048, 512] + - [92, 10628.0] + - - [2048, 512, 1, 3329, 2048, 2048, 2048, 512] + - [92, 10653.0] + - - [2048, 512, 1, 3332, 2048, 2048, 2048, 512] + - [92, 10655.0] + - - [2048, 512, 1, 3336, 2048, 2048, 2048, 512] + - [92, 10640.0] + - - [2048, 512, 1, 3339, 2048, 2048, 2048, 512] + - [92, 10669.0] + - - [2048, 512, 1, 3342, 2048, 2048, 2048, 512] + - [92, 10643.0] + - - [2048, 512, 1, 3344, 2048, 2048, 2048, 512] + - [92, 10641.0] + - - [2048, 512, 1, 3358, 2048, 2048, 2048, 512] + - [92, 10641.0] + - - [2048, 512, 1, 3360, 2048, 2048, 2048, 512] + - [92, 10665.0] + - - [2048, 512, 1, 3364, 2048, 2048, 2048, 512] + - [73, 10654.0] + - - [2048, 512, 1, 3365, 2048, 2048, 2048, 512] + - [86, 10637.0] + - - [2048, 512, 1, 3369, 2048, 2048, 2048, 512] + - [92, 10680.0] + - - [2048, 512, 1, 3371, 2048, 2048, 2048, 512] + - [106, 10635.0] + - - [2048, 512, 1, 3374, 2048, 2048, 2048, 512] + - [92, 10650.0] + - - [2048, 512, 1, 3376, 2048, 2048, 2048, 512] + - [73, 10679.0] + - - [2048, 512, 1, 3377, 2048, 2048, 2048, 512] + - [73, 10650.0] + - - [2048, 512, 1, 3378, 2048, 2048, 2048, 512] + - [92, 10645.0] + - - [2048, 512, 1, 3381, 2048, 2048, 2048, 512] + - [73, 10670.0] + - - [2048, 512, 1, 3382, 2048, 2048, 2048, 512] + - [86, 10649.0] + - - [2048, 512, 1, 3383, 2048, 2048, 2048, 512] + - [92, 10673.0] + - - [2048, 512, 1, 3384, 2048, 2048, 2048, 512] + - [106, 10655.0] + - - [2048, 512, 1, 3385, 2048, 2048, 2048, 512] + - [92, 10647.0] + - - [2048, 512, 1, 3386, 2048, 2048, 2048, 512] + - [92, 10643.0] + - - [2048, 512, 1, 3388, 2048, 2048, 2048, 512] + - [106, 10659.0] + - - [2048, 512, 1, 3390, 2048, 2048, 2048, 512] + - [92, 10652.0] + - - [2048, 512, 1, 3391, 2048, 2048, 2048, 512] + - [92, 10633.0] + - - [2048, 512, 1, 3396, 2048, 2048, 2048, 512] + - [86, 10642.0] + - - [2048, 512, 1, 3399, 2048, 2048, 2048, 512] + - [92, 10651.0] + - - [2048, 512, 1, 3402, 2048, 2048, 2048, 512] + - [106, 10657.0] + - - [2048, 512, 1, 3410, 2048, 2048, 2048, 512] + - [92, 10639.0] + - - [2048, 512, 1, 3412, 2048, 2048, 2048, 512] + - [51, 10635.0] + - - [2048, 512, 1, 3414, 2048, 2048, 2048, 512] + - [92, 10656.0] + - - [2048, 512, 1, 3415, 2048, 2048, 2048, 512] + - [92, 10642.0] + - - [2048, 512, 1, 3418, 2048, 2048, 2048, 512] + - [73, 10633.0] + - - [2048, 512, 1, 3420, 2048, 2048, 2048, 512] + - [92, 10655.0] + - - [2048, 512, 1, 3422, 2048, 2048, 2048, 512] + - [92, 10679.0] + - - [2048, 512, 1, 3425, 2048, 2048, 2048, 512] + - [106, 10651.0] + - - [2048, 512, 1, 3426, 2048, 2048, 2048, 512] + - [92, 10635.0] + - - [2048, 512, 1, 3428, 2048, 2048, 2048, 512] + - [92, 10621.0] + - - [2048, 512, 1, 3430, 2048, 2048, 2048, 512] + - [92, 10637.0] + - - [2048, 512, 1, 3431, 2048, 2048, 2048, 512] + - [73, 10635.0] + - - [2048, 512, 1, 3432, 2048, 2048, 2048, 512] + - [92, 10646.0] + - - [2048, 512, 1, 3438, 2048, 2048, 2048, 512] + - [92, 10634.0] + - - [2048, 512, 1, 3439, 2048, 2048, 2048, 512] + - [73, 10620.0] + - - [2048, 512, 1, 3440, 2048, 2048, 2048, 512] + - [92, 10662.0] + - - [2048, 512, 1, 3443, 2048, 2048, 2048, 512] + - [92, 10661.0] + - - [2048, 512, 1, 3445, 2048, 2048, 2048, 512] + - [92, 10629.0] + - - [2048, 512, 1, 3447, 2048, 2048, 2048, 512] + - [92, 10650.0] + - - [2048, 512, 1, 3448, 2048, 2048, 2048, 512] + - [92, 10644.0] + - - [2048, 512, 1, 3450, 2048, 2048, 2048, 512] + - [92, 10639.0] + - - [2048, 512, 1, 3451, 2048, 2048, 2048, 512] + - [73, 10637.0] + - - [2048, 512, 1, 3453, 2048, 2048, 2048, 512] + - [86, 10651.0] + - - [2048, 512, 1, 3455, 2048, 2048, 2048, 512] + - [73, 10629.0] + - - [2048, 512, 1, 3456, 2048, 2048, 2048, 512] + - [92, 10687.0] + - - [2048, 512, 1, 3457, 2048, 2048, 2048, 512] + - [92, 10653.0] + - - [2048, 512, 1, 3458, 2048, 2048, 2048, 512] + - [92, 10637.0] + - - [2048, 512, 1, 3459, 2048, 2048, 2048, 512] + - [92, 10640.0] + - - [2048, 512, 1, 3460, 2048, 2048, 2048, 512] + - [73, 10650.0] + - - [2048, 512, 1, 3461, 2048, 2048, 2048, 512] + - [106, 10632.0] + - - [2048, 512, 1, 3462, 2048, 2048, 2048, 512] + - [73, 10666.0] + - - [2048, 512, 1, 3466, 2048, 2048, 2048, 512] + - [92, 10656.0] + - - [2048, 512, 1, 3467, 2048, 2048, 2048, 512] + - [73, 10644.0] + - - [2048, 512, 1, 3468, 2048, 2048, 2048, 512] + - [92, 10677.0] + - - [2048, 512, 1, 3470, 2048, 2048, 2048, 512] + - [106, 10658.0] + - - [2048, 512, 1, 3471, 2048, 2048, 2048, 512] + - [92, 10647.0] + - - [2048, 512, 1, 3476, 2048, 2048, 2048, 512] + - [92, 10638.0] + - - [2048, 512, 1, 3477, 2048, 2048, 2048, 512] + - [106, 10617.0] + - - [2048, 512, 1, 3478, 2048, 2048, 2048, 512] + - [92, 10649.0] + - - [2048, 512, 1, 3479, 2048, 2048, 2048, 512] + - [73, 10657.0] + - - [2048, 512, 1, 3480, 2048, 2048, 2048, 512] + - [73, 10637.0] + - - [2048, 512, 1, 3481, 2048, 2048, 2048, 512] + - [106, 10631.0] + - - [2048, 512, 1, 3483, 2048, 2048, 2048, 512] + - [92, 10660.0] + - - [2048, 512, 1, 3484, 2048, 2048, 2048, 512] + - [92, 10660.0] + - - [2048, 512, 1, 3487, 2048, 2048, 2048, 512] + - [106, 10647.0] + - - [2048, 512, 1, 3489, 2048, 2048, 2048, 512] + - [92, 10628.0] + - - [2048, 512, 1, 3490, 2048, 2048, 2048, 512] + - [92, 10639.0] + - - [2048, 512, 1, 3491, 2048, 2048, 2048, 512] + - [92, 10652.0] + - - [2048, 512, 1, 3493, 2048, 2048, 2048, 512] + - [92, 10646.0] + - - [2048, 512, 1, 3494, 2048, 2048, 2048, 512] + - [73, 10637.0] + - - [2048, 512, 1, 3495, 2048, 2048, 2048, 512] + - [106, 10669.0] + - - [2048, 512, 1, 3497, 2048, 2048, 2048, 512] + - [73, 10664.0] + - - [2048, 512, 1, 3498, 2048, 2048, 2048, 512] + - [92, 10669.0] + - - [2048, 512, 1, 3499, 2048, 2048, 2048, 512] + - [92, 10655.0] + - - [2048, 512, 1, 3501, 2048, 2048, 2048, 512] + - [92, 10675.0] + - - [2048, 512, 1, 3503, 2048, 2048, 2048, 512] + - [92, 10657.0] + - - [2048, 512, 1, 3507, 2048, 2048, 2048, 512] + - [51, 10654.0] + - - [2048, 512, 1, 3508, 2048, 2048, 2048, 512] + - [92, 10649.0] + - - [2048, 512, 1, 3509, 2048, 2048, 2048, 512] + - [73, 10645.0] + - - [2048, 512, 1, 3511, 2048, 2048, 2048, 512] + - [92, 10637.0] + - - [2048, 512, 1, 3514, 2048, 2048, 2048, 512] + - [92, 10635.0] + - - [2048, 512, 1, 3515, 2048, 2048, 2048, 512] + - [92, 10654.0] + - - [2048, 512, 1, 3517, 2048, 2048, 2048, 512] + - [73, 10630.0] + - - [2048, 512, 1, 3518, 2048, 2048, 2048, 512] + - [92, 10650.0] + - - [2048, 512, 1, 3519, 2048, 2048, 2048, 512] + - [92, 10644.0] + - - [2048, 512, 1, 3520, 2048, 2048, 2048, 512] + - [92, 10644.0] + - - [2048, 512, 1, 3523, 2048, 2048, 2048, 512] + - [92, 10640.0] + - - [2048, 512, 1, 3528, 2048, 2048, 2048, 512] + - [92, 10649.0] + - - [2048, 512, 1, 3529, 2048, 2048, 2048, 512] + - [92, 10638.0] + - - [2048, 512, 1, 3530, 2048, 2048, 2048, 512] + - [106, 10630.0] + - - [2048, 512, 1, 3532, 2048, 2048, 2048, 512] + - [73, 10645.0] + - - [2048, 512, 1, 3533, 2048, 2048, 2048, 512] + - [73, 10644.0] + - - [2048, 512, 1, 3534, 2048, 2048, 2048, 512] + - [92, 10650.0] + - - [2048, 512, 1, 3538, 2048, 2048, 2048, 512] + - [92, 10680.0] + - - [2048, 512, 1, 3539, 2048, 2048, 2048, 512] + - [92, 10632.0] + - - [2048, 512, 1, 3541, 2048, 2048, 2048, 512] + - [106, 10653.0] + - - [2048, 512, 1, 3547, 2048, 2048, 2048, 512] + - [92, 10653.0] + - - [2048, 512, 1, 3548, 2048, 2048, 2048, 512] + - [92, 10630.0] + - - [2048, 512, 1, 3552, 2048, 2048, 2048, 512] + - [92, 10647.0] + - - [2048, 512, 1, 3564, 2048, 2048, 2048, 512] + - [92, 10647.0] + - - [2048, 512, 1, 3575, 2048, 2048, 2048, 512] + - [92, 10618.0] + - - [2048, 512, 1, 3598, 2048, 2048, 2048, 512] + - [73, 10651.0] + - - [2048, 512, 1, 3599, 2048, 2048, 2048, 512] + - [73, 10640.0] + - - [2048, 512, 1, 3608, 2048, 2048, 2048, 512] + - [92, 10655.0] + - - [2048, 512, 1, 3780, 2048, 2048, 2048, 512] + - [92, 10644.0] + - - [2048, 512, 1, 3796, 2048, 2048, 2048, 512] + - [92, 10646.0] + - - [2048, 512, 1, 3822, 2048, 2048, 2048, 512] + - [106, 10633.0] + - - [2048, 512, 1, 3840, 2048, 2048, 2048, 512] + - [73, 10631.0] + - - [2048, 512, 1, 3859, 2048, 2048, 2048, 512] + - [92, 10663.0] + - - [2048, 512, 1, 3870, 2048, 2048, 2048, 512] + - [73, 10620.0] + - - [2048, 512, 1, 3876, 2048, 2048, 2048, 512] + - [92, 10645.0] + - - [2048, 512, 1, 3906, 2048, 2048, 2048, 512] + - [92, 10632.0] + - - [2048, 512, 1, 3910, 2048, 2048, 2048, 512] + - [106, 10646.0] + - - [2048, 512, 1, 3925, 2048, 2048, 2048, 512] + - [73, 10635.0] + - - [2048, 512, 1, 3942, 2048, 2048, 2048, 512] + - [86, 10647.0] + - - [2048, 512, 1, 3944, 2048, 2048, 2048, 512] + - [73, 10658.0] + - - [2048, 512, 1, 3955, 2048, 2048, 2048, 512] + - [73, 10664.0] + - - [2048, 512, 1, 3968, 2048, 2048, 2048, 512] + - [73, 10661.0] + - - [2048, 512, 1, 3969, 2048, 2048, 2048, 512] + - [73, 10654.0] + - - [2048, 512, 1, 3976, 2048, 2048, 2048, 512] + - [106, 10646.0] + - - [2048, 512, 1, 3977, 2048, 2048, 2048, 512] + - [92, 10651.0] + - - [2048, 512, 1, 3978, 2048, 2048, 2048, 512] + - [92, 10645.0] + - - [2048, 512, 1, 3990, 2048, 2048, 2048, 512] + - [92, 10658.0] + - - [2048, 512, 1, 3995, 2048, 2048, 2048, 512] + - [92, 10651.0] + - - [2048, 512, 1, 3996, 2048, 2048, 2048, 512] + - [73, 10680.0] + - - [2048, 512, 1, 3999, 2048, 2048, 2048, 512] + - [92, 10647.0] + - - [2048, 512, 1, 4005, 2048, 2048, 2048, 512] + - [92, 10660.0] + - - [2048, 512, 1, 4012, 2048, 2048, 2048, 512] + - [92, 10627.0] + - - [2048, 512, 1, 4020, 2048, 2048, 2048, 512] + - [92, 10652.0] + - - [2048, 512, 1, 4026, 2048, 2048, 2048, 512] + - [92, 10643.0] + - - [2048, 512, 1, 4030, 2048, 2048, 2048, 512] + - [92, 10649.0] + - - [2048, 512, 1, 4032, 2048, 2048, 2048, 512] + - [92, 10663.0] + - - [64, 102, 312, 102, 64, 64, 64, 102] + - [71, 7164.0] + - - [64, 512, 16, 512, 64, 64, 64, 512] + - [71, 8239.0] + - - [64, 512, 96, 512, 64, 64, 64, 512] + - [71, 10467.0] + - - [1024, 1024, 1, 3840, 1024, 1024, 1024, 1024] + - [92, 10661.0] + - - [1024, 1024, 1, 3968, 1024, 1024, 1024, 1024] + - [73, 10687.0] + - - [1024, 1024, 1, 7200, 1024, 1024, 1024, 1024] + - [92, 10759.0] + - - [1024, 1024, 1, 8160, 1024, 1024, 1024, 1024] + - [92, 10728.0] + - - [768, 768, 1, 384, 768, 768, 768, 768] + - [107, 9260.0] + - - [768, 384, 1, 384, 768, 768, 768, 384] + - [51, 7683.0] + - - [1152, 576, 1, 384, 1152, 1152, 1152, 576] + - [49, 8841.0] + - - [384, 768, 1, 384, 384, 384, 384, 768] + - [51, 7641.0] + - - [1024, 1024, 1, 32, 1024, 1024, 1024, 1024] + - [70, 5146.0] + - - [64, 128, 512, 128, 64, 64, 64, 128] + - [49, 10890.0] + - - [64, 512, 64, 512, 64, 64, 64, 512] + - [49, 10941.0] + - - [1024, 1024, 1, 1600, 1024, 1024, 1024, 1024] + - [73, 10566.0] + - - [2048, 256, 1, 1024, 2048, 2048, 2048, 256] + - [93, 8990.0] + - - [256, 1280, 1, 8976, 256, 256, 256, 1280] + - [52, 10307.0] + - - [512, 2048, 1, 256, 512, 512, 512, 2048] + - [73, 10024.0] + - - [560, 1024, 1, 1600, 560, 560, 560, 1024] + - [57, 9834.0] + - - [560, 1024, 1, 200, 560, 560, 560, 1024] + - [106, 7987.0] + - - [1024, 1024, 1, 960, 1024, 1024, 1024, 1024] + - [92, 10501.0] + - - [2304, 128, 1, 128, 2304, 2304, 2304, 128] + - [49, 5720.0] + - - [2688, 128, 1, 128, 2688, 2688, 2688, 128] + - [50, 5734.0] + - - [3072, 128, 1, 128, 3072, 3072, 3072, 128] + - [49, 6339.0] + - - [3456, 128, 1, 128, 3456, 3456, 3456, 128] + - [49, 7025.0] + - - [3840, 128, 1, 128, 3840, 3840, 3840, 128] + - [49, 7635.0] + - - [4224, 128, 1, 128, 4224, 4224, 4224, 128] + - [51, 7316.0] + - - [4608, 128, 1, 128, 4608, 4608, 4608, 128] + - [51, 7848.0] + - - [4992, 128, 1, 128, 4992, 4992, 4992, 128] + - [106, 8195.0] + - - [5376, 128, 1, 128, 5376, 5376, 5376, 128] + - [51, 8022.0] + - - [5760, 128, 1, 128, 5760, 5760, 5760, 128] + - [51, 8456.0] + - - [6144, 128, 1, 128, 6144, 6144, 6144, 128] + - [63, 8769.0] + - - [6528, 128, 1, 128, 6528, 6528, 6528, 128] + - [49, 8502.0] + - - [6912, 128, 1, 128, 6912, 6912, 6912, 128] + - [51, 8988.0] + - - [7296, 128, 1, 128, 7296, 7296, 7296, 128] + - [51, 9266.0] + - - [7680, 128, 1, 128, 7680, 7680, 7680, 128] + - [49, 9533.0] + - - [8064, 128, 1, 128, 8064, 8064, 8064, 128] + - [51, 9150.0] + - - [8448, 128, 1, 128, 8448, 8448, 8448, 128] + - [51, 9454.0] + - - [8832, 128, 1, 128, 8832, 8832, 8832, 128] + - [51, 9777.0] + - - [2304, 128, 1, 256, 2304, 2304, 2304, 128] + - [106, 6705.0] + - - [2688, 128, 1, 256, 2688, 2688, 2688, 128] + - [51, 6573.0] + - - [3072, 128, 1, 256, 3072, 3072, 3072, 128] + - [51, 7369.0] + - - [3456, 128, 1, 256, 3456, 3456, 3456, 128] + - [51, 8230.0] + - - [3840, 128, 1, 256, 3840, 3840, 3840, 128] + - [51, 8937.0] + - - [4224, 128, 1, 256, 4224, 4224, 4224, 128] + - [73, 8239.0] + - - [4608, 128, 1, 256, 4608, 4608, 4608, 128] + - [51, 8810.0] + - - [4992, 128, 1, 256, 4992, 4992, 4992, 128] + - [52, 9588.0] + - - [5376, 128, 1, 256, 5376, 5376, 5376, 128] + - [51, 8764.0] + - - [5760, 128, 1, 256, 5760, 5760, 5760, 128] + - [92, 9362.0] + - - [6144, 128, 1, 256, 6144, 6144, 6144, 128] + - [51, 9811.0] + - - [6528, 128, 1, 256, 6528, 6528, 6528, 128] + - [49, 8876.0] + - - [6912, 128, 1, 256, 6912, 6912, 6912, 128] + - [73, 9638.0] + - - [7296, 128, 1, 256, 7296, 7296, 7296, 128] + - [98, 10226.0] + - - [7680, 128, 1, 256, 7680, 7680, 7680, 128] + - [51, 10521.0] + - - [8064, 128, 1, 256, 8064, 8064, 8064, 128] + - [73, 9874.0] + - - [8448, 128, 1, 256, 8448, 8448, 8448, 128] + - [51, 10200.0] + - - [8832, 128, 1, 256, 8832, 8832, 8832, 128] + - [51, 10648.0] + - - [768, 768, 1, 768, 768, 768, 768, 768] + - [74, 9899.0] + - - [384, 1536, 1, 384, 384, 384, 384, 1536] + - [93, 9437.0] + - - [384, 1920, 1, 384, 384, 384, 384, 1920] + - [73, 9282.0] + - - [384, 2304, 1, 384, 384, 384, 384, 2304] + - [79, 10111.0] + - - [64, 192, 64, 1280, 64, 64, 64, 192] + - [49, 10052.0] + - - [64, 320, 64, 1280, 64, 64, 64, 320] + - [71, 10734.0] + - - [64, 384, 64, 1280, 64, 64, 64, 384] + - [61, 9913.0] + - - [64, 448, 64, 1280, 64, 64, 64, 448] + - [71, 9895.0] + - - [64, 192, 64, 2048, 64, 64, 64, 192] + - [61, 9850.0] + - - [64, 320, 64, 2048, 64, 64, 64, 320] + - [71, 10211.0] + - - [64, 384, 64, 2048, 64, 64, 64, 384] + - [65, 8520.0] + - - [64, 448, 64, 2048, 64, 64, 64, 448] + - [71, 9728.0] + - - [1225, 64, 64, 192, 1225, 1225, 1225, 64] + - [51, 10735.0] + - - [1225, 64, 64, 256, 1225, 1225, 1225, 64] + - [92, 10783.0] + - - [1225, 64, 64, 288, 1225, 1225, 1225, 64] + - [73, 10774.0] + - - [5329, 80, 64, 64, 5329, 5329, 5329, 80] + - [96, 5077.0] + - - [64, 192, 32, 1280, 64, 64, 64, 192] + - [91, 8063.0] + - - [64, 320, 32, 1280, 64, 64, 64, 320] + - [53, 10562.0] + - - [64, 384, 32, 1280, 64, 64, 64, 384] + - [71, 10119.0] + - - [64, 448, 32, 1280, 64, 64, 64, 448] + - [71, 10021.0] + - - [64, 192, 32, 2048, 64, 64, 64, 192] + - [71, 8345.0] + - - [64, 320, 32, 2048, 64, 64, 64, 320] + - [53, 10578.0] + - - [64, 384, 32, 2048, 64, 64, 64, 384] + - [91, 9979.0] + - - [64, 448, 32, 2048, 64, 64, 64, 448] + - [71, 9954.0] + - - [1225, 64, 32, 192, 1225, 1225, 1225, 64] + - [49, 10167.0] + - - [1225, 64, 32, 256, 1225, 1225, 1225, 64] + - [51, 10602.0] + - - [1225, 64, 32, 288, 1225, 1225, 1225, 64] + - [92, 10901.0] + - - [5329, 80, 32, 64, 5329, 5329, 5329, 80] + - [71, 6380.0] + - - [289, 128, 32, 768, 289, 289, 289, 128] + - [49, 9903.0] + - - [289, 160, 32, 768, 289, 289, 289, 160] + - [71, 8201.0] + - - [289, 192, 32, 768, 289, 289, 289, 192] + - [91, 9828.0] + - - [3136, 64, 32, 64, 3136, 3136, 3136, 64] + - [71, 10513.0] + - - [3136, 64, 32, 256, 3136, 3136, 3136, 64] + - [49, 10028.0] + - - [196, 256, 32, 1024, 196, 196, 196, 256] + - [92, 8806.0] + - - [1024, 1024, 1, 6912, 1024, 1024, 1024, 1024] + - [92, 10715.0] + - - [1024, 512, 1, 4096, 1024, 1024, 1024, 512] + - [102, 9361.0] + - - [480, 1024, 1, 4096, 480, 480, 480, 1024] + - [52, 8638.0] + - - [1024, 512, 1, 6912, 1024, 1024, 1024, 512] + - [59, 9435.0] + - - [480, 1024, 1, 6912, 480, 480, 480, 1024] + - [82, 8764.0] + - - [100, 512, 120, 128, 100, 100, 100, 512] + - [91, 8356.0] + - - [100, 512, 18, 128, 100, 100, 100, 512] + - [104, 6255.0] + - - [100, 512, 19, 128, 100, 100, 100, 512] + - [91, 6485.0] + - - [1444, 576, 1, 128, 1444, 1444, 1444, 576] + - [86, 8330.0] + - - [173280, 64, 1, 128, 173280, 173280, 173280, 64] + - [54, 7961.0] + - - [25992, 64, 1, 128, 25992, 25992, 25992, 64] + - [71, 9045.0] + - - [27436, 64, 1, 128, 27436, 27436, 27436, 64] + - [84, 9540.0] + - - [361, 2304, 1, 512, 361, 361, 361, 2304] + - [57, 9372.0] + - - [960, 1024, 1, 1024, 960, 960, 960, 1024] + - [49, 10985.0] + - - [1024, 960, 1, 1024, 1024, 1024, 1024, 960] + - [92, 11275.0] + - - [1024, 1024, 1, 77, 1024, 1024, 1024, 1024] + - [49, 7518.0] + - - [64, 128, 160, 128, 64, 64, 64, 128] + - [49, 9720.0] + - - [1024, 1024, 1, 10, 1024, 1024, 1024, 1024] + - [49, 2158.0] + - - [64, 128, 624, 128, 64, 64, 64, 128] + - [49, 10909.0] + - - [1024, 1024, 1, 39, 1024, 1024, 1024, 1024] + - [49, 5190.0] + - - [1024, 1024, 1, 780, 1024, 1024, 1024, 1024] + - [92, 10443.0] + - - [1024, 1024, 1, 4992, 1024, 1024, 1024, 1024] + - [92, 10690.0] + - - [1024, 1024, 1, 308, 1024, 1024, 1024, 1024] + - [92, 10024.0] + - - [64, 128, 640, 128, 64, 64, 64, 128] + - [49, 10991.0] + - - [1024, 1024, 1, 40, 1024, 1024, 1024, 1024] + - [49, 5730.0] + - - [1024, 1024, 1, 800, 1024, 1024, 1024, 1024] + - [73, 10321.0] + - - [1024, 1024, 1, 5120, 1024, 1024, 1024, 1024] + - [73, 10690.0] + - - [64, 128, 656, 128, 64, 64, 64, 128] + - [49, 10894.0] + - - [1024, 1024, 1, 41, 1024, 1024, 1024, 1024] + - [51, 5321.0] + - - [1024, 1024, 1, 820, 1024, 1024, 1024, 1024] + - [51, 10442.0] + - - [1024, 1024, 1, 5248, 1024, 1024, 1024, 1024] + - [92, 10698.0] + - - [64, 512, 80, 512, 64, 64, 64, 512] + - [71, 11047.0] + - - [1024, 1024, 1, 5, 1024, 1024, 1024, 1024] + - [49, 1070.0] + - - [1024, 1024, 1, 385, 1024, 1024, 1024, 1024] + - [51, 10225.0] + - - [1024, 1024, 1, 2560, 1024, 1024, 1024, 1024] + - [73, 10648.0] + - - [1024, 1024, 1, 462, 1024, 1024, 1024, 1024] + - [51, 10272.0] + - - [64, 128, 128, 128, 64, 64, 64, 128] + - [49, 9008.0] + - - [1024, 1024, 1, 8, 1024, 1024, 1024, 1024] + - [83, 1848.0] + - - [1024, 1024, 1, 160, 1024, 1024, 1024, 1024] + - [86, 9425.0] + - - [64, 128, 144, 128, 64, 64, 64, 128] + - [49, 9118.0] + - - [1024, 1024, 1, 9, 1024, 1024, 1024, 1024] + - [55, 1774.0] + - - [1024, 1024, 1, 180, 1024, 1024, 1024, 1024] + - [51, 9689.0] + - - [1024, 1024, 1, 1152, 1024, 1024, 1024, 1024] + - [92, 10537.0] + - - [1024, 1024, 1, 6528, 1024, 1024, 1024, 1024] + - [73, 10706.0] + - - [1024, 1024, 1, 7104, 1024, 1024, 1024, 1024] + - [92, 10713.0] + - - [1024, 1024, 1, 8064, 1024, 1024, 1024, 1024] + - [92, 10721.0] + - - [2048, 512, 1, 1, 2048, 2048, 2048, 512] + - [61, 228.0] + - - [1024, 1024, 1, 16, 1024, 1024, 1024, 1024] + - [83, 3142.0] + - - [512, 64, 256, 512, 512, 512, 512, 64] + - [92, 9180.0] + - - [64, 512, 256, 512, 64, 64, 64, 512] + - [49, 9201.0] + - - [512, 64, 128, 512, 512, 512, 512, 64] + - [55, 9062.0] + - - [64, 512, 128, 512, 64, 64, 64, 512] + - [91, 9141.0] + - - [512, 64, 40, 512, 512, 512, 512, 64] + - [73, 11041.0] + - - [64, 512, 40, 512, 64, 64, 64, 512] + - [71, 10856.0] + - - [1024, 96, 64, 1024, 1024, 1024, 1024, 96] + - [98, 9075.0] + - - [96, 1024, 64, 1024, 96, 96, 96, 1024] + - [79, 9129.0] + - - [1024, 96, 128, 1024, 1024, 1024, 1024, 96] + - [79, 9166.0] + - - [96, 1024, 128, 1024, 96, 96, 96, 1024] + - [110, 9233.0] + - - [1024, 64, 256, 1024, 1024, 1024, 1024, 64] + - [73, 10231.0] + - - [64, 1024, 256, 1024, 64, 64, 64, 1024] + - [71, 10251.0] + - - [1024, 64, 32, 1024, 1024, 1024, 1024, 64] + - [106, 10751.0] + - - [64, 1024, 32, 1024, 64, 64, 64, 1024] + - [91, 10621.0] + - - [1024, 64, 64, 1024, 1024, 1024, 1024, 64] + - [55, 10094.0] + - - [64, 1024, 64, 1024, 64, 64, 64, 1024] + - [71, 10144.0] + - - [1024, 64, 128, 1024, 1024, 1024, 1024, 64] + - [51, 10220.0] + - - [64, 1024, 128, 1024, 64, 64, 64, 1024] + - [91, 10274.0] + - - [1024, 1024, 1, 64, 1024, 1024, 1024, 1024] + - [49, 6738.0] + - - [64, 128, 1024, 128, 64, 64, 64, 128] + - [89, 6194.0] + - - [128, 64, 1024, 128, 128, 128, 128, 64] + - [58, 5548.0] + - - [1024, 1024, 1, 3456, 1024, 1024, 1024, 1024] + - [92, 10647.0] + - - [1024, 1024, 1, 864, 1024, 1024, 1024, 1024] + - [92, 10493.0] + - - [1024, 512, 1, 3456, 1024, 1024, 1024, 512] + - [82, 9328.0] + - - [1024, 512, 1, 864, 1024, 1024, 1024, 512] + - [93, 8900.0] + - - [256, 3456, 1, 1, 256, 256, 256, 3456] + - [90, 206.0] + - - [256, 4096, 1, 1, 256, 256, 256, 4096] + - [61, 234.0] + - - [480, 1024, 1, 3456, 480, 480, 480, 1024] + - [57, 8662.0] + - - [480, 1024, 1, 864, 480, 480, 480, 1024] + - [52, 8092.0] + - - [64, 128, 1280, 128, 64, 64, 64, 128] + - [47, 5909.0] + - - [128, 64, 1280, 128, 128, 128, 128, 64] + - [80, 5980.0] + - - [1024, 1024, 1, 82, 1024, 1024, 1024, 1024] + - [49, 7299.0] + - - [128, 64, 1312, 128, 128, 128, 128, 64] + - [73, 5374.0] + - - [64, 128, 1312, 128, 64, 64, 64, 128] + - [62, 6380.0] + - - [1024, 1024, 1, 12, 1024, 1024, 1024, 1024] + - [55, 2104.0] + - - [1024, 1024, 1, 6144, 1024, 1024, 1024, 1024] + - [73, 10705.0] + - - [64, 512, 192, 512, 64, 64, 64, 512] + - [91, 9244.0] + - - [512, 64, 192, 512, 512, 512, 512, 64] + - [49, 9377.0] + - - [3136, 64, 64, 128, 3136, 3136, 3136, 64] + - [48, 7583.0] + - - [3136, 64, 32, 128, 3136, 3136, 3136, 64] + - [49, 11097.0] + - - [196, 2304, 1, 256, 196, 196, 196, 2304] + - [49, 6583.0] + - - [784, 1152, 1, 128, 784, 784, 784, 1152] + - [71, 8602.0] + - - [64, 128, 2048, 128, 64, 64, 64, 128] + - [81, 5726.0] + - - [128, 64, 2048, 128, 128, 128, 128, 64] + - [76, 5370.0] + - - [128, 64, 1536, 128, 128, 128, 128, 64] + - [99, 6030.0] + - - [64, 128, 1536, 128, 64, 64, 64, 128] + - [81, 5788.0] + - - [1024, 1024, 1, 96, 1024, 1024, 1024, 1024] + - [51, 8211.0] + - - [92416, 64, 25, 64, 92416, 92416, 92416, 64] + - [100, 5065.0] + - - [50176, 64, 36, 64, 50176, 50176, 50176, 64] + - [73, 5235.0] + - - [36864, 64, 49, 64, 36864, 36864, 36864, 64] + - [54, 5068.0] + - - [25600, 64, 64, 64, 25600, 25600, 25600, 64] + - [92, 5293.0] + - - [64, 128, 192, 128, 64, 64, 64, 128] + - [49, 8956.0] + - - [128, 64, 192, 128, 128, 128, 128, 64] + - [86, 9381.0] + - - [768, 768, 1, 2048, 768, 768, 768, 768] + - [102, 10288.0] + - - [64, 384, 144, 384, 64, 64, 64, 384] + - [91, 11077.0] + - - [384, 64, 144, 384, 384, 384, 384, 64] + - [106, 11250.0] + - - [768, 768, 1, 4608, 768, 768, 768, 768] + - [111, 10394.0] + - - [64, 512, 48, 512, 64, 64, 64, 512] + - [71, 10630.0] + - - [512, 64, 48, 512, 512, 512, 512, 64] + - [73, 10933.0] + - - [64, 128, 256, 128, 64, 64, 64, 128] + - [49, 10099.0] + - - [128, 64, 256, 128, 128, 128, 128, 64] + - [86, 10486.0] + - - [64, 384, 192, 384, 64, 64, 64, 384] + - [104, 8516.0] + - - [384, 64, 192, 384, 384, 384, 384, 64] + - [106, 8544.0] + - - [1024, 1024, 1, 4608, 1024, 1024, 1024, 1024] + - [92, 10686.0] + - - [768, 512, 2, 2048, 768, 768, 768, 512] + - [92, 10716.0] + - - [713, 512, 2, 2048, 713, 713, 713, 512] + - [73, 9975.0] + - - [672, 512, 2, 2048, 672, 672, 672, 512] + - [51, 9438.0] + - - [660, 512, 2, 2048, 660, 660, 660, 512] + - [92, 9242.0] + - - [726, 512, 2, 2048, 726, 726, 726, 512] + - [51, 10138.0] + - - [1008, 512, 2, 2048, 1008, 1008, 1008, 512] + - [51, 10361.0] + - - [748, 512, 2, 2048, 748, 748, 748, 512] + - [92, 10422.0] + - - [864, 512, 2, 2048, 864, 864, 864, 512] + - [52, 10424.0] + - - [888, 512, 2, 2048, 888, 888, 888, 512] + - [98, 10785.0] + - - [805, 512, 2, 2048, 805, 805, 805, 512] + - [57, 9807.0] + - - [850, 512, 2, 2048, 850, 850, 850, 512] + - [98, 10340.0] + - - [840, 512, 2, 2048, 840, 840, 840, 512] + - [57, 10235.0] + - - [850, 256, 2, 3, 850, 850, 850, 256] + - [97, 484.0] + - - [805, 256, 2, 12, 805, 805, 805, 256] + - [56, 1606.0] + - - [805, 256, 2, 3, 805, 805, 805, 256] + - [97, 458.0] + - - [850, 256, 2, 12, 850, 850, 850, 256] + - [97, 1685.0] + - - [768, 256, 2, 12, 768, 768, 768, 256] + - [83, 1661.0] + - - [864, 256, 2, 3, 864, 864, 864, 256] + - [47, 553.0] + - - [950, 256, 2, 12, 950, 950, 950, 256] + - [56, 1687.0] + - - [864, 256, 2, 12, 864, 864, 864, 256] + - [47, 1910.0] + - - [950, 256, 2, 3, 950, 950, 950, 256] + - [67, 483.0] + - - [768, 256, 2, 3, 768, 768, 768, 256] + - [49, 447.0] + - - [1024, 320, 1, 1024, 1024, 1024, 1024, 320] + - [92, 9133.0] + - - [96, 1024, 160, 1024, 96, 96, 96, 1024] + - [79, 9337.0] + - - [1024, 96, 160, 1024, 1024, 1024, 1024, 96] + - [98, 9331.0] + - - [96, 1024, 40, 1024, 96, 96, 96, 1024] + - [57, 9155.0] + - - [1024, 96, 40, 1024, 1024, 1024, 1024, 96] + - [79, 9060.0] + - - [96, 1024, 80, 1024, 96, 96, 96, 1024] + - [88, 9268.0] + - - [1024, 96, 80, 1024, 1024, 1024, 1024, 96] + - [57, 9229.0] + - - [96, 1024, 96, 1024, 96, 96, 96, 1024] + - [57, 9173.0] + - - [1024, 96, 96, 1024, 1024, 1024, 1024, 96] + - [57, 9149.0] + - - [96, 1024, 24, 1024, 96, 96, 96, 1024] + - [88, 8720.0] + - - [1024, 96, 24, 1024, 1024, 1024, 1024, 96] + - [98, 8638.0] + - - [96, 1024, 48, 1024, 96, 96, 96, 1024] + - [92, 8881.0] + - - [1024, 96, 48, 1024, 1024, 1024, 1024, 96] + - [51, 8917.0] + - - [96, 1024, 16, 1024, 96, 96, 96, 1024] + - [92, 8612.0] + - - [1024, 96, 16, 1024, 1024, 1024, 1024, 96] + - [73, 8615.0] + - - [96, 1024, 32, 1024, 96, 96, 96, 1024] + - [98, 8964.0] + - - [1024, 96, 32, 1024, 1024, 1024, 1024, 96] + - [57, 8890.0] + - - [512, 64, 320, 512, 512, 512, 512, 64] + - [51, 9161.0] + - - [64, 512, 320, 512, 64, 64, 64, 512] + - [91, 9159.0] + - - [512, 64, 80, 512, 512, 512, 512, 64] + - [51, 11438.0] + - - [1024, 64, 512, 1024, 1024, 1024, 1024, 64] + - [96, 10253.0] + - - [64, 1024, 512, 1024, 64, 64, 64, 1024] + - [71, 10252.0] + - - [64, 64, 64, 13216, 64, 64, 64, 64] + - [104, 5962.0] + - - [64, 96, 36, 10368, 64, 64, 64, 96] + - [108, 6548.0] + - - [64, 64, 36, 12544, 64, 64, 64, 64] + - [117, 6065.0] + - - [64, 64, 36, 11552, 64, 64, 64, 64] + - [121, 6174.0] + - - [1024, 256, 1, 10496, 1024, 1024, 1024, 256] + - [112, 10191.0] + - - [1024, 256, 1, 11520, 1024, 1024, 1024, 256] + - [112, 10238.0] + - - [1024, 256, 1, 12032, 1024, 1024, 1024, 256] + - [114, 10271.0] + - - [1024, 256, 1, 13568, 1024, 1024, 1024, 256] + - [114, 10339.0] + - - [1024, 256, 1, 14336, 1024, 1024, 1024, 256] + - [116, 10364.0] + - - [1024, 256, 1, 14848, 1024, 1024, 1024, 256] + - [114, 10368.0] + - - [1024, 256, 1, 15104, 1024, 1024, 1024, 256] + - [114, 10364.0] + - - [1024, 256, 1, 15872, 1024, 1024, 1024, 256] + - [119, 10383.0] + - - [1024, 256, 1, 16128, 1024, 1024, 1024, 256] + - [114, 10395.0] + - - [1024, 256, 1, 17152, 1024, 1024, 1024, 256] + - [120, 10369.0] + - - [1024, 256, 1, 17408, 1024, 1024, 1024, 256] + - [114, 10378.0] + - - [1024, 256, 1, 18944, 1024, 1024, 1024, 256] + - [120, 10396.0] + - - [1024, 256, 1, 19712, 1024, 1024, 1024, 256] + - [120, 10400.0] + - - [1024, 256, 1, 19968, 1024, 1024, 1024, 256] + - [113, 10419.0] + - - [1024, 256, 1, 8192, 1024, 1024, 1024, 256] + - [116, 10078.0] + - - [1024, 256, 1, 8448, 1024, 1024, 1024, 256] + - [119, 10084.0] + - - [1024, 256, 1, 9728, 1024, 1024, 1024, 256] + - [119, 10166.0] + - - [1024, 256, 1, 9984, 1024, 1024, 1024, 256] + - [119, 10184.0] + - - [512, 256, 1, 32768, 512, 512, 512, 256] + - [118, 9136.0] + - - [256, 128, 1, 55296, 256, 256, 256, 128] + - [115, 6609.0] + - - [512, 512, 1, 200, 512, 512, 512, 512] + - [125, 4543.0] + - - [1024, 128, 1, 128, 1024, 1024, 1024, 128] + - [197, 3410.0] + - - [2368, 64, 1, 3328, 2368, 2368, 2368, 64] + - [165, 5752.0] + - - [1408, 64, 1, 128, 1408, 1408, 1408, 64] + - [197, 2645.0] + - - [1408, 64, 1, 1280, 1408, 1408, 1408, 64] + - [137, 4131.0] + - - [2944, 64, 1, 256, 2944, 2944, 2944, 64] + - [186, 4629.0] + - - [1856, 64, 1, 1280, 1856, 1856, 1856, 64] + - [137, 5369.0] + - - [704, 128, 1, 1280, 704, 704, 704, 128] + - [146, 4256.0] + - - [4288, 64, 1, 3328, 4288, 4288, 4288, 64] + - [137, 6084.0] + - - [4288, 64, 1, 256, 4288, 4288, 4288, 64] + - [186, 5338.0] + - - [64, 3584, 1, 3328, 64, 64, 64, 3584] + - [139, 5850.0] + - - [704, 256, 1, 128, 704, 704, 704, 256] + - [181, 3897.0] + - - [128, 1408, 1, 128, 128, 128, 128, 1408] + - [125, 3977.0] + - - [4288, 64, 1, 1280, 4288, 4288, 4288, 64] + - [165, 5900.0] + - - [1024, 256, 1, 256, 1024, 1024, 1024, 256] + - [131, 5107.0] + - - [448, 448, 1, 256, 448, 448, 448, 448] + - [131, 4893.0] + - - [128, 1024, 1, 3328, 128, 128, 128, 1024] + - [190, 5159.0] + - - [64, 1856, 1, 1280, 64, 64, 64, 1856] + - [137, 5395.0] + - - [256, 1024, 1, 256, 256, 256, 256, 1024] + - [131, 5123.0] + - - [1024, 128, 1, 1280, 1024, 1024, 1024, 128] + - [137, 4766.0] + - - [448, 256, 1, 3328, 448, 448, 448, 256] + - [192, 5504.0] + - - [128, 1024, 1, 128, 128, 128, 128, 1024] + - [182, 3342.0] + - - [128, 704, 1, 1280, 128, 128, 128, 704] + - [180, 4278.0] + - - [1856, 128, 1, 3328, 1856, 1856, 1856, 128] + - [194, 6107.0] + - - [64, 2944, 1, 128, 64, 64, 64, 2944] + - [125, 4033.0] + - - [448, 448, 1, 3328, 448, 448, 448, 448] + - [131, 5690.0] + - - [1408, 128, 1, 1280, 1408, 1408, 1408, 128] + - [131, 5026.0] + - - [128, 1856, 1, 1280, 128, 128, 128, 1856] + - [194, 5958.0] + - - [256, 448, 1, 256, 256, 256, 256, 448] + - [142, 4135.0] + - - [128, 1856, 1, 128, 128, 128, 128, 1856] + - [186, 4664.0] + - - [64, 1408, 1, 3328, 64, 64, 64, 1408] + - [135, 4522.0] + - - [128, 1408, 1, 256, 128, 128, 128, 1408] + - [131, 4419.0] + - - [4288, 64, 1, 128, 4288, 4288, 4288, 64] + - [186, 4879.0] + - - [256, 448, 1, 3328, 256, 256, 256, 448] + - [137, 5551.0] + - - [64, 2368, 1, 1280, 64, 64, 64, 2368] + - [137, 5505.0] + - - [2368, 64, 1, 256, 2368, 2368, 2368, 64] + - [127, 4459.0] + - - [1408, 128, 1, 128, 1408, 1408, 1408, 128] + - [125, 3897.0] + - - [1024, 256, 1, 128, 1024, 1024, 1024, 256] + - [186, 4739.0] + - - [2944, 64, 1, 128, 2944, 2944, 2944, 64] + - [125, 4130.0] + - - [1856, 64, 1, 256, 1856, 1856, 1856, 64] + - [181, 4259.0] + - - [704, 128, 1, 256, 704, 704, 704, 128] + - [180, 3363.0] + - - [448, 256, 1, 1280, 448, 448, 448, 256] + - [165, 5206.0] + - - [1856, 128, 1, 1280, 1856, 1856, 1856, 128] + - [167, 5977.0] + - - [64, 3584, 1, 256, 64, 64, 64, 3584] + - [194, 5124.0] + - - [3584, 64, 1, 128, 3584, 3584, 3584, 64] + - [186, 4476.0] + - - [256, 1024, 1, 1280, 256, 256, 256, 1024] + - [137, 5655.0] + - - [3584, 64, 1, 1280, 3584, 3584, 3584, 64] + - [139, 5766.0] + - - [128, 1856, 1, 3328, 128, 128, 128, 1856] + - [139, 6114.0] + - - [64, 2944, 1, 3328, 64, 64, 64, 2944] + - [186, 5292.0] + - - [64, 4288, 1, 3328, 64, 64, 64, 4288] + - [137, 6077.0] + - - [64, 1856, 1, 256, 64, 64, 64, 1856] + - [197, 4247.0] + - - [256, 704, 1, 256, 256, 256, 256, 704] + - [131, 4436.0] + - - [2368, 64, 1, 128, 2368, 2368, 2368, 64] + - [127, 3849.0] + - - [64, 1408, 1, 128, 64, 64, 64, 1408] + - [197, 2720.0] + - - [704, 256, 1, 3328, 704, 704, 704, 256] + - [173, 5117.0] + - - [64, 2944, 1, 256, 64, 64, 64, 2944] + - [131, 4576.0] + - - [448, 256, 1, 128, 448, 448, 448, 256] + - [125, 3383.0] + - - [704, 128, 1, 3328, 704, 704, 704, 128] + - [163, 4516.0] + - - [128, 704, 1, 128, 128, 128, 128, 704] + - [125, 2682.0] + - - [256, 448, 1, 1280, 256, 256, 256, 448] + - [137, 5184.0] + - - [704, 256, 1, 1280, 704, 704, 704, 256] + - [160, 5028.0] + - - [64, 2368, 1, 3328, 64, 64, 64, 2368] + - [137, 5792.0] + - - [1856, 64, 1, 128, 1856, 1856, 1856, 64] + - [154, 3487.0] + - - [704, 128, 1, 128, 704, 704, 704, 128] + - [181, 2670.0] + - - [256, 704, 1, 3328, 256, 256, 256, 704] + - [160, 5156.0] + - - [256, 448, 1, 128, 256, 256, 256, 448] + - [125, 3291.0] + - - [64, 3584, 1, 128, 64, 64, 64, 3584] + - [186, 4517.0] + - - [1024, 128, 1, 256, 1024, 1024, 1024, 128] + - [127, 3938.0] + - - [2944, 64, 1, 1280, 2944, 2944, 2944, 64] + - [186, 5241.0] + - - [128, 1408, 1, 3328, 128, 128, 128, 1408] + - [131, 5143.0] + - - [1408, 64, 1, 256, 1408, 1408, 1408, 64] + - [181, 3286.0] + - - [64, 1856, 1, 128, 64, 64, 64, 1856] + - [125, 3503.0] + - - [64, 2368, 1, 256, 64, 64, 64, 2368] + - [156, 4480.0] + - - [1024, 128, 1, 3328, 1024, 1024, 1024, 128] + - [150, 5063.0] + - - [1856, 128, 1, 128, 1856, 1856, 1856, 128] + - [127, 4650.0] + - - [2368, 64, 1, 1280, 2368, 2368, 2368, 64] + - [137, 5514.0] + - - [128, 1024, 1, 1280, 128, 128, 128, 1024] + - [137, 4829.0] + - - [64, 4288, 1, 1280, 64, 64, 64, 4288] + - [165, 5896.0] + - - [1408, 64, 1, 3328, 1408, 1408, 1408, 64] + - [150, 4387.0] + - - [64, 2944, 1, 1280, 64, 64, 64, 2944] + - [186, 5191.0] + - - [256, 704, 1, 128, 256, 256, 256, 704] + - [154, 3871.0] + - - [256, 1024, 1, 128, 256, 256, 256, 1024] + - [131, 4700.0] + - - [64, 1408, 1, 1280, 64, 64, 64, 1408] + - [153, 4291.0] + - - [448, 448, 1, 1280, 448, 448, 448, 448] + - [186, 5570.0] + - - [128, 1024, 1, 256, 128, 128, 128, 1024] + - [197, 4023.0] + - - [3584, 64, 1, 3328, 3584, 3584, 3584, 64] + - [139, 5920.0] + - - [1408, 128, 1, 256, 1408, 1408, 1408, 128] + - [186, 4479.0] + - - [256, 1024, 1, 3328, 256, 256, 256, 1024] + - [137, 5797.0] + - - [1856, 64, 1, 3328, 1856, 1856, 1856, 64] + - [137, 5741.0] + - - [448, 256, 1, 256, 448, 448, 448, 256] + - [125, 4170.0] + - - [128, 704, 1, 256, 128, 128, 128, 704] + - [123, 3286.0] + - - [64, 3584, 1, 1280, 64, 64, 64, 3584] + - [167, 5786.0] + - - [3584, 64, 1, 256, 3584, 3584, 3584, 64] + - [167, 5027.0] + - - [64, 1856, 1, 3328, 64, 64, 64, 1856] + - [165, 5685.0] + - - [1408, 128, 1, 3328, 1408, 1408, 1408, 128] + - [131, 5082.0] + - - [128, 704, 1, 3328, 128, 128, 128, 704] + - [180, 4453.0] + - - [128, 1856, 1, 256, 128, 128, 128, 1856] + - [139, 5288.0] + - - [64, 4288, 1, 256, 64, 64, 64, 4288] + - [131, 5282.0] + - - [256, 704, 1, 1280, 256, 256, 256, 704] + - [186, 5030.0] + - - [64, 2368, 1, 128, 64, 64, 64, 2368] + - [182, 3819.0] + - - [64, 4288, 1, 128, 64, 64, 64, 4288] + - [186, 4879.0] + - - [1856, 128, 1, 256, 1856, 1856, 1856, 128] + - [139, 5307.0] + - - [64, 1408, 1, 256, 64, 64, 64, 1408] + - [142, 3433.0] + - - [2944, 64, 1, 3328, 2944, 2944, 2944, 64] + - [131, 5350.0] + - - [128, 1408, 1, 1280, 128, 128, 128, 1408] + - [186, 5006.0] + - - [448, 448, 1, 128, 448, 448, 448, 448] + - [125, 4340.0] + - - [704, 256, 1, 256, 704, 704, 704, 256] + - [186, 4436.0] + - - [49, 512, 128, 2048, 49, 49, 49, 512] + - [206, 5170.0] + - - [49, 2048, 128, 512, 49, 49, 49, 2048] + - [139, 5216.0] + - - [49, 2048, 256, 512, 49, 49, 49, 2048] + - [152, 5232.0] + - - [49, 512, 256, 2048, 49, 49, 49, 512] + - [179, 5233.0] + - - [64, 38, 1680, 38, 64, 64, 64, 38] + - [142, 3639.0] + - - [64, 59, 1088, 59, 64, 64, 64, 59] + - [133, 5736.0] + - - [64, 32, 1984, 32, 64, 64, 64, 32] + - [154, 5197.0] + - - [64, 54, 1184, 54, 64, 64, 64, 54] + - [131, 5236.0] + - - [64, 49, 1296, 49, 64, 64, 64, 49] + - [175, 4710.0] + - - [64, 45, 1424, 45, 64, 64, 64, 45] + - [133, 4441.0] + - - [64, 35, 1808, 35, 64, 64, 64, 35] + - [188, 3359.0] + - - [64, 41, 1552, 41, 64, 64, 64, 41] + - [133, 4027.0] + - - [64, 64, 36, 3136, 64, 64, 64, 64] + - [150, 5521.0] + - - [64, 64, 64, 826, 64, 64, 64, 64] + - [165, 5438.0] + - - [64, 64, 64, 1600, 64, 64, 64, 64] + - [205, 5721.0] + - - [64, 96, 64, 288, 64, 64, 64, 96] + - [186, 5651.0] + - - [96, 96, 36, 1568, 96, 96, 96, 96] + - [158, 5797.0] + - - [96, 96, 36, 2592, 96, 96, 96, 96] + - [198, 5804.0] + - - [64, 96, 64, 800, 64, 64, 64, 96] + - [160, 5987.0] + - - [35, 96, 36, 8960, 35, 35, 35, 96] + - [192, 2984.0] + - - [32, 64, 36, 43808, 32, 32, 32, 64] + - [149, 3819.0] + - - [64, 64, 64, 81, 64, 64, 64, 64] + - [181, 3470.0] + - - [64, 96, 36, 512, 64, 64, 64, 96] + - [186, 5092.0] + - - [64, 64, 64, 3200, 64, 64, 64, 64] + - [137, 5728.0] + - - [64, 64, 36, 3520, 64, 64, 64, 64] + - [178, 5598.0] + - - [64, 64, 64, 5408, 64, 64, 64, 64] + - [173, 5469.0] + - - [35, 96, 36, 13440, 35, 35, 35, 96] + - [150, 2981.0] + - - [96, 96, 64, 1152, 96, 96, 96, 96] + - [172, 5648.0] + - - [32, 64, 36, 90, 32, 32, 32, 64] + - [185, 1885.0] + - - [64, 64, 64, 800, 64, 64, 64, 64] + - [137, 5481.0] + - - [64, 64, 36, 1568, 64, 64, 64, 64] + - [137, 5433.0] + - - [64, 64, 36, 196, 64, 64, 64, 64] + - [182, 4048.0] + - - [35, 96, 64, 4235, 35, 35, 35, 96] + - [150, 3235.0] + - - [149, 32, 36, 19072, 149, 149, 149, 32] + - [184, 4579.0] + - - [64, 96, 36, 1568, 64, 64, 64, 96] + - [165, 5258.0] + - - [96, 96, 64, 800, 96, 96, 96, 96] + - [144, 5700.0] + - - [32, 64, 64, 640, 32, 32, 32, 64] + - [190, 4311.0] + - - [64, 64, 36, 392, 64, 64, 64, 64] + - [156, 4624.0] + - - [64, 64, 64, 1652, 64, 64, 64, 64] + - [150, 5712.0] + - - [64, 96, 36, 2592, 64, 64, 64, 96] + - [178, 5523.0] + - - [64, 64, 36, 6272, 64, 64, 64, 64] + - [150, 5619.0] + - - [32, 64, 64, 20000, 32, 32, 32, 64] + - [180, 3782.0] + - - [64, 64, 64, 648, 64, 64, 64, 64] + - [186, 5135.0] + - - [32, 64, 36, 1440, 32, 32, 32, 64] + - [158, 4247.0] + - - [64, 64, 64, 100, 64, 64, 64, 64] + - [131, 4269.0] + - - [64, 96, 64, 4608, 64, 64, 64, 96] + - [173, 5554.0] + - - [64, 64, 64, 200, 64, 64, 64, 64] + - [186, 4398.0] + - - [32, 64, 64, 40, 32, 32, 32, 64] + - [124, 1808.0] + - - [64, 96, 64, 1152, 64, 64, 64, 96] + - [178, 5979.0] + - - [149, 32, 64, 8195, 149, 149, 149, 32] + - [169, 4625.0] + - - [35, 96, 64, 6160, 35, 35, 35, 96] + - [205, 3265.0] + - - [64, 64, 36, 1760, 64, 64, 64, 64] + - [165, 5239.0] + - - [64, 2880, 1, 320, 64, 64, 64, 2880] + - [131, 4637.0] + - - [49, 832, 32, 256, 49, 49, 49, 832] + - [139, 4847.0] + - - [289, 1120, 1, 160, 289, 289, 289, 1120] + - [186, 4813.0] + - - [64, 1728, 1, 320, 64, 64, 64, 1728] + - [154, 4213.0] + - - [49, 832, 32, 160, 49, 49, 49, 832] + - [139, 4875.0] + - - [49, 832, 32, 384, 49, 49, 49, 832] + - [139, 5035.0] + - - [289, 896, 1, 192, 289, 289, 289, 896] + - [131, 4884.0] + - - [289, 896, 1, 128, 289, 289, 289, 896] + - [160, 4578.0] + - - [196, 800, 1, 64, 196, 196, 196, 800] + - [154, 2560.0] + - - [64, 1344, 1, 512, 64, 64, 64, 1344] + - [131, 3701.0] + - - [64, 1152, 1, 384, 64, 64, 64, 1152] + - [131, 3378.0] + - - [64, 1152, 1, 448, 64, 64, 64, 1152] + - [188, 3544.0] + - - [49, 832, 32, 128, 49, 49, 49, 832] + - [131, 4768.0] + - - [49, 832, 32, 48, 49, 49, 49, 832] + - [196, 4237.0] + - - [64, 1152, 1, 256, 64, 64, 64, 1152] + - [198, 2922.0] + - - [49, 832, 32, 32, 49, 49, 49, 832] + - [125, 3851.0] + - - [289, 1120, 1, 192, 289, 289, 289, 1120] + - [131, 4917.0] + - - [196, 600, 1, 64, 196, 196, 196, 600] + - [197, 2079.0] + - - [49, 832, 32, 192, 49, 49, 49, 832] + - [167, 4906.0] + - - [64, 1728, 1, 192, 64, 64, 64, 1728] + - [125, 3738.0] + - - [64, 38, 840, 38, 64, 64, 64, 38] + - [154, 3500.0] + - - [64, 49, 648, 49, 64, 64, 64, 49] + - [131, 4457.0] + - - [64, 32, 992, 32, 64, 64, 64, 32] + - [154, 4888.0] + - - [64, 35, 904, 35, 64, 64, 64, 35] + - [125, 3204.0] + - - [64, 41, 776, 41, 64, 64, 64, 41] + - [133, 3837.0] + - - [64, 45, 712, 45, 64, 64, 64, 45] + - [162, 4241.0] + - - [64, 54, 592, 54, 64, 64, 64, 54] + - [125, 4963.0] + - - [64, 59, 544, 59, 64, 64, 64, 59] + - [131, 5519.0] + - - [49, 512, 64, 2048, 49, 49, 49, 512] + - [167, 5116.0] + - - [49, 2048, 64, 512, 49, 49, 49, 2048] + - [194, 5183.0] + - - [33, 32, 1600, 33, 33, 33, 33, 32] + - [180, 2752.0] + - - [33, 32, 200, 33, 33, 33, 33, 32] + - [125, 1563.0] + - - [67, 2048, 1, 512, 67, 67, 67, 2048] + - [180, 3414.0] + - - [512, 512, 1, 3780, 512, 512, 512, 512] + - [137, 5831.0] + - - [512, 512, 1, 3796, 512, 512, 512, 512] + - [165, 5828.0] + - - [512, 512, 1, 3822, 512, 512, 512, 512] + - [205, 5810.0] + - - [512, 512, 1, 3840, 512, 512, 512, 512] + - [165, 5831.0] + - - [512, 512, 1, 3859, 512, 512, 512, 512] + - [137, 5816.0] + - - [512, 512, 1, 3870, 512, 512, 512, 512] + - [165, 5819.0] + - - [512, 512, 1, 3876, 512, 512, 512, 512] + - [192, 5821.0] + - - [512, 512, 1, 3906, 512, 512, 512, 512] + - [205, 5809.0] + - - [512, 512, 1, 3910, 512, 512, 512, 512] + - [192, 5822.0] + - - [512, 512, 1, 3925, 512, 512, 512, 512] + - [137, 5822.0] + - - [512, 512, 1, 3927, 512, 512, 512, 512] + - [165, 5830.0] + - - [512, 512, 1, 3942, 512, 512, 512, 512] + - [137, 5827.0] + - - [512, 512, 1, 3944, 512, 512, 512, 512] + - [205, 5816.0] + - - [512, 512, 1, 3955, 512, 512, 512, 512] + - [192, 5827.0] + - - [512, 512, 1, 3968, 512, 512, 512, 512] + - [137, 5838.0] + - - [512, 512, 1, 3969, 512, 512, 512, 512] + - [165, 5831.0] + - - [512, 512, 1, 3976, 512, 512, 512, 512] + - [137, 5829.0] + - - [512, 512, 1, 3977, 512, 512, 512, 512] + - [205, 5809.0] + - - [512, 512, 1, 3978, 512, 512, 512, 512] + - [137, 5820.0] + - - [512, 512, 1, 3990, 512, 512, 512, 512] + - [137, 5826.0] + - - [512, 512, 1, 3995, 512, 512, 512, 512] + - [192, 5850.0] + - - [512, 512, 1, 3996, 512, 512, 512, 512] + - [137, 5840.0] + - - [512, 512, 1, 3999, 512, 512, 512, 512] + - [192, 5825.0] + - - [512, 512, 1, 4005, 512, 512, 512, 512] + - [192, 5830.0] + - - [512, 512, 1, 4012, 512, 512, 512, 512] + - [137, 5812.0] + - - [512, 512, 1, 4020, 512, 512, 512, 512] + - [205, 5827.0] + - - [512, 512, 1, 4026, 512, 512, 512, 512] + - [165, 5828.0] + - - [512, 512, 1, 4030, 512, 512, 512, 512] + - [192, 5830.0] + - - [512, 512, 1, 4032, 512, 512, 512, 512] + - [137, 5842.0] + - - [512, 512, 1, 4050, 512, 512, 512, 512] + - [192, 5787.0] + - - [512, 512, 1, 4059, 512, 512, 512, 512] + - [165, 5858.0] + - - [384, 384, 1, 384, 384, 384, 384, 384] + - [156, 4657.0] + - - [384, 192, 1, 384, 384, 384, 384, 192] + - [200, 3478.0] + - - [1024, 256, 1, 1024, 1024, 1024, 1024, 256] + - [131, 5362.0] + - - [1024, 256, 1, 1280, 1024, 1024, 1024, 256] + - [165, 5653.0] + - - [1024, 256, 1, 2304, 1024, 1024, 1024, 256] + - [165, 5789.0] + - - [1024, 256, 1, 2816, 1024, 1024, 1024, 256] + - [192, 5827.0] + - - [1024, 256, 1, 3072, 1024, 1024, 1024, 256] + - [137, 5849.0] + - - [1024, 256, 1, 3328, 1024, 1024, 1024, 256] + - [192, 5817.0] + - - [1024, 256, 1, 3584, 1024, 1024, 1024, 256] + - [165, 5836.0] + - - [1024, 256, 1, 4096, 1024, 1024, 1024, 256] + - [137, 5857.0] + - - [1024, 256, 1, 4352, 1024, 1024, 1024, 256] + - [192, 5834.0] + - - [1024, 256, 1, 4608, 1024, 1024, 1024, 256] + - [165, 5846.0] + - - [1024, 256, 1, 5120, 1024, 1024, 1024, 256] + - [192, 5912.0] + - - [1024, 256, 1, 5376, 1024, 1024, 1024, 256] + - [192, 5924.0] + - - [1024, 256, 1, 5632, 1024, 1024, 1024, 256] + - [192, 5948.0] + - - [1024, 256, 1, 6144, 1024, 1024, 1024, 256] + - [137, 5956.0] + - - [1024, 256, 1, 6400, 1024, 1024, 1024, 256] + - [192, 5985.0] + - - [1024, 256, 1, 7680, 1024, 1024, 1024, 256] + - [137, 5951.0] + - - [1024, 256, 1, 7936, 1024, 1024, 1024, 256] + - [137, 5972.0] + - - [512, 512, 1, 1600, 512, 512, 512, 512] + - [192, 5738.0] + - - [100, 2048, 1, 512, 100, 100, 100, 2048] + - [131, 4181.0] + - - [74, 2048, 1, 512, 74, 74, 74, 2048] + - [158, 3800.0] + - - [74, 2048, 1, 960, 74, 74, 74, 2048] + - [129, 3939.0] + - - [768, 128, 1, 128, 768, 768, 768, 128] + - [181, 2834.0] + - - [1152, 128, 1, 128, 1152, 1152, 1152, 128] + - [127, 3730.0] + - - [1536, 128, 1, 128, 1536, 1536, 1536, 128] + - [125, 4251.0] + - - [1920, 128, 1, 128, 1920, 1920, 1920, 128] + - [186, 4795.0] + - - [768, 128, 1, 256, 768, 768, 768, 128] + - [142, 3575.0] + - - [1152, 128, 1, 256, 1152, 1152, 1152, 128] + - [127, 4400.0] + - - [1536, 128, 1, 256, 1536, 1536, 1536, 128] + - [131, 4793.0] + - - [1920, 128, 1, 256, 1920, 1920, 1920, 128] + - [139, 5442.0] + - - [448, 448, 1, 448, 448, 448, 448, 448] + - [160, 5252.0] + - - [1225, 32, 64, 192, 1225, 1225, 1225, 32] + - [131, 6196.0] + - - [1225, 48, 64, 192, 1225, 1225, 1225, 48] + - [139, 4790.0] + - - [1225, 48, 64, 256, 1225, 1225, 1225, 48] + - [179, 4787.0] + - - [1225, 48, 64, 288, 1225, 1225, 1225, 48] + - [194, 4771.0] + - - [1225, 32, 32, 192, 1225, 1225, 1225, 32] + - [186, 5874.0] + - - [1225, 48, 32, 192, 1225, 1225, 1225, 48] + - [139, 4710.0] + - - [1225, 48, 32, 256, 1225, 1225, 1225, 48] + - [167, 4761.0] + - - [1225, 48, 32, 288, 1225, 1225, 1225, 48] + - [167, 4760.0] + - - [49, 2048, 32, 512, 49, 49, 49, 2048] + - [139, 5082.0] + - - [49, 512, 32, 2048, 49, 49, 49, 512] + - [139, 5031.0] + - - [512, 256, 1, 4096, 512, 512, 512, 256] + - [177, 5087.0] + - - [512, 256, 1, 6912, 512, 512, 512, 256] + - [190, 5268.0] + - - [100, 2304, 1, 512, 100, 100, 100, 2304] + - [139, 4287.0] + - - [480, 512, 1, 512, 480, 480, 480, 512] + - [129, 5377.0] + - - [512, 480, 1, 512, 512, 512, 512, 480] + - [160, 5633.0] + - - [512, 512, 1, 512, 512, 512, 512, 512] + - [186, 5377.0] + - - [32, 64, 4608, 32, 32, 32, 32, 64] + - [196, 4860.0] + - - [32, 64, 4608, 35, 32, 32, 32, 64] + - [123, 4873.0] + - - [34, 64, 4736, 24, 34, 34, 34, 64] + - [133, 3300.0] + - - [34, 64, 4736, 34, 34, 34, 34, 64] + - [125, 3246.0] + - - [35, 64, 4608, 35, 35, 35, 35, 64] + - [162, 3326.0] + - - [64, 32, 4608, 32, 64, 64, 64, 32] + - [160, 4876.0] + - - [64, 32, 4608, 35, 64, 64, 64, 32] + - [125, 4769.0] + - - [64, 34, 4736, 24, 64, 64, 64, 34] + - [133, 3347.0] + - - [64, 34, 4736, 34, 64, 64, 64, 34] + - [125, 3336.0] + - - [64, 35, 4608, 35, 64, 64, 64, 35] + - [197, 3406.0] + - - [256, 864, 1, 1, 256, 256, 256, 864] + - [194, 109.0] + - - [512, 256, 1, 3456, 512, 512, 512, 256] + - [190, 5065.0] + - - [512, 256, 1, 864, 512, 512, 512, 256] + - [137, 4683.0] + - - [49, 1024, 64, 2048, 49, 49, 49, 1024] + - [139, 5172.0] + - - [49, 2048, 64, 1024, 49, 49, 49, 2048] + - [139, 5220.0] + - - [49, 1024, 32, 2048, 49, 49, 49, 1024] + - [206, 5093.0] + - - [49, 2048, 32, 1024, 49, 49, 49, 2048] + - [139, 5147.0] + - - [49, 4608, 1, 512, 49, 49, 49, 4608] + - [167, 3981.0] + - - [56, 512, 64, 512, 56, 56, 56, 512] + - [186, 5753.0] + - - [228, 256, 2, 12, 228, 228, 228, 256] + - [199, 661.0] + - - [228, 256, 2, 3, 228, 228, 228, 256] + - [130, 222.0] + - - [187, 256, 2, 12, 187, 187, 187, 256] + - [130, 631.0] + - - [247, 256, 2, 12, 247, 247, 247, 256] + - [124, 751.0] + - - [176, 256, 2, 3, 176, 176, 176, 256] + - [130, 196.0] + - - [187, 256, 2, 3, 187, 187, 187, 256] + - [130, 189.0] + - - [221, 256, 2, 3, 221, 221, 221, 256] + - [130, 212.0] + - - [221, 256, 2, 12, 221, 221, 221, 256] + - [124, 715.0] + - - [176, 256, 2, 12, 176, 176, 176, 256] + - [124, 659.0] + - - [247, 256, 2, 3, 247, 247, 247, 256] + - [130, 221.0] + - - [216, 256, 2, 3, 216, 216, 216, 256] + - [130, 221.0] + - - [192, 256, 2, 12, 192, 192, 192, 256] + - [126, 694.0] + - - [192, 256, 2, 3, 192, 192, 192, 256] + - [130, 214.0] + - - [216, 256, 2, 12, 216, 216, 216, 256] + - [124, 713.0] + - - [32, 32, 36, 43808, 32, 32, 32, 32] + - [220, 3019.0] + - - [32, 32, 64, 20000, 32, 32, 32, 32] + - [149, 2993.0] + - - [256, 128, 1, 32768, 256, 256, 256, 128] + - [221, 4984.0] + - - [3584, 4, 1, 1280, 3584, 3584, 3584, 4] + - [227, 1166.0] + - - [2944, 4, 1, 256, 2944, 2944, 2944, 4] + - [228, 658.0] + - - [2368, 4, 1, 1280, 2368, 2368, 2368, 4] + - [226, 859.0] + - - [6784, 4, 1, 1280, 6784, 6784, 6784, 4] + - [233, 1604.0] + - - [1856, 4, 1, 1280, 1856, 1856, 1856, 4] + - [226, 674.0] + - - [2944, 4, 1, 128, 2944, 2944, 2944, 4] + - [231, 446.0] + - - [3584, 4, 1, 128, 3584, 3584, 3584, 4] + - [228, 549.0] + - - [4288, 4, 1, 256, 4288, 4288, 4288, 4] + - [231, 871.0] + - - [3584, 4, 1, 3328, 3584, 3584, 3584, 4] + - [229, 1302.0] + - - [5888, 4, 1, 128, 5888, 5888, 5888, 4] + - [223, 833.0] + - - [2368, 4, 1, 256, 2368, 2368, 2368, 4] + - [228, 536.0] + - - [1408, 4, 1, 256, 1408, 1408, 1408, 4] + - [159, 334.0] + - - [5056, 4, 1, 1280, 5056, 5056, 5056, 4] + - [225, 1459.0] + - - [1408, 4, 1, 3328, 1408, 1408, 1408, 4] + - [234, 576.0] + - - [6784, 4, 1, 128, 6784, 6784, 6784, 4] + - [223, 949.0] + - - [5888, 4, 1, 3328, 5888, 5888, 5888, 4] + - [229, 1662.0] + - - [5056, 4, 1, 128, 5056, 5056, 5056, 4] + - [222, 740.0] + - - [5888, 4, 1, 1280, 5888, 5888, 5888, 4] + - [233, 1402.0] + - - [2944, 4, 1, 3328, 2944, 2944, 2944, 4] + - [225, 1089.0] + - - [2368, 4, 1, 128, 2368, 2368, 2368, 4] + - [159, 377.0] + - - [1856, 4, 1, 128, 1856, 1856, 1856, 4] + - [145, 295.0] + - - [1408, 4, 1, 1280, 1408, 1408, 1408, 4] + - [232, 518.0] + - - [6784, 4, 1, 256, 6784, 6784, 6784, 4] + - [223, 1190.0] + - - [4288, 4, 1, 128, 4288, 4288, 4288, 4] + - [228, 624.0] + - - [1856, 4, 1, 3328, 1856, 1856, 1856, 4] + - [226, 750.0] + - - [3584, 4, 1, 256, 3584, 3584, 3584, 4] + - [228, 749.0] + - - [2368, 4, 1, 3328, 2368, 2368, 2368, 4] + - [226, 950.0] + - - [6784, 4, 1, 3328, 6784, 6784, 6784, 4] + - [228, 1541.0] + - - [4288, 4, 1, 1280, 4288, 4288, 4288, 4] + - [225, 1174.0] + - - [1856, 4, 1, 256, 1856, 1856, 1856, 4] + - [199, 428.0] + - - [1408, 4, 1, 128, 1408, 1408, 1408, 4] + - [159, 234.0] + - - [5056, 4, 1, 256, 5056, 5056, 5056, 4] + - [224, 984.0] + - - [4288, 4, 1, 3328, 4288, 4288, 4288, 4] + - [229, 1537.0] + - - [2944, 4, 1, 1280, 2944, 2944, 2944, 4] + - [225, 989.0] + - - [5888, 4, 1, 256, 5888, 5888, 5888, 4] + - [223, 1112.0] + - - [5056, 4, 1, 3328, 5056, 5056, 5056, 4] + - [230, 1808.0] + - - [2048, 1, 1, 512, 2048, 2048, 2048, 1] + - [226, 149.0] + - - [2048, 1, 1, 960, 2048, 2048, 2048, 1] + - [226, 177.0] + - - [2048, 2, 1, 2, 2048, 2048, 2048, 2] + - [222, 7.0] + - - [2560, 2, 1, 4, 2560, 2560, 2560, 2] + - [222, 16.0] + - - [2048, 2, 1, 8, 2048, 2048, 2048, 2] + - [130, 24.0] + - - [2560, 2, 1, 2, 2560, 2560, 2560, 2] + - [222, 8.0] + - - [4, 1856, 1, 3328, 4, 4, 4, 1856] + - [245, 731.0] + - - [4, 2944, 1, 1280, 4, 4, 4, 2944] + - [245, 866.0] + - - [4, 1408, 1, 128, 4, 4, 4, 1408] + - [140, 228.0] + - - [4, 2368, 1, 1280, 4, 4, 4, 2368] + - [249, 855.0] + - - [4, 3584, 1, 128, 4, 4, 4, 3584] + - [247, 537.0] + - - [4, 5888, 1, 3328, 4, 4, 4, 5888] + - [239, 1035.0] + - - [4, 1408, 1, 3328, 4, 4, 4, 1408] + - [237, 577.0] + - - [4, 6784, 1, 3328, 4, 4, 4, 6784] + - [242, 1004.0] + - - [4, 4288, 1, 128, 4, 4, 4, 4288] + - [238, 613.0] + - - [4, 5056, 1, 3328, 4, 4, 4, 5056] + - [249, 1156.0] + - - [4, 6784, 1, 1280, 4, 4, 4, 6784] + - [249, 1005.0] + - - [4, 2944, 1, 3328, 4, 4, 4, 2944] + - [250, 923.0] + - - [4, 5056, 1, 256, 4, 4, 4, 5056] + - [238, 846.0] + - - [4, 5056, 1, 1280, 4, 4, 4, 5056] + - [245, 1103.0] + - - [4, 2368, 1, 3328, 4, 4, 4, 2368] + - [237, 942.0] + - - [4, 1856, 1, 256, 4, 4, 4, 1856] + - [171, 420.0] + - - [4, 2368, 1, 256, 4, 4, 4, 2368] + - [243, 503.0] + - - [4, 2944, 1, 256, 4, 4, 4, 2944] + - [247, 605.0] + - - [4, 4288, 1, 1280, 4, 4, 4, 4288] + - [241, 979.0] + - - [4, 6784, 1, 128, 4, 4, 4, 6784] + - [247, 752.0] + - - [4, 3584, 1, 1280, 4, 4, 4, 3584] + - [239, 988.0] + - - [4, 5888, 1, 256, 4, 4, 4, 5888] + - [242, 866.0] + - - [4, 6784, 1, 256, 4, 4, 4, 6784] + - [239, 835.0] + - - [4, 1408, 1, 1280, 4, 4, 4, 1408] + - [249, 526.0] + - - [4, 3584, 1, 256, 4, 4, 4, 3584] + - [243, 722.0] + - - [4, 1408, 1, 256, 4, 4, 4, 1408] + - [244, 323.0] + - - [4, 4288, 1, 3328, 4, 4, 4, 4288] + - [237, 1050.0] + - - [4, 5888, 1, 1280, 4, 4, 4, 5888] + - [235, 995.0] + - - [4, 1856, 1, 1280, 4, 4, 4, 1856] + - [249, 683.0] + - - [4, 1856, 1, 128, 4, 4, 4, 1856] + - [171, 299.0] + - - [4, 2944, 1, 128, 4, 4, 4, 2944] + - [248, 441.0] + - - [4, 5056, 1, 128, 4, 4, 4, 5056] + - [235, 703.0] + - - [4, 4288, 1, 256, 4, 4, 4, 4288] + - [243, 757.0] + - - [4, 3584, 1, 3328, 4, 4, 4, 3584] + - [245, 1053.0] + - - [4, 5888, 1, 128, 4, 4, 4, 5888] + - [246, 650.0] + - - [4, 2368, 1, 128, 4, 4, 4, 2368] + - [171, 367.0] + - - [49, 1200, 1, 128, 49, 49, 49, 1200] + - [236, 1882.0] + - - [1, 1152, 1, 256, 1, 1, 1, 1152] + - [240, 68.0] + - - [25, 1152, 1, 256, 25, 25, 25, 1152] + - [132, 1582.0] + - - [9, 1152, 1, 256, 9, 9, 9, 1152] + - [240, 617.0] + - - [16, 32, 36, 5760, 16, 16, 16, 32] + - [213, 2330.0] + - - [3, 64, 36, 6272, 3, 3, 3, 64] + - [219, 520.0] + - - [3, 64, 64, 46208, 3, 3, 3, 64] + - [189, 490.0] + - - [3, 64, 64, 92416, 3, 3, 3, 64] + - [176, 489.0] + - - [1, 16, 36, 23040, 1, 1, 1, 16] + - [219, 161.0] + - - [1, 16, 64, 10240, 1, 1, 1, 16] + - [209, 162.0] + - - [3, 64, 36, 25088, 3, 3, 3, 64] + - [218, 477.0] + - - [3, 64, 64, 11552, 3, 3, 3, 64] + - [212, 499.0] + - - [3, 64, 36, 200704, 3, 3, 3, 64] + - [215, 474.0] + - - [3, 64, 64, 23104, 3, 3, 3, 64] + - [203, 490.0] + - - [3, 64, 36, 100352, 3, 3, 3, 64] + - [189, 474.0] + - - [3, 64, 36, 50176, 3, 3, 3, 64] + - [148, 475.0] + - - [8, 384, 64, 6600, 8, 8, 8, 384] + - [207, 1348.0] + - - [65, 1024, 1, 6400, 65, 65, 65, 1024] + - [208, 3780.0] + - - [13, 512, 1, 32768, 13, 13, 13, 512] + - [209, 1916.0] + - - [256, 1, 1, 32768, 256, 256, 256, 1] + - [214, 110.0] + - - [256, 4, 1, 6912, 256, 256, 256, 4] + - [210, 350.0] + - - [13, 512, 1, 55296, 13, 13, 13, 512] + - [213, 1833.0] + - - [1024, 2, 1, 4992, 1024, 1024, 1024, 2] + - [217, 307.0] + - - [1024, 2, 1, 5120, 1024, 1024, 1024, 2] + - [217, 315.0] + - - [1024, 2, 1, 5248, 1024, 1024, 1024, 2] + - [210, 315.0] + - - [13, 512, 1, 6912, 13, 13, 13, 512] + - [216, 1650.0] + - - [256, 1, 1, 6912, 256, 256, 256, 1] + - [210, 87.0] + - - [256, 128, 1, 6912, 256, 256, 256, 128] + - [211, 4439.0] + - - [768, 2, 1, 4608, 768, 768, 768, 2] + - [217, 301.0] + - - [1024, 2, 1, 4608, 1024, 1024, 1024, 2] + - [217, 311.0] + - - [1024, 64, 1, 512, 1024, 1024, 1024, 64] + - [200, 2954.0] + - - [512, 32, 1, 200, 512, 512, 512, 32] + - [161, 836.0] + - - [4, 704, 1, 1280, 4, 4, 4, 704] + - [189, 265.0] + - - [128, 64, 1, 256, 128, 128, 128, 64] + - [201, 552.0] + - - [64, 4, 1, 256, 64, 64, 64, 4] + - [132, 16.0] + - - [64, 704, 1, 128, 64, 64, 64, 704] + - [124, 1737.0] + - - [448, 64, 1, 1280, 448, 448, 448, 64] + - [185, 2440.0] + - - [128, 4, 1, 1280, 128, 128, 128, 4] + - [132, 52.0] + - - [64, 1024, 1, 1280, 64, 64, 64, 1024] + - [188, 3898.0] + - - [64, 704, 1, 1280, 64, 64, 64, 704] + - [162, 2705.0] + - - [1024, 64, 1, 128, 1024, 1024, 1024, 64] + - [141, 1762.0] + - - [1024, 64, 1, 1280, 1024, 1024, 1024, 64] + - [186, 3884.0] + - - [4, 704, 1, 256, 4, 4, 4, 704] + - [199, 159.0] + - - [704, 4, 1, 1280, 704, 704, 704, 4] + - [136, 280.0] + - - [64, 448, 1, 256, 64, 64, 64, 448] + - [185, 1756.0] + - - [64, 1024, 1, 128, 64, 64, 64, 1024] + - [164, 2087.0] + - - [4, 64, 1, 1280, 4, 4, 4, 64] + - [132, 26.0] + - - [128, 256, 1, 3328, 128, 128, 128, 256] + - [183, 2405.0] + - - [64, 448, 1, 1280, 64, 64, 64, 448] + - [185, 2434.0] + - - [448, 4, 1, 256, 448, 448, 448, 4] + - [132, 112.0] + - - [448, 4, 1, 1280, 448, 448, 448, 4] + - [166, 183.0] + - - [128, 4, 1, 128, 128, 128, 128, 4] + - [161, 22.0] + - - [256, 4, 1, 128, 256, 256, 256, 4] + - [132, 43.0] + - - [704, 64, 1, 3328, 704, 704, 704, 64] + - [147, 2937.0] + - - [64, 128, 1, 256, 64, 64, 64, 128] + - [201, 549.0] + - - [704, 64, 1, 128, 704, 704, 704, 64] + - [124, 1727.0] + - - [1024, 4, 1, 256, 1024, 1024, 1024, 4] + - [185, 231.0] + - - [256, 256, 1, 128, 256, 256, 256, 256] + - [191, 2066.0] + - - [64, 256, 1, 128, 64, 64, 64, 256] + - [132, 765.0] + - - [704, 64, 1, 1280, 704, 704, 704, 64] + - [162, 2705.0] + - - [128, 448, 1, 256, 128, 128, 128, 448] + - [164, 2383.0] + - - [512, 32, 1, 512, 512, 512, 512, 32] + - [201, 1403.0] + - - [128, 256, 1, 1280, 128, 128, 128, 256] + - [128, 2348.0] + - - [448, 64, 1, 3328, 448, 448, 448, 64] + - [185, 2600.0] + - - [256, 128, 1, 128, 256, 256, 256, 128] + - [185, 1407.0] + - - [64, 128, 1, 3328, 64, 64, 64, 128] + - [138, 963.0] + - - [128, 128, 1, 3328, 128, 128, 128, 128] + - [138, 1912.0] + - - [256, 128, 1, 256, 256, 256, 256, 128] + - [201, 1577.0] + - - [64, 448, 1, 3328, 64, 64, 64, 448] + - [130, 2596.0] + - - [256, 256, 1, 3328, 256, 256, 256, 256] + - [188, 4227.0] + - - [1024, 4, 1, 3328, 1024, 1024, 1024, 4] + - [136, 447.0] + - - [4, 4, 1, 256, 4, 4, 4, 4] + - [122, 1.0] + - - [256, 64, 1, 256, 256, 256, 256, 64] + - [201, 1098.0] + - - [256, 128, 1, 1280, 256, 256, 256, 128] + - [128, 2297.0] + - - [128, 64, 1, 1280, 128, 128, 128, 64] + - [166, 858.0] + - - [4, 448, 1, 3328, 4, 4, 4, 448] + - [138, 208.0] + - - [64, 1024, 1, 256, 64, 64, 64, 1024] + - [186, 2655.0] + - - [256, 4, 1, 1280, 256, 256, 256, 4] + - [138, 104.0] + - - [64, 704, 1, 256, 64, 64, 64, 704] + - [122, 2090.0] + - - [4, 704, 1, 128, 4, 4, 4, 704] + - [130, 119.0] + - - [512, 16, 1, 512, 512, 512, 512, 16] + - [132, 666.0] + - - [448, 128, 1, 256, 448, 448, 448, 128] + - [196, 2497.0] + - - [448, 64, 1, 128, 448, 448, 448, 64] + - [199, 1266.0] + - - [4, 448, 1, 1280, 4, 4, 4, 448] + - [193, 183.0] + - - [256, 256, 1, 256, 256, 256, 256, 256] + - [170, 2769.0] + - - [256, 64, 1, 128, 256, 256, 256, 64] + - [201, 765.0] + - - [4, 1024, 1, 3328, 4, 4, 4, 1024] + - [134, 408.0] + - - [64, 1024, 1, 3328, 64, 64, 64, 1024] + - [162, 4227.0] + - - [704, 4, 1, 128, 704, 704, 704, 4] + - [132, 118.0] + - - [256, 4, 1, 256, 256, 256, 256, 4] + - [132, 63.0] + - - [256, 4, 1, 3328, 256, 256, 256, 4] + - [138, 119.0] + - - [4, 256, 1, 256, 4, 4, 4, 256] + - [161, 64.0] + - - [4, 4, 1, 128, 4, 4, 4, 4] + - [122, 1.0] + - - [4, 128, 1, 256, 4, 4, 4, 128] + - [161, 32.0] + - - [64, 64, 1, 1280, 64, 64, 64, 64] + - [201, 396.0] + - - [448, 128, 1, 3328, 448, 448, 448, 128] + - [188, 3717.0] + - - [4, 448, 1, 128, 4, 4, 4, 448] + - [141, 71.0] + - - [64, 256, 1, 1280, 64, 64, 64, 256] + - [138, 1702.0] + - - [1024, 32, 1, 512, 1024, 1024, 1024, 32] + - [159, 2156.0] + - - [4, 128, 1, 3328, 4, 4, 4, 128] + - [138, 60.0] + - - [64, 4, 1, 128, 64, 64, 64, 4] + - [132, 11.0] + - - [64, 64, 1, 256, 64, 64, 64, 64] + - [201, 276.0] + - - [4, 704, 1, 3328, 4, 4, 4, 704] + - [134, 300.0] + - - [4, 4, 1, 1280, 4, 4, 4, 4] + - [132, 2.0] + - - [128, 128, 1, 128, 128, 128, 128, 128] + - [132, 765.0] + - - [1024, 4, 1, 128, 1024, 1024, 1024, 4] + - [132, 175.0] + - - [64, 64, 1, 3328, 64, 64, 64, 64] + - [166, 484.0] + - - [4, 64, 1, 128, 4, 4, 4, 64] + - [132, 11.0] + - - [64, 128, 1, 1280, 64, 64, 64, 128] + - [193, 830.0] + - - [128, 128, 1, 1280, 128, 128, 128, 128] + - [138, 1697.0] + - - [128, 256, 1, 256, 128, 128, 128, 256] + - [185, 1898.0] + - - [256, 64, 1, 1280, 256, 256, 256, 64] + - [138, 1699.0] + - - [1024, 4, 1, 1280, 1024, 1024, 1024, 4] + - [191, 402.0] + - - [704, 64, 1, 256, 704, 704, 704, 64] + - [122, 2090.0] + - - [128, 448, 1, 1280, 128, 128, 128, 448] + - [162, 3475.0] + - - [128, 64, 1, 3328, 128, 128, 128, 64] + - [151, 957.0] + - - [448, 64, 1, 256, 448, 448, 448, 64] + - [185, 1748.0] + - - [1024, 16, 1, 512, 1024, 1024, 1024, 16] + - [164, 1315.0] + - - [4, 256, 1, 128, 4, 4, 4, 256] + - [161, 44.0] + - - [1024, 64, 1, 256, 1024, 1024, 1024, 64] + - [200, 2672.0] + - - [64, 128, 1, 128, 64, 64, 64, 128] + - [161, 383.0] + - - [4, 4, 1, 3328, 4, 4, 4, 4] + - [130, 2.0] + - - [4, 1024, 1, 1280, 4, 4, 4, 1024] + - [189, 378.0] + - - [704, 4, 1, 256, 704, 704, 704, 4] + - [185, 170.0] + - - [128, 4, 1, 3328, 128, 128, 128, 4] + - [138, 60.0] + - - [448, 4, 1, 3328, 448, 448, 448, 4] + - [193, 208.0] + - - [704, 4, 1, 3328, 704, 704, 704, 4] + - [164, 310.0] + - - [448, 128, 1, 1280, 448, 448, 448, 128] + - [133, 3408.0] + - - [1024, 64, 1, 3328, 1024, 1024, 1024, 64] + - [188, 4248.0] + - - [4, 1024, 1, 128, 4, 4, 4, 1024] + - [168, 160.0] + - - [64, 256, 1, 3328, 64, 64, 64, 256] + - [138, 1900.0] + - - [448, 128, 1, 128, 448, 448, 448, 128] + - [132, 1963.0] + - - [128, 256, 1, 128, 128, 128, 128, 256] + - [201, 1362.0] + - - [128, 4, 1, 256, 128, 128, 128, 4] + - [187, 32.0] + - - [256, 256, 1, 1280, 256, 256, 256, 256] + - [202, 3855.0] + - - [256, 128, 1, 3328, 256, 256, 256, 128] + - [193, 2408.0] + - - [448, 4, 1, 128, 448, 448, 448, 4] + - [201, 75.0] + - - [4, 256, 1, 3328, 4, 4, 4, 256] + - [138, 119.0] + - - [4, 128, 1, 128, 4, 4, 4, 128] + - [161, 22.0] + - - [4, 256, 1, 1280, 4, 4, 4, 256] + - [138, 105.0] + - - [64, 4, 1, 3328, 64, 64, 64, 4] + - [138, 30.0] + - - [4, 64, 1, 3328, 4, 4, 4, 64] + - [138, 30.0] + - - [4, 1024, 1, 256, 4, 4, 4, 1024] + - [143, 238.0] + - - [64, 256, 1, 256, 64, 64, 64, 256] + - [132, 1098.0] + - - [4, 64, 1, 256, 4, 4, 4, 64] + - [132, 16.0] + - - [128, 448, 1, 128, 128, 128, 128, 448] + - [126, 1995.0] + - - [64, 448, 1, 128, 64, 64, 64, 448] + - [145, 1248.0] + - - [64, 704, 1, 3328, 64, 64, 64, 704] + - [202, 2931.0] + - - [128, 448, 1, 3328, 128, 128, 128, 448] + - [188, 3739.0] + - - [4, 448, 1, 256, 4, 4, 4, 448] + - [132, 105.0] + - - [4, 128, 1, 1280, 4, 4, 4, 128] + - [132, 52.0] + - - [128, 64, 1, 128, 128, 128, 128, 64] + - [201, 386.0] + - - [64, 64, 1, 128, 64, 64, 64, 64] + - [132, 193.0] + - - [64, 4, 1, 1280, 64, 64, 64, 4] + - [132, 26.0] + - - [256, 64, 1, 3328, 256, 256, 256, 64] + - [151, 1900.0] + - - [128, 128, 1, 256, 128, 128, 128, 128] + - [201, 1092.0] + - - [64, 23, 2720, 23, 64, 64, 64, 23] + - [125, 3990.0] + - - [64, 19, 3264, 19, 64, 64, 64, 19] + - [125, 3270.0] + - - [64, 25, 2512, 25, 64, 64, 64, 25] + - [131, 4035.0] + - - [64, 9, 6544, 9, 64, 64, 64, 9] + - [123, 1313.0] + - - [64, 7, 8192, 7, 64, 64, 64, 7] + - [125, 918.0] + - - [64, 8, 7280, 8, 64, 64, 64, 8] + - [125, 1180.0] + - - [64, 27, 2336, 27, 64, 64, 64, 27] + - [131, 4496.0] + - - [64, 16, 3840, 16, 64, 64, 64, 16] + - [130, 2945.0] + - - [64, 11, 5456, 11, 64, 64, 64, 11] + - [125, 1756.0] + - - [64, 21, 2976, 21, 64, 64, 64, 21] + - [125, 3630.0] + - - [64, 15, 4096, 15, 64, 64, 64, 15] + - [125, 2628.0] + - - [64, 10, 5952, 10, 64, 64, 64, 10] + - [142, 1483.0] + - - [64, 14, 4368, 14, 64, 64, 64, 14] + - [125, 2427.0] + - - [64, 13, 4672, 13, 64, 64, 64, 13] + - [125, 2189.0] + - - [64, 12, 5040, 12, 64, 64, 64, 12] + - [142, 1966.0] + - - [64, 29, 2176, 29, 64, 64, 64, 29] + - [154, 4677.0] + - - [64, 17, 3632, 17, 64, 64, 64, 17] + - [181, 2891.0] + - - [64, 18, 3440, 18, 64, 64, 64, 18] + - [154, 3091.0] + - - [768, 2, 1, 16, 768, 768, 768, 2] + - [124, 15.0] + - - [768, 2, 1, 32, 768, 768, 768, 2] + - [124, 27.0] + - - [3, 64, 64, 2888, 3, 3, 3, 64] + - [134, 500.0] + - - [1, 16, 64, 640, 1, 1, 1, 16] + - [189, 79.0] + - - [512, 24, 36, 800, 512, 512, 512, 24] + - [131, 4615.0] + - - [16, 32, 36, 360, 16, 16, 16, 32] + - [143, 1189.0] + - - [1, 16, 36, 1440, 1, 1, 1, 16] + - [136, 58.0] + - - [512, 24, 64, 512, 512, 512, 512, 24] + - [146, 4780.0] + - - [3, 64, 36, 3136, 3, 3, 3, 64] + - [183, 477.0] + - - [256, 24, 64, 32, 256, 256, 256, 24] + - [125, 2568.0] + - - [256, 16, 36, 3200, 256, 256, 256, 16] + - [159, 2925.0] + - - [256, 16, 36, 32, 256, 256, 256, 16] + - [199, 1616.0] + - - [512, 24, 36, 288, 512, 512, 512, 24] + - [131, 4427.0] + - - [512, 24, 64, 128, 512, 512, 512, 24] + - [131, 4502.0] + - - [3, 64, 64, 1444, 3, 3, 3, 64] + - [134, 498.0] + - - [16, 32, 64, 160, 16, 16, 16, 32] + - [128, 1440.0] + - - [256, 16, 64, 32, 256, 256, 256, 16] + - [185, 2243.0] + - - [256, 16, 64, 1568, 256, 256, 256, 16] + - [164, 3258.0] + - - [256, 24, 36, 128, 256, 256, 256, 24] + - [200, 3174.0] + - - [16, 32, 64, 2560, 16, 16, 16, 32] + - [157, 2395.0] + - - [49, 800, 1, 128, 49, 49, 49, 800] + - [124, 1459.0] + - - [64, 12, 2520, 12, 64, 64, 64, 12] + - [170, 1798.0] + - - [64, 13, 2336, 13, 64, 64, 64, 13] + - [142, 1993.0] + - - [64, 14, 2184, 14, 64, 64, 64, 14] + - [142, 2192.0] + - - [64, 15, 2048, 15, 64, 64, 64, 15] + - [197, 2386.0] + - - [64, 16, 1920, 16, 64, 64, 64, 16] + - [130, 2769.0] + - - [64, 17, 1816, 17, 64, 64, 64, 17] + - [154, 2616.0] + - - [64, 18, 1720, 18, 64, 64, 64, 18] + - [154, 2808.0] + - - [64, 19, 1632, 19, 64, 64, 64, 19] + - [154, 2955.0] + - - [64, 21, 1488, 21, 64, 64, 64, 21] + - [125, 3266.0] + - - [64, 23, 1360, 23, 64, 64, 64, 23] + - [181, 3483.0] + - - [64, 25, 1256, 25, 64, 64, 64, 25] + - [125, 3700.0] + - - [64, 27, 1168, 27, 64, 64, 64, 27] + - [181, 4116.0] + - - [64, 29, 1088, 29, 64, 64, 64, 29] + - [154, 4403.0] + - - [1024, 2, 1, 512, 1024, 1024, 1024, 2] + - [185, 151.0] + - - [1024, 2, 1, 3072, 1024, 1024, 1024, 2] + - [136, 222.0] + - - [1024, 2, 1, 6, 1024, 1024, 1024, 2] + - [130, 9.0] + - - [3, 64, 512, 3, 3, 3, 3, 64] + - [153, 101.0] + - - [9, 64, 512, 9, 9, 9, 9, 64] + - [128, 725.0] + - - [1024, 1, 1, 200, 1024, 1024, 1024, 1] + - [185, 55.0] + - - [5, 64, 512, 5, 5, 5, 5, 64] + - [153, 264.0] + - - [1024, 2, 1, 1, 1024, 1024, 1024, 2] + - [130, 2.0] + - - [1024, 2, 1, 2048, 1024, 1024, 1024, 2] + - [136, 214.0] + - - [17, 64, 1, 15, 17, 17, 17, 64] + - [130, 9.0] + - - [17, 64, 1, 17, 17, 17, 17, 64] + - [148, 12.0] + - - [30, 64, 1, 30, 30, 30, 30, 64] + - [138, 31.0] + - - [30, 64, 1, 31, 30, 30, 30, 64] + - [138, 33.0] + - - [31, 64, 1, 31, 31, 31, 31, 64] + - [138, 33.0] + - - [64, 17, 1, 15, 64, 64, 64, 17] + - [130, 11.0] + - - [64, 17, 1, 17, 64, 64, 64, 17] + - [138, 12.0] + - - [64, 30, 1, 30, 64, 64, 64, 30] + - [138, 32.0] + - - [64, 30, 1, 31, 64, 64, 64, 30] + - [138, 33.0] + - - [64, 31, 1, 31, 64, 64, 64, 31] + - [138, 34.0] + - - [14, 64, 1, 14, 14, 14, 14, 64] + - [187, 9.0] + - - [15, 64, 1, 14, 15, 15, 15, 64] + - [124, 9.0] + - - [15, 64, 1, 15, 15, 15, 15, 64] + - [128, 10.0] + - - [64, 14, 1, 14, 64, 64, 64, 14] + - [189, 10.0] + - - [64, 15, 1, 14, 64, 64, 64, 15] + - [189, 10.0] + - - [64, 15, 1, 15, 64, 64, 64, 15] + - [157, 10.0] + - - [1024, 2, 1, 32, 1024, 1024, 1024, 2] + - [124, 37.0] + - - [1024, 2, 1, 4, 1024, 1024, 1024, 2] + - [130, 7.0] + - - [512, 32, 1, 1600, 512, 512, 512, 32] + - [174, 1647.0] + - - [1024, 64, 1, 960, 1024, 1024, 1024, 64] + - [131, 3767.0] + - - [512, 64, 1, 512, 512, 512, 512, 64] + - [132, 2066.0] + - - [384, 128, 1, 128, 384, 384, 384, 128] + - [124, 1872.0] + - - [384, 128, 1, 256, 384, 384, 384, 128] + - [124, 2263.0] + - - [64, 64, 1, 64, 64, 64, 64, 64] + - [140, 125.0] + - - [256, 4, 1, 4096, 256, 256, 256, 4] + - [193, 120.0] + - - [25, 256, 120, 128, 25, 25, 25, 256] + - [153, 4304.0] + - - [25, 256, 18, 128, 25, 25, 25, 256] + - [153, 2516.0] + - - [25, 256, 19, 128, 25, 25, 25, 256] + - [153, 2620.0] + - - [9, 256, 120, 128, 9, 9, 9, 256] + - [183, 1657.0] + - - [9, 256, 18, 128, 9, 9, 9, 256] + - [128, 1075.0] + - - [9, 256, 19, 128, 9, 9, 9, 256] + - [128, 1116.0] + - - [1024, 2, 1, 10, 1024, 1024, 1024, 2] + - [130, 14.0] + - - [1024, 2, 1, 1280, 1024, 1024, 1024, 2] + - [204, 184.0] + - - [1024, 2, 1, 39, 1024, 1024, 1024, 2] + - [124, 39.0] + - - [1024, 2, 1, 40, 1024, 1024, 1024, 2] + - [155, 43.0] + - - [1024, 2, 1, 41, 1024, 1024, 1024, 2] + - [155, 43.0] + - - [1024, 2, 1, 5, 1024, 1024, 1024, 2] + - [130, 8.0] + - - [1024, 2, 1, 2560, 1024, 1024, 1024, 2] + - [136, 219.0] + - - [1024, 2, 1, 8, 1024, 1024, 1024, 2] + - [130, 12.0] + - - [1024, 2, 1, 1024, 1024, 1024, 1024, 2] + - [164, 194.0] + - - [1024, 2, 1, 9, 1024, 1024, 1024, 2] + - [130, 14.0] + - - [1024, 2, 1, 1152, 1024, 1024, 1024, 2] + - [136, 199.0] + - - [4, 64, 32768, 4, 4, 4, 4, 64] + - [153, 377.0] + - - [4, 64, 38400, 4, 4, 4, 4, 64] + - [123, 367.0] + - - [64, 4, 32768, 4, 64, 64, 64, 4] + - [154, 376.0] + - - [64, 4, 38400, 4, 64, 64, 64, 4] + - [125, 376.0] + - - [14, 64, 10880, 14, 14, 14, 14, 64] + - [153, 2235.0] + - - [15, 64, 10880, 14, 15, 15, 15, 64] + - [180, 2293.0] + - - [15, 64, 7680, 15, 15, 15, 15, 64] + - [196, 2602.0] + - - [15, 64, 10880, 15, 15, 15, 15, 64] + - [180, 2382.0] + - - [17, 64, 7680, 15, 17, 17, 17, 64] + - [180, 2041.0] + - - [17, 64, 6144, 17, 17, 17, 17, 64] + - [153, 2327.0] + - - [17, 64, 7680, 17, 17, 17, 17, 64] + - [123, 2365.0] + - - [21, 64, 6144, 17, 21, 21, 21, 64] + - [153, 2601.0] + - - [21, 64, 6144, 21, 21, 21, 21, 64] + - [123, 3044.0] + - - [24, 64, 4736, 24, 24, 24, 24, 64] + - [123, 4015.0] + - - [30, 64, 2048, 30, 30, 30, 30, 64] + - [169, 3959.0] + - - [30, 64, 2048, 31, 30, 30, 30, 64] + - [123, 3902.0] + - - [31, 64, 2048, 31, 31, 31, 31, 64] + - [169, 4061.0] + - - [64, 14, 10880, 14, 64, 64, 64, 14] + - [142, 2349.0] + - - [64, 15, 10880, 14, 64, 64, 64, 15] + - [125, 2427.0] + - - [64, 15, 7680, 15, 64, 64, 64, 15] + - [142, 2748.0] + - - [64, 15, 10880, 15, 64, 64, 64, 15] + - [125, 2509.0] + - - [64, 17, 7680, 15, 64, 64, 64, 17] + - [125, 2876.0] + - - [64, 17, 6144, 17, 64, 64, 64, 17] + - [125, 2902.0] + - - [64, 17, 7680, 17, 64, 64, 64, 17] + - [125, 3054.0] + - - [64, 21, 6144, 17, 64, 64, 64, 21] + - [125, 3218.0] + - - [64, 21, 6144, 21, 64, 64, 64, 21] + - [125, 3850.0] + - - [64, 24, 4736, 24, 64, 64, 64, 24] + - [125, 4231.0] + - - [64, 30, 2048, 30, 64, 64, 64, 30] + - [154, 4999.0] + - - [64, 30, 2048, 31, 64, 64, 64, 30] + - [170, 5071.0] + - - [64, 31, 2048, 31, 64, 64, 64, 31] + - [170, 5133.0] + - - [64, 512, 1, 512, 64, 64, 64, 512] + - [130, 2002.0] + - - [5, 64, 1, 5, 5, 5, 5, 64] + - [122, 1.0] + - - [33, 32, 1, 33, 33, 33, 33, 32] + - [124, 18.0] + - - [1024, 1, 1, 1600, 1024, 1024, 1024, 1] + - [136, 104.0] + - - [5, 64, 960, 5, 5, 5, 5, 64] + - [153, 361.0] + - - [27, 128, 32768, 27, 27, 27, 27, 128] + - [190, 2012.0] + - - [1024, 2, 1, 16, 1024, 1024, 1024, 2] + - [138, 17.0] + - - [1024, 2, 1, 64, 1024, 1024, 1024, 2] + - [126, 56.0] + - - [13, 512, 1, 3456, 13, 13, 13, 512] + - [166, 764.0] + - - [13, 512, 1, 4096, 13, 13, 13, 512] + - [138, 783.0] + - - [13, 512, 1, 864, 13, 13, 13, 512] + - [187, 564.0] + - - [256, 1, 1, 3456, 256, 256, 256, 1] + - [151, 30.0] + - - [256, 1, 1, 4096, 256, 256, 256, 1] + - [136, 30.0] + - - [256, 1, 1, 864, 256, 256, 256, 1] + - [132, 24.0] + - - [256, 128, 1, 3456, 256, 256, 256, 128] + - [128, 2415.0] + - - [256, 128, 1, 4096, 256, 256, 256, 128] + - [128, 2429.0] + - - [256, 128, 1, 864, 256, 256, 256, 128] + - [132, 2208.0] + - - [1024, 2, 1, 80, 1024, 1024, 1024, 2] + - [195, 67.0] + - - [1024, 2, 1, 82, 1024, 1024, 1024, 2] + - [140, 65.0] + - - [1024, 2, 1, 12, 1024, 1024, 1024, 2] + - [130, 17.0] + - - [64, 24, 6816, 24, 64, 64, 64, 24] + - [197, 3725.0] + - - [64, 26, 6272, 26, 64, 64, 64, 26] + - [125, 4032.0] + - - [1024, 2, 1, 128, 1024, 1024, 1024, 2] + - [155, 79.0] + - - [1024, 2, 1, 96, 1024, 1024, 1024, 2] + - [132, 73.0] + - - [768, 2, 1, 2048, 768, 768, 768, 2] + - [164, 162.0] + - - [1024, 81, 1, 1024, 1024, 1024, 1024, 81] + - [180, 3782.0] + - - [2, 1024, 1, 6, 2, 2, 2, 1024] + - [124, 9.0] + - - [1024, 2, 1, 20, 1024, 1024, 1024, 2] + - [138, 25.0] +- null +- null +- DeviceEfficiency +... diff --git a/library/src/blas3/Tensile/Logic/asm_full/navi22_Cijk_Ailk_Bjlk_SB_GB.yaml b/library/src/blas3/Tensile/Logic/asm_full/navi22_Cijk_Ailk_Bjlk_SB_GB.yaml new file mode 100644 index 000000000..a8e27f912 --- /dev/null +++ b/library/src/blas3/Tensile/Logic/asm_full/navi22_Cijk_Ailk_Bjlk_SB_GB.yaml @@ -0,0 +1,69105 @@ +--- +- {MinimumRequiredVersion: 4.28.0} +- navi22 +- gfx1031 +- [Device 73df] +- AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] +- - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 64 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 0 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_GB_MT128x64x8_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 4 + LSPB: 4 + LVCA: 32 + LVCB: 32 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 1 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_GB_MT128x128x8_SN_SU0_SUM0_TT8_16_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 2 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_GB_MT128x128x8_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 64 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 3 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_GB_MT128x64x16_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 4 + LSPB: 4 + LVCA: 32 + LVCB: 32 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 4 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_GB_MT128x128x16_SN_SU0_SUM0_TT8_16_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 5 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_GB_MT128x128x16_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 256 + LSPA: 8 + LSPB: 4 + LVCA: 32 + LVCB: 64 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 256 + MacroTileA: 128 + MacroTileB: 256 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 6 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_GB_MT128x256x16_SN_SU0_SUM0_TT8_16_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 7 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_GB_MT128x128x32_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 64 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 8 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_GB_MT128x64x8_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 4 + LSPB: 4 + LVCA: 32 + LVCB: 32 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 9 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_GB_MT128x128x8_SN_SU32_SUM3_TT8_16_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 10 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_GB_MT128x128x8_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 64 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 11 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_GB_MT128x64x16_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 4 + LSPB: 4 + LVCA: 32 + LVCB: 32 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 12 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_GB_MT128x128x16_SN_SU32_SUM3_TT8_16_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 13 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_GB_MT128x128x16_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 256 + LSPA: 8 + LSPB: 4 + LVCA: 32 + LVCB: 64 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 256 + MacroTileA: 128 + MacroTileB: 256 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 14 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_GB_MT128x256x16_SN_SU32_SUM3_TT8_16_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 64 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 15 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_GB_MT128x64x8_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 4 + LSPB: 4 + LVCA: 32 + LVCB: 32 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 16 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_GB_MT128x128x8_SN_SU0_SUM0_TT8_16_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 17 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_GB_MT128x128x8_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 64 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 18 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_GB_MT128x64x16_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 4 + LSPB: 4 + LVCA: 32 + LVCB: 32 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 19 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_GB_MT128x128x16_SN_SU0_SUM0_TT8_16_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 20 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_GB_MT128x128x16_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 256 + LSPA: 8 + LSPB: 4 + LVCA: 32 + LVCB: 64 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 256 + MacroTileA: 128 + MacroTileB: 256 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 21 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_GB_MT128x256x16_SN_SU0_SUM0_TT8_16_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 22 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_GB_MT128x128x32_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 64 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 23 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_GB_MT128x64x8_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 4 + LSPB: 4 + LVCA: 32 + LVCB: 32 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 24 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_GB_MT128x128x8_SN_SU32_SUM3_TT8_16_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 25 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_GB_MT128x128x8_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 64 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 26 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_GB_MT128x64x16_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 4 + LSPB: 4 + LVCA: 32 + LVCB: 32 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 27 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_GB_MT128x128x16_SN_SU32_SUM3_TT8_16_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 28 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_GB_MT128x128x16_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 256 + LSPA: 8 + LSPB: 4 + LVCA: 32 + LVCB: 64 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 256 + MacroTileA: 128 + MacroTileB: 256 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 29 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_GB_MT128x256x16_SN_SU32_SUM3_TT8_16_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 64 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 30 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_GB_MT128x64x8_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 4 + LSPB: 4 + LVCA: 32 + LVCB: 32 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 31 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_GB_MT128x128x8_SN_SU0_SUM0_TT8_16_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 32 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_GB_MT128x128x8_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 256 + LSPA: 8 + LSPB: 4 + LVCA: 32 + LVCB: 64 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 256 + MacroTileA: 128 + MacroTileB: 256 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 33 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_GB_MT128x256x8_SN_SU0_SUM0_TT8_16_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 64 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 34 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_GB_MT128x64x16_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 4 + LSPB: 4 + LVCA: 32 + LVCB: 32 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 35 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_GB_MT128x128x16_SN_SU0_SUM0_TT8_16_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 36 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_GB_MT128x128x16_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 256 + LSPA: 8 + LSPB: 4 + LVCA: 32 + LVCB: 64 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 256 + MacroTileA: 128 + MacroTileB: 256 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 37 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_GB_MT128x256x16_SN_SU0_SUM0_TT8_16_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 38 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_GB_MT128x128x32_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 64 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 39 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_GB_MT128x64x8_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 4 + LSPB: 4 + LVCA: 32 + LVCB: 32 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 40 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_GB_MT128x128x8_SN_SU32_SUM3_TT8_16_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 64 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 41 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_GB_MT128x64x16_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 4 + LSPB: 4 + LVCA: 32 + LVCB: 32 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 42 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_GB_MT128x128x16_SN_SU32_SUM3_TT8_16_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 43 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_GB_MT128x128x16_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 4 + LSPB: 4 + LVCA: 32 + LVCB: 32 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 44 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_GB_MT128x128x8_SN_SU0_SUM0_TT8_16_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 64 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 45 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_GB_MT128x64x16_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 4 + LSPB: 4 + LVCA: 32 + LVCB: 32 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 46 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_GB_MT128x128x16_SN_SU0_SUM0_TT8_16_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 4 + LSPB: 4 + LVCA: 32 + LVCB: 32 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 47 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_GB_MT128x128x16_SN_SU32_SUM3_TT8_16_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 64 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 48 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_GB_MT128x64x8_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 64 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 49 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_GB_MT128x64x16_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 4 + LSPB: 4 + LVCA: 32 + LVCB: 32 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 50 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_GB_MT128x128x16_SN_SU0_SUM0_TT8_16_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 51 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_GB_MT128x128x16_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 52 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_GB_MT128x128x32_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 64 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 53 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_GB_MT128x64x8_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 64 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 54 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_GB_MT128x64x16_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 55 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_GB_MT32x32x8_SN_SU0_SUM0_TT4_4_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 64 + LSPA: 8 + LSPB: 4 + LVCA: 8 + LVCB: 16 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 256 + LdsOffsetB_Blk: 1280 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 56 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_GB_MT32x64x8_SN_SU0_SUM0_TT4_8_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 57 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_GB_MT64x64x8_SN_SU0_SUM0_TT8_8_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 58 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_GB_MT64x64x8_SN_SU0_SUM0_TT4_8_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 64 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 59 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_GB_MT128x64x8_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 60 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_GB_MT128x128x8_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 61 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_GB_MT64x64x16_SN_SU0_SUM0_TT8_8_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 62 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_GB_MT64x64x16_SN_SU0_SUM0_TT4_8_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 64 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 63 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_GB_MT128x64x16_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 64 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_GB_MT64x64x16_SN_SU0_SUM0_TT4_4_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 65 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_GB_MT128x128x16_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 66 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_GB_MT64x64x32_SN_SU0_SUM0_TT8_8_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 67 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_GB_MT64x64x32_SN_SU0_SUM0_TT4_4_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 68 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_GB_MT32x32x8_SN_SU32_SUM3_TT4_4_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 64 + LSPA: 8 + LSPB: 4 + LVCA: 8 + LVCB: 16 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 256 + LdsOffsetB_Blk: 1280 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 69 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_GB_MT32x64x8_SN_SU32_SUM3_TT4_8_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 70 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_GB_MT64x64x8_SN_SU32_SUM3_TT8_8_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 71 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_GB_MT64x64x8_SN_SU32_SUM3_TT4_8_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 64 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 72 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_GB_MT128x64x8_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 73 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_GB_MT64x64x16_SN_SU32_SUM3_TT8_8_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 64 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 74 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_GB_MT128x64x16_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 128 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 + LVPA: 4 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 75 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_GB_MT64x128x16_SN_SU32_SUM3_TT4_8_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 76 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_GB_MT128x128x16_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 77 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_GB_MT32x32x8_SN_SU0_SUM0_TT4_4_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 64 + LSPA: 8 + LSPB: 4 + LVCA: 8 + LVCB: 16 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 256 + LdsOffsetB_Blk: 1280 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 78 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_GB_MT32x64x8_SN_SU0_SUM0_TT4_8_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 79 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_GB_MT64x64x8_SN_SU0_SUM0_TT8_8_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 80 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_GB_MT64x64x8_SN_SU0_SUM0_TT4_8_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 64 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 81 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_GB_MT128x64x8_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 82 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_GB_MT128x128x8_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 83 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_GB_MT64x64x16_SN_SU0_SUM0_TT8_8_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 8 + LSPB: 16 + LVCA: 16 + LVCB: 8 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 84 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_GB_MT64x32x16_SN_SU0_SUM0_TT4_4_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 85 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_GB_MT64x64x16_SN_SU0_SUM0_TT4_8_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 64 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 86 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_GB_MT128x64x16_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 87 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_GB_MT64x64x16_SN_SU0_SUM0_TT4_4_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 88 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_GB_MT128x128x16_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 89 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_GB_MT64x64x32_SN_SU0_SUM0_TT4_4_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 90 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_GB_MT128x128x32_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 64 + LSPA: 8 + LSPB: 4 + LVCA: 8 + LVCB: 16 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 256 + LdsOffsetB_Blk: 1280 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 91 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_GB_MT32x64x8_SN_SU32_SUM3_TT4_8_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 92 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_GB_MT64x64x8_SN_SU32_SUM3_TT8_8_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 93 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_GB_MT64x64x8_SN_SU32_SUM3_TT4_8_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 64 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 94 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_GB_MT128x64x8_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 64 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 95 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_GB_MT128x64x16_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 128 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 + LVPA: 4 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 96 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_GB_MT64x128x16_SN_SU32_SUM3_TT4_8_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 97 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_GB_MT128x128x16_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 98 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_GB_MT128x128x32_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 99 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_GB_MT32x32x8_SN_SU0_SUM0_TT4_4_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 64 + LSPA: 8 + LSPB: 4 + LVCA: 8 + LVCB: 16 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 256 + LdsOffsetB_Blk: 1280 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 100 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_GB_MT32x64x8_SN_SU0_SUM0_TT4_8_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 101 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_GB_MT64x64x8_SN_SU0_SUM0_TT8_8_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 102 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_GB_MT64x64x8_SN_SU0_SUM0_TT4_8_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 64 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 103 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_GB_MT128x64x8_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 104 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_GB_MT128x128x8_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 105 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_GB_MT64x64x16_SN_SU0_SUM0_TT8_8_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 64 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 106 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_GB_MT128x64x16_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 107 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_GB_MT64x64x16_SN_SU0_SUM0_TT4_4_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 108 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_GB_MT128x128x16_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 109 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_GB_MT64x64x32_SN_SU0_SUM0_TT8_8_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 110 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_GB_MT64x64x32_SN_SU0_SUM0_TT4_8_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 111 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_GB_MT64x64x32_SN_SU0_SUM0_TT4_4_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 112 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_GB_MT32x32x8_SN_SU32_SUM3_TT4_4_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 113 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_GB_MT64x64x8_SN_SU32_SUM3_TT8_8_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 64 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 114 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_GB_MT128x64x8_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 115 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_GB_MT128x128x8_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 116 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_GB_MT128x128x16_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 8 + LSPB: 16 + LVCA: 16 + LVCB: 8 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 117 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_GB_MT64x32x32_SN_SU32_SUM3_TT4_4_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 118 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_GB_MT64x64x32_SN_SU32_SUM3_TT4_4_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 64 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 119 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_GB_MT128x64x8_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 120 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_GB_MT64x64x16_SN_SU0_SUM0_TT4_8_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 64 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 121 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_GB_MT128x64x8_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 122 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_GB_MT64x64x16_SN_SU32_SUM3_TT8_8_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 64 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 123 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_GB_MT128x64x8_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 64 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 124 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_GB_MT128x64x16_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 64 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 125 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_GB_MT128x64x8_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 64 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 126 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_GB_MT128x64x16_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 127 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_GB_MT64x64x16_SN_SU32_SUM3_TT8_8_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 128 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_GB_MT128x128x16_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 512 + LdsNumElementsAlignedA: 128 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 256 + LdsOffsetB: 128 + LdsOffsetB_Blk: 384 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 129 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_GB_MT16x16x8_SN_SU0_SUM0_TT2_2_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 2 + LSPB: 2 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 130 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_GB_MT32x32x8_SN_SU0_SUM0_TT4_4_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 896 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 131 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_GB_MT32x16x8_SN_SU0_SUM0_TT2_2_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 2 + LSPB: 4 + LVCA: 64 + LVCB: 32 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 132 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_GB_MT64x32x8_SN_SU0_SUM0_TT4_4_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 133 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_GB_MT32x32x8_SN_SU0_SUM0_TT2_2_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 134 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_GB_MT64x64x8_SN_SU0_SUM0_TT4_4_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 135 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_GB_MT16x16x16_SN_SU0_SUM0_TT2_2_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 2 + LSPB: 2 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 136 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_GB_MT32x32x16_SN_SU0_SUM0_TT4_4_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 137 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_GB_MT32x16x16_SN_SU0_SUM0_TT2_2_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 2 + LSPB: 4 + LVCA: 64 + LVCB: 32 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 138 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_GB_MT64x32x16_SN_SU0_SUM0_TT4_4_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 139 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_GB_MT32x32x16_SN_SU0_SUM0_TT2_2_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 140 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_GB_MT64x64x16_SN_SU0_SUM0_TT4_4_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 141 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_GB_MT16x16x32_SN_SU0_SUM0_TT2_2_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 142 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_GB_MT32x16x32_SN_SU0_SUM0_TT2_2_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 2 + LSPB: 4 + LVCA: 64 + LVCB: 32 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 16 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 8 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 143 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_GB_MT64x32x32_SN_SU0_SUM0_TT4_4_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 144 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_GB_MT32x32x32_SN_SU0_SUM0_TT2_2_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 145 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_GB_MT64x64x32_SN_SU0_SUM0_TT4_4_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 512 + LdsNumElementsAlignedA: 128 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 256 + LdsOffsetB: 128 + LdsOffsetB_Blk: 384 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 146 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_GB_MT16x16x8_SN_SU32_SUM3_TT2_2_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 2 + LSPB: 2 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 147 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_GB_MT32x32x8_SN_SU32_SUM3_TT4_4_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 896 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 148 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_GB_MT32x16x8_SN_SU32_SUM3_TT2_2_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 2 + LSPB: 4 + LVCA: 64 + LVCB: 32 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 149 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_GB_MT64x32x8_SN_SU32_SUM3_TT4_4_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 150 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_GB_MT64x64x8_SN_SU32_SUM3_TT4_4_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 151 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_GB_MT16x16x16_SN_SU32_SUM3_TT2_2_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 2 + LSPB: 2 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 152 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_GB_MT32x32x16_SN_SU32_SUM3_TT4_4_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 153 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_GB_MT32x16x16_SN_SU32_SUM3_TT2_2_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 2 + LSPB: 4 + LVCA: 64 + LVCB: 32 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 154 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_GB_MT64x32x16_SN_SU32_SUM3_TT4_4_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 155 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_GB_MT32x32x16_SN_SU32_SUM3_TT2_2_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 2 + LSPB: 2 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 16 + NumLoadsB: 16 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 16 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 156 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_GB_MT32x32x32_SN_SU32_SUM3_TT4_4_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 157 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_GB_MT32x16x32_SN_SU32_SUM3_TT2_2_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 2 + LSPB: 4 + LVCA: 64 + LVCB: 32 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 16 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 8 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 158 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_GB_MT64x32x32_SN_SU32_SUM3_TT4_4_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 159 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_GB_MT32x32x32_SN_SU32_SUM3_TT2_2_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 160 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_GB_MT64x64x32_SN_SU32_SUM3_TT4_4_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 512 + LdsNumElementsAlignedA: 128 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 256 + LdsOffsetB: 128 + LdsOffsetB_Blk: 384 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 161 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_GB_MT16x16x8_SN_SU0_SUM0_TT2_2_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 2 + LSPB: 2 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 162 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_GB_MT32x32x8_SN_SU0_SUM0_TT4_4_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 896 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 163 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_GB_MT32x16x8_SN_SU0_SUM0_TT2_2_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 2 + LSPB: 4 + LVCA: 64 + LVCB: 32 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 164 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_GB_MT64x32x8_SN_SU0_SUM0_TT4_4_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 165 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_GB_MT64x64x8_SN_SU0_SUM0_TT4_4_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 166 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_GB_MT16x16x16_SN_SU0_SUM0_TT2_2_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 2 + LSPB: 2 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 167 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_GB_MT32x32x16_SN_SU0_SUM0_TT4_4_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 168 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_GB_MT32x16x16_SN_SU0_SUM0_TT2_2_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 2 + LSPB: 4 + LVCA: 64 + LVCB: 32 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 169 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_GB_MT64x32x16_SN_SU0_SUM0_TT4_4_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 170 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_GB_MT32x32x16_SN_SU0_SUM0_TT2_2_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 171 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_GB_MT64x64x16_SN_SU0_SUM0_TT4_4_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 172 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_GB_MT16x16x32_SN_SU0_SUM0_TT2_2_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 2 + LSPB: 2 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 16 + NumLoadsB: 16 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 16 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 173 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_GB_MT32x32x32_SN_SU0_SUM0_TT4_4_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 174 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_GB_MT32x16x32_SN_SU0_SUM0_TT2_2_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 2 + LSPB: 4 + LVCA: 64 + LVCB: 32 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 16 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 8 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 175 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_GB_MT64x32x32_SN_SU0_SUM0_TT4_4_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 176 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_GB_MT32x32x32_SN_SU0_SUM0_TT2_2_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 177 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_GB_MT64x64x32_SN_SU0_SUM0_TT4_4_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 512 + LdsNumElementsAlignedA: 128 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 256 + LdsOffsetB: 128 + LdsOffsetB_Blk: 384 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 178 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_GB_MT16x16x8_SN_SU32_SUM3_TT2_2_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 2 + LSPB: 2 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 179 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_GB_MT32x32x8_SN_SU32_SUM3_TT4_4_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 896 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 180 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_GB_MT32x16x8_SN_SU32_SUM3_TT2_2_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 2 + LSPB: 4 + LVCA: 64 + LVCB: 32 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 181 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_GB_MT64x32x8_SN_SU32_SUM3_TT4_4_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 182 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_GB_MT64x64x8_SN_SU32_SUM3_TT4_4_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 183 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_GB_MT16x16x16_SN_SU32_SUM3_TT2_2_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 2 + LSPB: 2 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 184 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_GB_MT32x32x16_SN_SU32_SUM3_TT4_4_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 185 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_GB_MT32x16x16_SN_SU32_SUM3_TT2_2_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 2 + LSPB: 4 + LVCA: 64 + LVCB: 32 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 186 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_GB_MT64x32x16_SN_SU32_SUM3_TT4_4_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 187 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_GB_MT64x64x16_SN_SU32_SUM3_TT4_4_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 188 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_GB_MT16x16x32_SN_SU32_SUM3_TT2_2_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 2 + LSPB: 2 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 16 + NumLoadsB: 16 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 16 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 189 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_GB_MT32x32x32_SN_SU32_SUM3_TT4_4_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 190 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_GB_MT32x16x32_SN_SU32_SUM3_TT2_2_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 2 + LSPB: 4 + LVCA: 64 + LVCB: 32 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 16 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 8 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 191 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_GB_MT64x32x32_SN_SU32_SUM3_TT4_4_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 192 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_GB_MT32x32x32_SN_SU32_SUM3_TT2_2_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 193 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_GB_MT64x64x32_SN_SU32_SUM3_TT4_4_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 2 + LSPB: 2 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 194 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_GB_MT32x32x8_SN_SU0_SUM0_TT4_4_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 896 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 195 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_GB_MT32x16x8_SN_SU0_SUM0_TT2_2_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 2 + LSPB: 4 + LVCA: 64 + LVCB: 32 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 196 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_GB_MT64x32x8_SN_SU0_SUM0_TT4_4_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 197 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_GB_MT32x32x8_SN_SU0_SUM0_TT2_2_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 198 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_GB_MT64x64x8_SN_SU0_SUM0_TT4_4_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 199 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_GB_MT16x16x16_SN_SU0_SUM0_TT2_2_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 2 + LSPB: 2 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 200 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_GB_MT32x32x16_SN_SU0_SUM0_TT4_4_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 201 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_GB_MT32x16x16_SN_SU0_SUM0_TT2_2_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 2 + LSPB: 4 + LVCA: 64 + LVCB: 32 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 202 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_GB_MT64x32x16_SN_SU0_SUM0_TT4_4_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 203 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_GB_MT32x32x16_SN_SU0_SUM0_TT2_2_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 204 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_GB_MT64x64x16_SN_SU0_SUM0_TT4_4_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 205 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_GB_MT16x16x32_SN_SU0_SUM0_TT2_2_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 2 + LSPB: 4 + LVCA: 64 + LVCB: 32 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 16 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 8 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 206 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_GB_MT64x32x32_SN_SU0_SUM0_TT4_4_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 207 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_GB_MT32x32x32_SN_SU0_SUM0_TT2_2_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 208 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_GB_MT64x64x32_SN_SU0_SUM0_TT4_4_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 512 + LdsNumElementsAlignedA: 128 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 256 + LdsOffsetB: 128 + LdsOffsetB_Blk: 384 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 209 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_GB_MT16x16x8_SN_SU32_SUM3_TT2_2_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 2 + LSPB: 2 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 210 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_GB_MT32x32x8_SN_SU32_SUM3_TT4_4_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 896 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 211 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_GB_MT32x16x8_SN_SU32_SUM3_TT2_2_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 2 + LSPB: 4 + LVCA: 64 + LVCB: 32 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 212 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_GB_MT64x32x8_SN_SU32_SUM3_TT4_4_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 213 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_GB_MT64x64x8_SN_SU32_SUM3_TT4_4_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 2 + LSPB: 4 + LVCA: 64 + LVCB: 32 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 214 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_GB_MT64x32x16_SN_SU32_SUM3_TT4_4_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 215 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_GB_MT16x16x32_SN_SU32_SUM3_TT2_2_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 2 + LSPB: 2 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 16 + NumLoadsB: 16 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 16 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 216 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_GB_MT32x32x32_SN_SU32_SUM3_TT4_4_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 2 + LSPB: 4 + LVCA: 64 + LVCB: 32 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 16 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 8 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 217 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_GB_MT64x32x32_SN_SU32_SUM3_TT4_4_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 218 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_GB_MT64x64x32_SN_SU32_SUM3_TT4_4_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 219 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_GB_MT16x16x16_SN_SU0_SUM0_TT2_2_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 2 + LSPB: 2 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 220 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_GB_MT32x32x16_SN_SU0_SUM0_TT4_4_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 221 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_GB_MT16x16x32_SN_SU0_SUM0_TT2_2_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 222 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_GB_MT32x16x32_SN_SU0_SUM0_TT2_2_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 2 + LSPB: 2 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 223 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_GB_MT32x32x8_SN_SU32_SUM3_TT4_4_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 224 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_GB_MT32x16x32_SN_SU32_SUM3_TT2_2_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 225 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_GB_MT16x16x16_SN_SU0_SUM0_TT2_2_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 226 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_GB_MT16x16x32_SN_SU0_SUM0_TT2_2_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 227 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_GB_MT32x16x32_SN_SU0_SUM0_TT2_2_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 512 + LdsNumElementsAlignedA: 128 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 256 + LdsOffsetB: 128 + LdsOffsetB_Blk: 384 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 228 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_GB_MT16x16x8_SN_SU32_SUM3_TT2_2_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 229 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_GB_MT16x16x32_SN_SU32_SUM3_TT2_2_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 2 + LSPB: 4 + LVCA: 64 + LVCB: 32 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 16 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 8 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 230 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_GB_MT64x32x32_SN_SU32_SUM3_TT4_4_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 231 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_GB_MT16x16x32_SN_SU0_SUM0_TT2_2_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 232 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_GB_MT32x16x32_SN_SU0_SUM0_TT2_2_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 2 + LSPB: 4 + LVCA: 64 + LVCB: 32 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 16 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 8 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 233 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_GB_MT64x32x32_SN_SU0_SUM0_TT4_4_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 2 + LSPB: 2 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 234 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_GB_MT32x32x8_SN_SU32_SUM3_TT4_4_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 235 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_GB_MT16x16x32_SN_SU32_SUM3_TT2_2_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 8 + LSPA: 2 + LSPB: 8 + LVCA: 32 + LVCB: 8 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 832 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 64 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 236 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_GB_MT32x8x8_SN_SU0_SUM0_TT2_2_WG16_4_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 2 + LSPB: 8 + LVCA: 64 + LVCB: 16 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1664 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 237 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_GB_MT64x16x8_SN_SU0_SUM0_TT4_2_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 4 + LSPA: 1 + LSPB: 16 + LVCA: 64 + LVCB: 4 + LVPA: 1 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3136 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 64 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 4 + MacroTileA: 64 + MacroTileB: 4 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 16 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 238 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_GB_MT64x4x16_SN_SU0_SUM0_TT4_1_WG16_4_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 1] + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 2 + LSPB: 16 + LVCA: 64 + LVCB: 8 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3200 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 8 + MacroTileA: 64 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 239 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_GB_MT64x8x16_SN_SU0_SUM0_TT4_1_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 1] + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 4 + LSPA: 1 + LSPB: 16 + LVCA: 64 + LVCB: 4 + LVPA: 1 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 6272 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 4 + MacroTileA: 64 + MacroTileB: 4 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 32 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 32 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 240 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_GB_MT64x4x32_SN_SU0_SUM0_TT4_1_WG16_4_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 1] + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 2 + LSPB: 16 + LVCA: 64 + LVCB: 8 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 6400 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 8 + MacroTileA: 64 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 16 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 241 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_GB_MT64x8x32_SN_SU0_SUM0_TT4_1_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 1] + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 2 + LSPB: 16 + LVCA: 64 + LVCB: 8 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 6400 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 8 + MacroTileA: 64 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 16 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 242 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_GB_MT64x8x32_SN_SU0_SUM0_TT2_2_WG32_4_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 32 + SubGroup1: 4 + SubGroupA: 32 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 4 + LSPA: 1 + LSPB: 16 + LVCA: 64 + LVCB: 4 + LVPA: 1 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3136 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 64 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 4 + MacroTileA: 64 + MacroTileB: 4 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 16 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 243 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_GB_MT64x4x16_SN_SU32_SUM3_TT4_1_WG16_4_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 1] + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 2 + LSPB: 16 + LVCA: 64 + LVCB: 8 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3200 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 8 + MacroTileA: 64 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 244 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_GB_MT64x8x16_SN_SU32_SUM3_TT4_1_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 1] + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 8 + LSPA: 2 + LSPB: 8 + LVCA: 32 + LVCB: 8 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 832 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 64 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 245 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_GB_MT32x8x8_SN_SU0_SUM0_TT2_2_WG16_4_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 4 + LSPA: 1 + LSPB: 16 + LVCA: 64 + LVCB: 4 + LVPA: 1 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3136 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 64 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 4 + MacroTileA: 64 + MacroTileB: 4 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 16 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 246 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_GB_MT64x4x16_SN_SU0_SUM0_TT4_1_WG16_4_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 1] + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 4 + LSPA: 1 + LSPB: 16 + LVCA: 64 + LVCB: 4 + LVPA: 1 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 6272 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 4 + MacroTileA: 64 + MacroTileB: 4 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 32 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 32 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 247 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_GB_MT64x4x32_SN_SU0_SUM0_TT4_1_WG16_4_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 1] + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 2 + LSPB: 16 + LVCA: 64 + LVCB: 8 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 6400 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 8 + MacroTileA: 64 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 16 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 248 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_GB_MT64x8x32_SN_SU0_SUM0_TT4_1_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 1] + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 8 + LSPA: 2 + LSPB: 8 + LVCA: 32 + LVCB: 8 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 832 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 64 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 249 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_GB_MT32x8x8_SN_SU0_SUM0_TT2_2_WG16_4_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 4 + LSPA: 1 + LSPB: 16 + LVCA: 64 + LVCB: 4 + LVPA: 1 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 6272 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 4 + MacroTileA: 64 + MacroTileB: 4 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 32 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 32 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 250 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_GB_MT64x4x32_SN_SU0_SUM0_TT4_1_WG16_4_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 1] + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 2 + LSPB: 16 + LVCA: 64 + LVCB: 8 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 6400 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 8 + MacroTileA: 64 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 16 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 251 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_GB_MT64x8x32_SN_SU0_SUM0_TT4_1_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 1] + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 2 + LSPB: 16 + LVCA: 64 + LVCB: 8 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 6400 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 8 + MacroTileA: 64 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 16 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 252 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_GB_MT64x8x32_SN_SU0_SUM0_TT2_2_WG32_4_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 32 + SubGroup1: 4 + SubGroupA: 32 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 4 + LSPA: 1 + LSPB: 32 + LVCA: 128 + LVCB: 4 + LVPA: 1 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 12416 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 4 + MacroTileA: 128 + MacroTileB: 4 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 32 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 32 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 253 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_GB_MT128x4x32_SN_SU0_SUM0_TT4_1_WG32_4_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 32 + SubGroup1: 4 + SubGroupA: 32 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 1] + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 64 + LSPA: 8 + LSPB: 4 + LVCA: 32 + LVCB: 64 + LVPA: 8 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 256 + LdsOffsetB_Blk: 1280 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 254 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_GB_MT32x64x8_SN_SU0_SUM0_TT2_4_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 32 + LSPA: 8 + LSPB: 2 + LVCA: 8 + LVCB: 32 + LVPA: 8 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1664 + LdsNumElementsAlignedA: 128 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 128 + LdsOffsetB_Blk: 1152 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 8 + MacroTile1: 32 + MacroTileA: 8 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 8 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 255 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_GB_MT8x32x16_SN_SU0_SUM0_TT1_4_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [1, 4] + ThreadTile0: 1 + ThreadTile1: 4 + ThreadTileA: 1 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 32 + LSPA: 8 + LSPB: 4 + LVCA: 16 + LVCB: 32 + LVPA: 8 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 256 + LdsOffsetB_Blk: 1280 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 32 + MacroTileA: 16 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 256 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_GB_MT16x32x16_SN_SU0_SUM0_TT1_4_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [1, 4] + ThreadTile0: 1 + ThreadTile1: 4 + ThreadTileA: 1 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 32 + LSPA: 8 + LSPB: 2 + LVCA: 8 + LVCB: 32 + LVPA: 8 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3328 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 256 + LdsOffsetB_Blk: 2304 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 8 + MacroTile1: 32 + MacroTileA: 8 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 16 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 16 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 257 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_GB_MT8x32x32_SN_SU0_SUM0_TT1_4_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [1, 4] + ThreadTile0: 1 + ThreadTile1: 4 + ThreadTileA: 1 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 32 + LSPA: 8 + LSPB: 4 + LVCA: 16 + LVCB: 32 + LVPA: 8 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 256 + LdsOffsetB_Blk: 1280 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 32 + MacroTileA: 16 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 258 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_GB_MT16x32x16_SN_SU32_SUM3_TT1_4_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [1, 4] + ThreadTile0: 1 + ThreadTile1: 4 + ThreadTileA: 1 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 32 + LSPA: 8 + LSPB: 2 + LVCA: 8 + LVCB: 32 + LVPA: 8 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3328 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 256 + LdsOffsetB_Blk: 2304 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 8 + MacroTile1: 32 + MacroTileA: 8 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 16 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 16 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 259 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_GB_MT8x32x32_SN_SU32_SUM3_TT1_4_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [1, 4] + ThreadTile0: 1 + ThreadTile1: 4 + ThreadTileA: 1 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 32 + LSPA: 8 + LSPB: 2 + LVCA: 8 + LVCB: 32 + LVPA: 8 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 832 + LdsNumElementsAlignedA: 64 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 64 + LdsOffsetB_Blk: 576 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 8 + MacroTile1: 32 + MacroTileA: 8 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 260 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_GB_MT8x32x8_SN_SU0_SUM0_TT1_4_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [1, 4] + ThreadTile0: 1 + ThreadTile1: 4 + ThreadTileA: 1 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 32 + LSPA: 8 + LSPB: 2 + LVCA: 8 + LVCB: 32 + LVPA: 8 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1664 + LdsNumElementsAlignedA: 128 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 128 + LdsOffsetB_Blk: 1152 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 8 + MacroTile1: 32 + MacroTileA: 8 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 8 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 261 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_GB_MT8x32x16_SN_SU0_SUM0_TT1_4_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [1, 4] + ThreadTile0: 1 + ThreadTile1: 4 + ThreadTileA: 1 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 64 + LSPA: 16 + LSPB: 4 + LVCA: 16 + LVCB: 64 + LVPA: 16 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3328 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 256 + LdsOffsetB_Blk: 2304 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 64 + MacroTileA: 16 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 262 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_GB_MT16x64x16_SN_SU0_SUM0_TT1_4_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [1, 4] + ThreadTile0: 1 + ThreadTile1: 4 + ThreadTileA: 1 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 32 + LSPA: 8 + LSPB: 2 + LVCA: 8 + LVCB: 32 + LVPA: 8 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3328 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 256 + LdsOffsetB_Blk: 2304 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 8 + MacroTile1: 32 + MacroTileA: 8 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 16 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 16 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 263 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_GB_MT8x32x32_SN_SU0_SUM0_TT1_4_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [1, 4] + ThreadTile0: 1 + ThreadTile1: 4 + ThreadTileA: 1 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 32 + LSPA: 8 + LSPB: 2 + LVCA: 8 + LVCB: 32 + LVPA: 8 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 832 + LdsNumElementsAlignedA: 64 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 64 + LdsOffsetB_Blk: 576 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 8 + MacroTile1: 32 + MacroTileA: 8 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 264 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_GB_MT8x32x8_SN_SU32_SUM3_TT1_4_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [1, 4] + ThreadTile0: 1 + ThreadTile1: 4 + ThreadTileA: 1 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 32 + LSPA: 8 + LSPB: 2 + LVCA: 8 + LVCB: 32 + LVPA: 8 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1664 + LdsNumElementsAlignedA: 128 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 128 + LdsOffsetB_Blk: 1152 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 8 + MacroTile1: 32 + MacroTileA: 8 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 8 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 265 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_GB_MT8x32x16_SN_SU32_SUM3_TT1_4_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [1, 4] + ThreadTile0: 1 + ThreadTile1: 4 + ThreadTileA: 1 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 32 + LSPA: 8 + LSPB: 2 + LVCA: 8 + LVCB: 32 + LVPA: 8 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3328 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 256 + LdsOffsetB_Blk: 2304 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 8 + MacroTile1: 32 + MacroTileA: 8 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 16 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 16 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 266 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_GB_MT8x32x32_SN_SU32_SUM3_TT1_4_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [1, 4] + ThreadTile0: 1 + ThreadTile1: 4 + ThreadTileA: 1 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 32 + LSPA: 8 + LSPB: 2 + LVCA: 8 + LVCB: 32 + LVPA: 8 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 832 + LdsNumElementsAlignedA: 64 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 64 + LdsOffsetB_Blk: 576 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 8 + MacroTile1: 32 + MacroTileA: 8 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 267 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_GB_MT8x32x8_SN_SU0_SUM0_TT1_4_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [1, 4] + ThreadTile0: 1 + ThreadTile1: 4 + ThreadTileA: 1 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 32 + LSPA: 8 + LSPB: 2 + LVCA: 8 + LVCB: 32 + LVPA: 8 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3328 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 256 + LdsOffsetB_Blk: 2304 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 8 + MacroTile1: 32 + MacroTileA: 8 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 16 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 16 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 268 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_GB_MT8x32x32_SN_SU0_SUM0_TT1_4_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [1, 4] + ThreadTile0: 1 + ThreadTile1: 4 + ThreadTileA: 1 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 32 + LSPA: 8 + LSPB: 2 + LVCA: 8 + LVCB: 32 + LVPA: 8 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 832 + LdsNumElementsAlignedA: 64 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 64 + LdsOffsetB_Blk: 576 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 8 + MacroTile1: 32 + MacroTileA: 8 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 269 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_GB_MT8x32x8_SN_SU32_SUM3_TT1_4_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [1, 4] + ThreadTile0: 1 + ThreadTile1: 4 + ThreadTileA: 1 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 32 + LSPA: 8 + LSPB: 2 + LVCA: 8 + LVCB: 32 + LVPA: 8 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3328 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 256 + LdsOffsetB_Blk: 2304 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 8 + MacroTile1: 32 + MacroTileA: 8 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 16 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 16 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 270 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_GB_MT8x32x32_SN_SU32_SUM3_TT1_4_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [1, 4] + ThreadTile0: 1 + ThreadTile1: 4 + ThreadTileA: 1 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 +- [2, 3, 0, 1] +- - - [2048, 2048, 1, 512, 2048, 2048, 2048, 2048] + - [1, 11980.0] + - - [1600, 1024, 1, 512, 1600, 1600, 1600, 1024] + - [0, 10488.0] + - - [4096, 1024, 1, 4096, 4096, 4096, 4096, 1024] + - [19, 12373.0] + - - [4096, 1024, 1, 2048, 4096, 4096, 4096, 1024] + - [16, 12322.0] + - - [3072, 768, 1, 4096, 3072, 3072, 3072, 768] + - [3, 11543.0] + - - [3072, 1024, 1, 2048, 3072, 3072, 3072, 1024] + - [31, 11945.0] + - - [3072, 1024, 1, 3072, 3072, 3072, 3072, 1024] + - [16, 11992.0] + - - [3072, 1024, 1, 512, 3072, 3072, 3072, 1024] + - [20, 11659.0] + - - [2944, 4288, 1, 1280, 2944, 2944, 2944, 4288] + - [19, 12263.0] + - - [2368, 5888, 1, 256, 2368, 2368, 2368, 5888] + - [1, 11927.0] + - - [5888, 1024, 1, 1280, 5888, 5888, 5888, 1024] + - [19, 12209.0] + - - [5888, 1856, 1, 3328, 5888, 5888, 5888, 1856] + - [34, 12202.0] + - - [5056, 704, 1, 256, 5056, 5056, 5056, 704] + - [0, 11419.0] + - - [5888, 2944, 1, 3328, 5888, 5888, 5888, 2944] + - [35, 12775.0] + - - [1856, 4288, 1, 256, 1856, 1856, 1856, 4288] + - [0, 11391.0] + - - [1024, 5056, 1, 128, 1024, 1024, 1024, 5056] + - [0, 11517.0] + - - [5056, 5056, 1, 3328, 5056, 5056, 5056, 5056] + - [35, 12502.0] + - - [1408, 5888, 1, 1280, 1408, 1408, 1408, 5888] + - [31, 12329.0] + - - [1024, 3584, 1, 3328, 1024, 1024, 1024, 3584] + - [3, 11819.0] + - - [5888, 1408, 1, 1280, 5888, 5888, 5888, 1408] + - [1, 12318.0] + - - [1024, 2368, 1, 256, 1024, 1024, 1024, 2368] + - [30, 11386.0] + - - [1408, 1856, 1, 1280, 1408, 1408, 1408, 1856] + - [34, 11877.0] + - - [5056, 5056, 1, 1280, 5056, 5056, 5056, 5056] + - [19, 12462.0] + - - [448, 5056, 1, 256, 448, 448, 448, 5056] + - [15, 9818.0] + - - [1856, 1408, 1, 128, 1856, 1856, 1856, 1408] + - [0, 10401.0] + - - [6784, 256, 1, 3328, 6784, 6784, 6784, 256] + - [18, 11453.0] + - - [6784, 4288, 1, 3328, 6784, 6784, 6784, 4288] + - [35, 12528.0] + - - [4288, 448, 1, 256, 4288, 4288, 4288, 448] + - [0, 10761.0] + - - [1856, 2368, 1, 3328, 1856, 1856, 1856, 2368] + - [34, 11633.0] + - - [4288, 2944, 1, 1280, 4288, 4288, 4288, 2944] + - [16, 12258.0] + - - [704, 5056, 1, 1280, 704, 704, 704, 5056] + - [1, 11200.0] + - - [2368, 704, 1, 3328, 2368, 2368, 2368, 704] + - [23, 10976.0] + - - [256, 5888, 1, 256, 256, 256, 256, 5888] + - [2, 10373.0] + - - [1856, 4288, 1, 3328, 1856, 1856, 1856, 4288] + - [35, 11877.0] + - - [5888, 1024, 1, 256, 5888, 5888, 5888, 1024] + - [15, 11761.0] + - - [448, 5056, 1, 3328, 448, 448, 448, 5056] + - [6, 10643.0] + - - [1408, 2944, 1, 256, 1408, 1408, 1408, 2944] + - [20, 11718.0] + - - [6784, 5056, 1, 3328, 6784, 6784, 6784, 5056] + - [19, 12676.0] + - - [5056, 5056, 1, 256, 5056, 5056, 5056, 5056] + - [1, 12167.0] + - - [1408, 6784, 1, 128, 1408, 1408, 1408, 6784] + - [0, 11700.0] + - - [704, 5056, 1, 128, 704, 704, 704, 5056] + - [30, 10229.0] + - - [2368, 2944, 1, 1280, 2368, 2368, 2368, 2944] + - [1, 12202.0] + - - [6784, 6784, 1, 1280, 6784, 6784, 6784, 6784] + - [19, 12782.0] + - - [1408, 4288, 1, 1280, 1408, 1408, 1408, 4288] + - [16, 12142.0] + - - [3584, 4288, 1, 1280, 3584, 3584, 3584, 4288] + - [16, 12444.0] + - - [2368, 704, 1, 1280, 2368, 2368, 2368, 704] + - [0, 10825.0] + - - [5056, 4288, 1, 3328, 5056, 5056, 5056, 4288] + - [35, 12466.0] + - - [3584, 2368, 1, 3328, 3584, 3584, 3584, 2368] + - [19, 12196.0] + - - [6784, 448, 1, 1280, 6784, 6784, 6784, 448] + - [30, 11708.0] + - - [1408, 2944, 1, 128, 1408, 1408, 1408, 2944] + - [36, 11371.0] + - - [4288, 2944, 1, 256, 4288, 4288, 4288, 2944] + - [9, 11948.0] + - - [5888, 704, 1, 1280, 5888, 5888, 5888, 704] + - [15, 11771.0] + - - [448, 5888, 1, 128, 448, 448, 448, 5888] + - [23, 9416.0] + - - [5056, 2368, 1, 1280, 5056, 5056, 5056, 2368] + - [31, 12208.0] + - - [448, 3584, 1, 1280, 448, 448, 448, 3584] + - [20, 9928.0] + - - [6784, 5888, 1, 256, 6784, 6784, 6784, 5888] + - [19, 12595.0] + - - [5888, 2944, 1, 128, 5888, 5888, 5888, 2944] + - [20, 12026.0] + - - [1024, 1408, 1, 256, 1024, 1024, 1024, 1408] + - [0, 10875.0] + - - [2368, 2368, 1, 3328, 2368, 2368, 2368, 2368] + - [18, 11604.0] + - - [1856, 6784, 1, 128, 1856, 1856, 1856, 6784] + - [5, 11421.0] + - - [5056, 704, 1, 3328, 5056, 5056, 5056, 704] + - [34, 11907.0] + - - [1408, 1856, 1, 256, 1408, 1408, 1408, 1856] + - [0, 11475.0] + - - [2368, 5056, 1, 256, 2368, 2368, 2368, 5056] + - [16, 11857.0] + - - [5888, 1856, 1, 256, 5888, 5888, 5888, 1856] + - [30, 11972.0] + - - [704, 5888, 1, 256, 704, 704, 704, 5888] + - [20, 10764.0] + - - [2944, 6784, 1, 3328, 2944, 2944, 2944, 6784] + - [19, 12803.0] + - - [3584, 704, 1, 3328, 3584, 3584, 3584, 704] + - [34, 11584.0] + - - [448, 4288, 1, 256, 448, 448, 448, 4288] + - [20, 9498.0] + - - [704, 2368, 1, 1280, 704, 704, 704, 2368] + - [36, 10238.0] + - - [1856, 2368, 1, 1280, 1856, 1856, 1856, 2368] + - [18, 11587.0] + - - [1856, 4288, 1, 1280, 1856, 1856, 1856, 4288] + - [19, 11805.0] + - - [704, 2944, 1, 128, 704, 704, 704, 2944] + - [0, 9536.0] + - - [1408, 1024, 1, 1280, 1408, 1408, 1408, 1024] + - [30, 11381.0] + - - [704, 6784, 1, 256, 704, 704, 704, 6784] + - [20, 10938.0] + - - [6784, 704, 1, 256, 6784, 6784, 6784, 704] + - [30, 11554.0] + - - [5056, 1408, 1, 128, 5056, 5056, 5056, 1408] + - [1, 11425.0] + - - [3584, 4288, 1, 3328, 3584, 3584, 3584, 4288] + - [19, 12499.0] + - - [5888, 1856, 1, 1280, 5888, 5888, 5888, 1856] + - [18, 12165.0] + - - [5056, 1024, 1, 3328, 5056, 5056, 5056, 1024] + - [1, 12425.0] + - - [1024, 4288, 1, 128, 1024, 1024, 1024, 4288] + - [23, 11327.0] + - - [2368, 3584, 1, 1280, 2368, 2368, 2368, 3584] + - [1, 12155.0] + - - [2368, 6784, 1, 1280, 2368, 2368, 2368, 6784] + - [19, 12265.0] + - - [2944, 3584, 1, 3328, 2944, 2944, 2944, 3584] + - [19, 12462.0] + - - [6784, 2944, 1, 256, 6784, 6784, 6784, 2944] + - [1, 12464.0] + - - [4288, 2368, 1, 3328, 4288, 4288, 4288, 2368] + - [18, 12047.0] + - - [1856, 2368, 1, 256, 1856, 1856, 1856, 2368] + - [0, 11233.0] + - - [3584, 6784, 1, 3328, 3584, 3584, 3584, 6784] + - [4, 12694.0] + - - [1024, 5888, 1, 3328, 1024, 1024, 1024, 5888] + - [19, 12275.0] + - - [5056, 4288, 1, 1280, 5056, 5056, 5056, 4288] + - [35, 12426.0] + - - [2944, 5888, 1, 128, 2944, 2944, 2944, 5888] + - [36, 12065.0] + - - [704, 5888, 1, 1280, 704, 704, 704, 5888] + - [1, 11278.0] + - - [2368, 3584, 1, 128, 2368, 2368, 2368, 3584] + - [5, 11380.0] + - - [6784, 5888, 1, 3328, 6784, 6784, 6784, 5888] + - [35, 12834.0] + - - [1024, 5056, 1, 1280, 1024, 1024, 1024, 5056] + - [16, 12298.0] + - - [4288, 1024, 1, 256, 4288, 4288, 4288, 1024] + - [5, 11370.0] + - - [2944, 2368, 1, 128, 2944, 2944, 2944, 2368] + - [30, 11634.0] + - - [5888, 448, 1, 1280, 5888, 5888, 5888, 448] + - [30, 11377.0] + - - [704, 5888, 1, 3328, 704, 704, 704, 5888] + - [4, 11337.0] + - - [6784, 2368, 1, 1280, 6784, 6784, 6784, 2368] + - [4, 12260.0] + - - [3584, 2944, 1, 256, 3584, 3584, 3584, 2944] + - [31, 12068.0] + - - [2368, 1024, 1, 3328, 2368, 2368, 2368, 1024] + - [1, 11468.0] + - - [1408, 5056, 1, 3328, 1408, 1408, 1408, 5056] + - [31, 12517.0] + - - [1856, 1856, 1, 3328, 1856, 1856, 1856, 1856] + - [3, 11563.0] + - - [2368, 2368, 1, 256, 2368, 2368, 2368, 2368] + - [15, 11282.0] + - - [4288, 4288, 1, 1280, 4288, 4288, 4288, 4288] + - [4, 12339.0] + - - [1408, 4288, 1, 256, 1408, 1408, 1408, 4288] + - [0, 11815.0] + - - [5888, 448, 1, 128, 5888, 5888, 5888, 448] + - [15, 10767.0] + - - [704, 6784, 1, 3328, 704, 704, 704, 6784] + - [19, 11468.0] + - - [5888, 5888, 1, 1280, 5888, 5888, 5888, 5888] + - [4, 12785.0] + - - [5056, 1024, 1, 1280, 5056, 5056, 5056, 1024] + - [16, 12345.0] + - - [448, 5888, 1, 3328, 448, 448, 448, 5888] + - [15, 10235.0] + - - [1024, 2944, 1, 1280, 1024, 1024, 1024, 2944] + - [30, 11607.0] + - - [5056, 5888, 1, 1280, 5056, 5056, 5056, 5888] + - [19, 12626.0] + - - [4288, 5888, 1, 128, 4288, 4288, 4288, 5888] + - [31, 11930.0] + - - [1408, 3584, 1, 128, 1408, 1408, 1408, 3584] + - [30, 11489.0] + - - [448, 3584, 1, 128, 448, 448, 448, 3584] + - [23, 8716.0] + - - [5888, 2944, 1, 1280, 5888, 5888, 5888, 2944] + - [4, 12738.0] + - - [2368, 5888, 1, 128, 2368, 2368, 2368, 5888] + - [13, 11559.0] + - - [3584, 5888, 1, 256, 3584, 3584, 3584, 5888] + - [4, 12421.0] + - - [2368, 704, 1, 128, 2368, 2368, 2368, 704] + - [0, 10037.0] + - - [3584, 2944, 1, 1280, 3584, 3584, 3584, 2944] + - [4, 12414.0] + - - [3584, 2368, 1, 128, 3584, 3584, 3584, 2368] + - [30, 11744.0] + - - [5056, 704, 1, 128, 5056, 5056, 5056, 704] + - [8, 10963.0] + - - [5056, 1408, 1, 3328, 5056, 5056, 5056, 1408] + - [35, 12503.0] + - - [6784, 1024, 1, 3328, 6784, 6784, 6784, 1024] + - [35, 12253.0] + - - [6784, 2944, 1, 3328, 6784, 6784, 6784, 2944] + - [19, 12804.0] + - - [2944, 5056, 1, 3328, 2944, 2944, 2944, 5056] + - [19, 12622.0] + - - [1856, 1856, 1, 256, 1856, 1856, 1856, 1856] + - [0, 11104.0] + - - [1024, 5888, 1, 128, 1024, 1024, 1024, 5888] + - [23, 11581.0] + - - [2048, 7133, 1, 2048, 2048, 2048, 2048, 7133] + - [16, 12628.0] + - - [4288, 5888, 1, 1280, 4288, 4288, 4288, 5888] + - [35, 12477.0] + - - [4288, 4288, 1, 256, 4288, 4288, 4288, 4288] + - [16, 12018.0] + - - [4288, 1856, 1, 1280, 4288, 4288, 4288, 1856] + - [18, 11841.0] + - - [1856, 2944, 1, 3328, 1856, 1856, 1856, 2944] + - [16, 11740.0] + - - [256, 6784, 1, 3328, 256, 256, 256, 6784] + - [3, 11452.0] + - - [256, 5056, 1, 128, 256, 256, 256, 5056] + - [15, 9885.0] + - - [5056, 1024, 1, 256, 5056, 5056, 5056, 1024] + - [20, 11794.0] + - - [5056, 1856, 1, 3328, 5056, 5056, 5056, 1856] + - [35, 12150.0] + - - [1856, 1408, 1, 256, 1856, 1856, 1856, 1408] + - [8, 10670.0] + - - [4288, 1408, 1, 128, 4288, 4288, 4288, 1408] + - [5, 11358.0] + - - [4288, 5056, 1, 256, 4288, 4288, 4288, 5056] + - [1, 12120.0] + - - [5056, 256, 1, 3328, 5056, 5056, 5056, 256] + - [5, 11952.0] + - - [1024, 5888, 1, 1280, 1024, 1024, 1024, 5888] + - [1, 12205.0] + - - [6784, 2368, 1, 128, 6784, 6784, 6784, 2368] + - [30, 11898.0] + - - [5056, 3584, 1, 256, 5056, 5056, 5056, 3584] + - [31, 12313.0] + - - [1856, 1024, 1, 1280, 1856, 1856, 1856, 1024] + - [13, 11662.0] + - - [6784, 4288, 1, 1280, 6784, 6784, 6784, 4288] + - [19, 12502.0] + - - [1856, 1856, 1, 1280, 1856, 1856, 1856, 1856] + - [3, 11451.0] + - - [6784, 2944, 1, 128, 6784, 6784, 6784, 2944] + - [1, 12175.0] + - - [1408, 5056, 1, 1280, 1408, 1408, 1408, 5056] + - [31, 12407.0] + - - [5888, 1856, 1, 128, 5888, 5888, 5888, 1856] + - [30, 11834.0] + - - [2368, 1024, 1, 128, 2368, 2368, 2368, 1024] + - [8, 10237.0] + - - [5056, 3584, 1, 128, 5056, 5056, 5056, 3584] + - [31, 11892.0] + - - [5888, 5888, 1, 3328, 5888, 5888, 5888, 5888] + - [4, 12817.0] + - - [6784, 1024, 1, 256, 6784, 6784, 6784, 1024] + - [31, 11771.0] + - - [2944, 2368, 1, 256, 2944, 2944, 2944, 2368] + - [15, 11822.0] + - - [5056, 5888, 1, 3328, 5056, 5056, 5056, 5888] + - [35, 12670.0] + - - [1856, 1024, 1, 256, 1856, 1856, 1856, 1024] + - [25, 10568.0] + - - [3584, 448, 1, 1280, 3584, 3584, 3584, 448] + - [34, 11491.0] + - - [448, 5888, 1, 256, 448, 448, 448, 5888] + - [15, 9890.0] + - - [1408, 6784, 1, 3328, 1408, 1408, 1408, 6784] + - [35, 12397.0] + - - [4288, 704, 1, 128, 4288, 4288, 4288, 704] + - [0, 10668.0] + - - [5056, 2944, 1, 256, 5056, 5056, 5056, 2944] + - [31, 12212.0] + - - [6784, 5888, 1, 128, 6784, 6784, 6784, 5888] + - [1, 12317.0] + - - [2368, 1856, 1, 256, 2368, 2368, 2368, 1856] + - [0, 11175.0] + - - [1408, 3584, 1, 3328, 1408, 1408, 1408, 3584] + - [4, 12176.0] + - - [2368, 6784, 1, 256, 2368, 2368, 2368, 6784] + - [16, 11957.0] + - - [5056, 1408, 1, 1280, 5056, 5056, 5056, 1408] + - [1, 12433.0] + - - [5056, 4288, 1, 128, 5056, 5056, 5056, 4288] + - [30, 11825.0] + - - [1408, 1856, 1, 128, 1408, 1408, 1408, 1856] + - [30, 11135.0] + - - [1408, 5888, 1, 3328, 1408, 1408, 1408, 5888] + - [19, 12396.0] + - - [6784, 6784, 1, 256, 6784, 6784, 6784, 6784] + - [16, 12587.0] + - - [4288, 2368, 1, 128, 4288, 4288, 4288, 2368] + - [0, 11535.0] + - - [1856, 4288, 1, 128, 1856, 1856, 1856, 4288] + - [0, 11217.0] + - - [2368, 2944, 1, 256, 2368, 2368, 2368, 2944] + - [1, 11769.0] + - - [3584, 1856, 1, 1280, 3584, 3584, 3584, 1856] + - [40, 12156.0] + - - [6784, 6784, 1, 128, 6784, 6784, 6784, 6784] + - [16, 12331.0] + - - [5888, 5056, 1, 256, 5888, 5888, 5888, 5056] + - [4, 12346.0] + - - [3584, 448, 1, 256, 3584, 3584, 3584, 448] + - [30, 10891.0] + - - [448, 4288, 1, 128, 448, 448, 448, 4288] + - [15, 9121.0] + - - [2944, 4288, 1, 3328, 2944, 2944, 2944, 4288] + - [4, 12315.0] + - - [256, 6784, 1, 256, 256, 256, 256, 6784] + - [15, 10818.0] + - - [1408, 4288, 1, 128, 1408, 1408, 1408, 4288] + - [15, 11656.0] + - - [2944, 704, 1, 3328, 2944, 2944, 2944, 704] + - [18, 11651.0] + - - [3584, 3584, 1, 256, 3584, 3584, 3584, 3584] + - [35, 12157.0] + - - [3584, 5056, 1, 256, 3584, 3584, 3584, 5056] + - [4, 12274.0] + - - [2944, 2368, 1, 1280, 2944, 2944, 2944, 2368] + - [16, 12172.0] + - - [1408, 3584, 1, 256, 1408, 1408, 1408, 3584] + - [30, 11750.0] + - - [6784, 3584, 1, 256, 6784, 6784, 6784, 3584] + - [19, 12435.0] + - - [5056, 2368, 1, 128, 5056, 5056, 5056, 2368] + - [30, 11575.0] + - - [2944, 2944, 1, 3328, 2944, 2944, 2944, 2944] + - [35, 12483.0] + - - [5056, 6784, 1, 256, 5056, 5056, 5056, 6784] + - [31, 12395.0] + - - [1856, 3584, 1, 128, 1856, 1856, 1856, 3584] + - [28, 11041.0] + - - [6784, 448, 1, 256, 6784, 6784, 6784, 448] + - [30, 11422.0] + - - [3584, 6784, 1, 128, 3584, 3584, 3584, 6784] + - [4, 12094.0] + - - [5056, 1856, 1, 256, 5056, 5056, 5056, 1856] + - [15, 11789.0] + - - [1024, 1856, 1, 256, 1024, 1024, 1024, 1856] + - [15, 11058.0] + - - [1408, 6784, 1, 1280, 1408, 1408, 1408, 6784] + - [4, 12362.0] + - - [3584, 3584, 1, 1280, 3584, 3584, 3584, 3584] + - [19, 12498.0] + - - [5888, 5888, 1, 128, 5888, 5888, 5888, 5888] + - [9, 12235.0] + - - [5056, 5888, 1, 128, 5056, 5056, 5056, 5888] + - [13, 11949.0] + - - [5056, 2368, 1, 3328, 5056, 5056, 5056, 2368] + - [35, 12273.0] + - - [2944, 4288, 1, 256, 2944, 2944, 2944, 4288] + - [0, 11926.0] + - - [1408, 3584, 1, 1280, 1408, 1408, 1408, 3584] + - [19, 12114.0] + - - [2368, 6784, 1, 3328, 2368, 2368, 2368, 6784] + - [19, 12300.0] + - - [1856, 1408, 1, 1280, 1856, 1856, 1856, 1408] + - [30, 11148.0] + - - [6784, 704, 1, 128, 6784, 6784, 6784, 704] + - [30, 11346.0] + - - [1408, 5888, 1, 256, 1408, 1408, 1408, 5888] + - [1, 12008.0] + - - [704, 2944, 1, 1280, 704, 704, 704, 2944] + - [5, 11005.0] + - - [1856, 2368, 1, 128, 1856, 1856, 1856, 2368] + - [39, 10949.0] + - - [4096, 7133, 1, 4096, 4096, 4096, 4096, 7133] + - [19, 12715.0] + - - [3584, 704, 1, 1280, 3584, 3584, 3584, 704] + - [15, 11498.0] + - - [2944, 6784, 1, 128, 2944, 2944, 2944, 6784] + - [36, 12156.0] + - - [3584, 448, 1, 3328, 3584, 3584, 3584, 448] + - [18, 11615.0] + - - [704, 2368, 1, 3328, 704, 704, 704, 2368] + - [36, 10390.0] + - - [256, 5888, 1, 128, 256, 256, 256, 5888] + - [30, 10007.0] + - - [2944, 2944, 1, 1280, 2944, 2944, 2944, 2944] + - [19, 12439.0] + - - [5888, 2368, 1, 256, 5888, 5888, 5888, 2368] + - [30, 11959.0] + - - [6784, 704, 1, 3328, 6784, 6784, 6784, 704] + - [18, 11844.0] + - - [5888, 4288, 1, 128, 5888, 5888, 5888, 4288] + - [15, 11951.0] + - - [1408, 2944, 1, 3328, 1408, 1408, 1408, 2944] + - [31, 12236.0] + - - [3584, 704, 1, 128, 3584, 3584, 3584, 704] + - [30, 10816.0] + - - [5056, 5056, 1, 128, 5056, 5056, 5056, 5056] + - [23, 11802.0] + - - [448, 5056, 1, 128, 448, 448, 448, 5056] + - [23, 9089.0] + - - [1408, 5056, 1, 128, 1408, 1408, 1408, 5056] + - [0, 11643.0] + - - [2944, 3584, 1, 128, 2944, 2944, 2944, 3584] + - [24, 11800.0] + - - [3584, 2368, 1, 256, 3584, 3584, 3584, 2368] + - [30, 11911.0] + - - [5888, 5056, 1, 1280, 5888, 5888, 5888, 5056] + - [4, 12636.0] + - - [2368, 5056, 1, 128, 2368, 2368, 2368, 5056] + - [30, 11474.0] + - - [3584, 3584, 1, 3328, 3584, 3584, 3584, 3584] + - [4, 12526.0] + - - [5888, 6784, 1, 256, 5888, 5888, 5888, 6784] + - [16, 12576.0] + - - [4288, 2944, 1, 3328, 4288, 4288, 4288, 2944] + - [19, 12318.0] + - - [4288, 704, 1, 1280, 4288, 4288, 4288, 704] + - [0, 11600.0] + - - [256, 5056, 1, 1280, 256, 256, 256, 5056] + - [36, 11676.0] + - - [2944, 5888, 1, 3328, 2944, 2944, 2944, 5888] + - [35, 12780.0] + - - [6784, 5888, 1, 1280, 6784, 6784, 6784, 5888] + - [35, 12811.0] + - - [5888, 4288, 1, 1280, 5888, 5888, 5888, 4288] + - [19, 12483.0] + - - [5888, 3584, 1, 128, 5888, 5888, 5888, 3584] + - [31, 12066.0] + - - [1856, 1856, 1, 128, 1856, 1856, 1856, 1856] + - [8, 10791.0] + - - [704, 3584, 1, 128, 704, 704, 704, 3584] + - [30, 9925.0] + - - [5888, 448, 1, 3328, 5888, 5888, 5888, 448] + - [30, 11437.0] + - - [2368, 4288, 1, 1280, 2368, 2368, 2368, 4288] + - [1, 11920.0] + - - [4288, 2944, 1, 128, 4288, 4288, 4288, 2944] + - [13, 11696.0] + - - [1024, 6784, 1, 3328, 1024, 1024, 1024, 6784] + - [19, 12252.0] + - - [5056, 2944, 1, 3328, 5056, 5056, 5056, 2944] + - [19, 12623.0] + - - [2944, 3584, 1, 256, 2944, 2944, 2944, 3584] + - [31, 12059.0] + - - [1408, 1408, 1, 3328, 1408, 1408, 1408, 1408] + - [15, 11186.0] + - - [3584, 3584, 1, 128, 3584, 3584, 3584, 3584] + - [0, 11818.0] + - - [3584, 704, 1, 256, 3584, 3584, 3584, 704] + - [0, 11179.0] + - - [3584, 1408, 1, 3328, 3584, 3584, 3584, 1408] + - [4, 12177.0] + - - [704, 3584, 1, 1280, 704, 704, 704, 3584] + - [15, 10814.0] + - - [2944, 6784, 1, 1280, 2944, 2944, 2944, 6784] + - [35, 12767.0] + - - [1856, 6784, 1, 256, 1856, 1856, 1856, 6784] + - [1, 11831.0] + - - [4288, 448, 1, 3328, 4288, 4288, 4288, 448] + - [30, 11619.0] + - - [6784, 4288, 1, 128, 6784, 6784, 6784, 4288] + - [0, 12049.0] + - - [6784, 704, 1, 1280, 6784, 6784, 6784, 704] + - [15, 11772.0] + - - [5888, 1024, 1, 3328, 5888, 5888, 5888, 1024] + - [35, 12277.0] + - - [704, 6784, 1, 1280, 704, 704, 704, 6784] + - [1, 11384.0] + - - [1856, 5056, 1, 3328, 1856, 1856, 1856, 5056] + - [19, 12148.0] + - - [1024, 3584, 1, 128, 1024, 1024, 1024, 3584] + - [23, 11048.0] + - - [1024, 1408, 1, 128, 1024, 1024, 1024, 1408] + - [0, 10185.0] + - - [2368, 2944, 1, 128, 2368, 2368, 2368, 2944] + - [20, 11373.0] + - - [5056, 2944, 1, 128, 5056, 5056, 5056, 2944] + - [36, 11830.0] + - - [5888, 5056, 1, 3328, 5888, 5888, 5888, 5056] + - [4, 12673.0] + - - [1408, 2368, 1, 128, 1408, 1408, 1408, 2368] + - [30, 10994.0] + - - [5888, 2368, 1, 128, 5888, 5888, 5888, 2368] + - [30, 11841.0] + - - [3584, 6784, 1, 1280, 3584, 3584, 3584, 6784] + - [4, 12665.0] + - - [3072, 7435, 1, 1024, 3072, 3072, 3072, 7435] + - [4, 12519.0] + - - [1856, 5888, 1, 256, 1856, 1856, 1856, 5888] + - [31, 11715.0] + - - [4288, 4288, 1, 3328, 4288, 4288, 4288, 4288] + - [35, 12386.0] + - - [4288, 1408, 1, 1280, 4288, 4288, 4288, 1408] + - [16, 12130.0] + - - [3584, 5056, 1, 128, 3584, 3584, 3584, 5056] + - [0, 11920.0] + - - [4288, 2368, 1, 256, 4288, 4288, 4288, 2368] + - [0, 11754.0] + - - [2944, 5056, 1, 1280, 2944, 2944, 2944, 5056] + - [31, 12572.0] + - - [448, 6784, 1, 256, 448, 448, 448, 6784] + - [28, 9960.0] + - - [6784, 2368, 1, 3328, 6784, 6784, 6784, 2368] + - [12, 12299.0] + - - [4288, 1856, 1, 3328, 4288, 4288, 4288, 1856] + - [19, 11879.0] + - - [3584, 448, 1, 128, 3584, 3584, 3584, 448] + - [8, 10094.0] + - - [3584, 1024, 1, 1280, 3584, 3584, 3584, 1024] + - [15, 11751.0] + - - [1856, 5056, 1, 256, 1856, 1856, 1856, 5056] + - [24, 11600.0] + - - [1024, 4288, 1, 256, 1024, 1024, 1024, 4288] + - [0, 11736.0] + - - [5888, 3584, 1, 3328, 5888, 5888, 5888, 3584] + - [19, 12706.0] + - - [5056, 3584, 1, 3328, 5056, 5056, 5056, 3584] + - [19, 12629.0] + - - [2368, 1408, 1, 1280, 2368, 2368, 2368, 1408] + - [23, 11573.0] + - - [5056, 2944, 1, 1280, 5056, 5056, 5056, 2944] + - [31, 12572.0] + - - [1024, 6784, 1, 256, 1024, 1024, 1024, 6784] + - [30, 11802.0] + - - [2944, 1408, 1, 128, 2944, 2944, 2944, 1408] + - [36, 11371.0] + - - [5056, 6784, 1, 3328, 5056, 5056, 5056, 6784] + - [35, 12665.0] + - - [3584, 4288, 1, 256, 3584, 3584, 3584, 4288] + - [1, 12074.0] + - - [1856, 6784, 1, 3328, 1856, 1856, 1856, 6784] + - [19, 12261.0] + - - [5888, 4288, 1, 256, 5888, 5888, 5888, 4288] + - [9, 12197.0] + - - [5056, 1408, 1, 256, 5056, 5056, 5056, 1408] + - [5, 11957.0] + - - [3584, 1024, 1, 256, 3584, 3584, 3584, 1024] + - [0, 11458.0] + - - [5888, 5888, 1, 256, 5888, 5888, 5888, 5888] + - [24, 12539.0] + - - [4288, 1024, 1, 1280, 4288, 4288, 4288, 1024] + - [9, 11919.0] + - - [448, 6784, 1, 3328, 448, 448, 448, 6784] + - [31, 10531.0] + - - [2944, 1408, 1, 1280, 2944, 2944, 2944, 1408] + - [31, 12143.0] + - - [2944, 1856, 1, 3328, 2944, 2944, 2944, 1856] + - [34, 11947.0] + - - [2944, 2944, 1, 128, 2944, 2944, 2944, 2944] + - [0, 11740.0] + - - [3584, 5888, 1, 1280, 3584, 3584, 3584, 5888] + - [19, 12651.0] + - - [6784, 1856, 1, 1280, 6784, 6784, 6784, 1856] + - [34, 12208.0] + - - [2944, 5056, 1, 256, 2944, 2944, 2944, 5056] + - [1, 12173.0] + - - [2944, 5888, 1, 1280, 2944, 2944, 2944, 5888] + - [35, 12740.0] + - - [5888, 256, 1, 3328, 5888, 5888, 5888, 256] + - [36, 11257.0] + - - [1856, 5888, 1, 3328, 1856, 1856, 1856, 5888] + - [35, 12155.0] + - - [3584, 1408, 1, 256, 3584, 3584, 3584, 1408] + - [0, 11695.0] + - - [704, 3584, 1, 3328, 704, 704, 704, 3584] + - [15, 10903.0] + - - [5056, 448, 1, 1280, 5056, 5056, 5056, 448] + - [8, 11638.0] + - - [3584, 1856, 1, 3328, 3584, 3584, 3584, 1856] + - [12, 12233.0] + - - [2944, 1024, 1, 256, 2944, 2944, 2944, 1024] + - [0, 11253.0] + - - [1024, 2368, 1, 128, 1024, 1024, 1024, 2368] + - [15, 10991.0] + - - [2368, 4288, 1, 3328, 2368, 2368, 2368, 4288] + - [1, 11971.0] + - - [1024, 1408, 1, 1280, 1024, 1024, 1024, 1408] + - [39, 11323.0] + - - [6784, 5056, 1, 256, 6784, 6784, 6784, 5056] + - [1, 12358.0] + - - [448, 6784, 1, 128, 448, 448, 448, 6784] + - [15, 9410.0] + - - [2944, 6784, 1, 256, 2944, 2944, 2944, 6784] + - [1, 12489.0] + - - [2368, 2368, 1, 1280, 2368, 2368, 2368, 2368] + - [15, 11551.0] + - - [1856, 3584, 1, 1280, 1856, 1856, 1856, 3584] + - [31, 12158.0] + - - [3584, 1408, 1, 1280, 3584, 3584, 3584, 1408] + - [16, 12082.0] + - - [4288, 448, 1, 128, 4288, 4288, 4288, 448] + - [23, 9781.0] + - - [5056, 256, 1, 1280, 5056, 5056, 5056, 256] + - [13, 11782.0] + - - [1856, 1408, 1, 3328, 1856, 1856, 1856, 1408] + - [30, 11308.0] + - - [1024, 4288, 1, 3328, 1024, 1024, 1024, 4288] + - [30, 12025.0] + - - [5056, 448, 1, 256, 5056, 5056, 5056, 448] + - [0, 11130.0] + - - [2944, 2368, 1, 3328, 2944, 2944, 2944, 2368] + - [35, 12259.0] + - - [1024, 1856, 1, 1280, 1024, 1024, 1024, 1856] + - [20, 11599.0] + - - [6784, 1856, 1, 256, 6784, 6784, 6784, 1856] + - [8, 12005.0] + - - [1024, 5888, 1, 256, 1024, 1024, 1024, 5888] + - [15, 11826.0] + - - [1408, 2368, 1, 256, 1408, 1408, 1408, 2368] + - [0, 11411.0] + - - [1408, 1408, 1, 256, 1408, 1408, 1408, 1408] + - [15, 10812.0] + - - [2368, 2368, 1, 128, 2368, 2368, 2368, 2368] + - [30, 11090.0] + - - [6784, 1408, 1, 128, 6784, 6784, 6784, 1408] + - [20, 11736.0] + - - [4288, 5888, 1, 256, 4288, 4288, 4288, 5888] + - [40, 12218.0] + - - [1408, 5056, 1, 256, 1408, 1408, 1408, 5056] + - [20, 11862.0] + - - [4288, 3584, 1, 128, 4288, 4288, 4288, 3584] + - [15, 11754.0] + - - [3584, 5056, 1, 1280, 3584, 3584, 3584, 5056] + - [4, 12591.0] + - - [1856, 1024, 1, 128, 1856, 1856, 1856, 1024] + - [8, 9593.0] + - - [704, 4288, 1, 256, 704, 704, 704, 4288] + - [30, 10271.0] + - - [5888, 2368, 1, 1280, 5888, 5888, 5888, 2368] + - [1, 12299.0] + - - [2368, 5888, 1, 1280, 2368, 2368, 2368, 5888] + - [19, 12292.0] + - - [5888, 256, 1, 1280, 5888, 5888, 5888, 256] + - [5, 11094.0] + - - [2368, 1856, 1, 3328, 2368, 2368, 2368, 1856] + - [18, 11648.0] + - - [2944, 704, 1, 256, 2944, 2944, 2944, 704] + - [0, 11049.0] + - - [704, 3584, 1, 256, 704, 704, 704, 3584] + - [30, 10418.0] + - - [704, 2944, 1, 3328, 704, 704, 704, 2944] + - [20, 11128.0] + - - [6784, 1024, 1, 128, 6784, 6784, 6784, 1024] + - [30, 11599.0] + - - [2944, 1024, 1, 3328, 2944, 2944, 2944, 1024] + - [30, 11707.0] + - - [2944, 5056, 1, 128, 2944, 2944, 2944, 5056] + - [30, 11915.0] + - - [1408, 6784, 1, 256, 1408, 1408, 1408, 6784] + - [31, 12052.0] + - - [6784, 1408, 1, 3328, 6784, 6784, 6784, 1408] + - [35, 12390.0] + - - [4288, 6784, 1, 128, 4288, 4288, 4288, 6784] + - [31, 11947.0] + - - [6784, 2944, 1, 1280, 6784, 6784, 6784, 2944] + - [42, 12756.0] + - - [4288, 1856, 1, 128, 4288, 4288, 4288, 1856] + - [23, 11372.0] + - - [1856, 2944, 1, 128, 1856, 1856, 1856, 2944] + - [0, 11018.0] + - - [6784, 448, 1, 128, 6784, 6784, 6784, 448] + - [15, 11102.0] + - - [448, 5056, 1, 1280, 448, 448, 448, 5056] + - [20, 10562.0] + - - [2368, 1856, 1, 128, 2368, 2368, 2368, 1856] + - [0, 10911.0] + - - [4288, 704, 1, 256, 4288, 4288, 4288, 704] + - [3, 11081.0] + - - [5888, 704, 1, 256, 5888, 5888, 5888, 704] + - [30, 11524.0] + - - [3584, 1024, 1, 128, 3584, 3584, 3584, 1024] + - [0, 11201.0] + - - [256, 5888, 1, 3328, 256, 256, 256, 5888] + - [5, 11268.0] + - - [1408, 4288, 1, 3328, 1408, 1408, 1408, 4288] + - [16, 12255.0] + - - [6784, 4288, 1, 256, 6784, 6784, 6784, 4288] + - [16, 12250.0] + - - [5888, 256, 1, 256, 5888, 5888, 5888, 256] + - [17, 10329.0] + - - [6784, 1024, 1, 1280, 6784, 6784, 6784, 1024] + - [35, 12216.0] + - - [5888, 1024, 1, 128, 5888, 5888, 5888, 1024] + - [0, 11598.0] + - - [6784, 3584, 1, 1280, 6784, 6784, 6784, 3584] + - [19, 12662.0] + - - [1024, 6784, 1, 1280, 1024, 1024, 1024, 6784] + - [4, 12182.0] + - - [1408, 2944, 1, 1280, 1408, 1408, 1408, 2944] + - [1, 12167.0] + - - [1408, 2368, 1, 3328, 1408, 1408, 1408, 2368] + - [18, 11732.0] + - - [2944, 1856, 1, 128, 2944, 2944, 2944, 1856] + - [30, 11454.0] + - - [256, 6784, 1, 128, 256, 256, 256, 6784] + - [15, 10486.0] + - - [5056, 6784, 1, 128, 5056, 5056, 5056, 6784] + - [5, 11981.0] + - - [4288, 5056, 1, 128, 4288, 4288, 4288, 5056] + - [30, 11739.0] + - - [1856, 5888, 1, 128, 1856, 1856, 1856, 5888] + - [36, 11352.0] + - - [2944, 5888, 1, 256, 2944, 2944, 2944, 5888] + - [35, 12480.0] + - - [3584, 1856, 1, 256, 3584, 3584, 3584, 1856] + - [0, 11813.0] + - - [4288, 3584, 1, 1280, 4288, 4288, 4288, 3584] + - [9, 12434.0] + - - [704, 4288, 1, 3328, 704, 704, 704, 4288] + - [34, 10623.0] + - - [704, 5888, 1, 128, 704, 704, 704, 5888] + - [15, 10355.0] + - - [6784, 3584, 1, 128, 6784, 6784, 6784, 3584] + - [24, 12113.0] + - - [4288, 5056, 1, 3328, 4288, 4288, 4288, 5056] + - [4, 12467.0] + - - [1408, 1408, 1, 128, 1408, 1408, 1408, 1408] + - [0, 10183.0] + - - [5056, 2368, 1, 256, 5056, 5056, 5056, 2368] + - [30, 11828.0] + - - [4288, 704, 1, 3328, 4288, 4288, 4288, 704] + - [3, 11700.0] + - - [448, 3584, 1, 256, 448, 448, 448, 3584] + - [20, 9279.0] + - - [2368, 1024, 1, 1280, 2368, 2368, 2368, 1024] + - [1, 11303.0] + - - [2944, 1408, 1, 3328, 2944, 2944, 2944, 1408] + - [19, 12253.0] + - - [1024, 1408, 1, 3328, 1024, 1024, 1024, 1408] + - [34, 11528.0] + - - [2560, 7133, 1, 2560, 2560, 2560, 2560, 7133] + - [19, 12723.0] + - - [5888, 3584, 1, 256, 5888, 5888, 5888, 3584] + - [4, 12391.0] + - - [1408, 1856, 1, 3328, 1408, 1408, 1408, 1856] + - [18, 11965.0] + - - [6784, 1408, 1, 1280, 6784, 6784, 6784, 1408] + - [19, 12346.0] + - - [704, 2944, 1, 256, 704, 704, 704, 2944] + - [30, 10207.0] + - - [704, 4288, 1, 128, 704, 704, 704, 4288] + - [39, 9913.0] + - - [2368, 4288, 1, 128, 2368, 2368, 2368, 4288] + - [0, 11449.0] + - - [1024, 6784, 1, 128, 1024, 1024, 1024, 6784] + - [23, 11608.0] + - - [1408, 1408, 1, 1280, 1408, 1408, 1408, 1408] + - [30, 11092.0] + - - [448, 4288, 1, 3328, 448, 448, 448, 4288] + - [16, 10305.0] + - - [2368, 1408, 1, 256, 2368, 2368, 2368, 1408] + - [0, 11163.0] + - - [5888, 5056, 1, 128, 5888, 5888, 5888, 5056] + - [30, 12027.0] + - - [704, 2368, 1, 256, 704, 704, 704, 2368] + - [8, 9612.0] + - - [5888, 2368, 1, 3328, 5888, 5888, 5888, 2368] + - [19, 12352.0] + - - [4288, 448, 1, 1280, 4288, 4288, 4288, 448] + - [0, 11423.0] + - - [5888, 704, 1, 3328, 5888, 5888, 5888, 704] + - [18, 11827.0] + - - [5056, 256, 1, 128, 5056, 5056, 5056, 256] + - [0, 9467.0] + - - [1408, 5888, 1, 128, 1408, 1408, 1408, 5888] + - [15, 11702.0] + - - [1408, 1024, 1, 256, 1408, 1408, 1408, 1024] + - [15, 10869.0] + - - [1024, 1856, 1, 128, 1024, 1024, 1024, 1856] + - [23, 10559.0] + - - [5056, 6784, 1, 1280, 5056, 5056, 5056, 6784] + - [35, 12633.0] + - - [704, 5056, 1, 3328, 704, 704, 704, 5056] + - [19, 11307.0] + - - [3584, 5056, 1, 3328, 3584, 3584, 3584, 5056] + - [19, 12636.0] + - - [2368, 2944, 1, 3328, 2368, 2368, 2368, 2944] + - [1, 12242.0] + - - [2368, 3584, 1, 256, 2368, 2368, 2368, 3584] + - [1, 11680.0] + - - [5056, 3584, 1, 1280, 5056, 5056, 5056, 3584] + - [16, 12579.0] + - - [1856, 2944, 1, 1280, 1856, 1856, 1856, 2944] + - [31, 11648.0] + - - [3584, 2368, 1, 1280, 3584, 3584, 3584, 2368] + - [3, 12142.0] + - - [2944, 1408, 1, 256, 2944, 2944, 2944, 1408] + - [36, 11726.0] + - - [4288, 1408, 1, 3328, 4288, 4288, 4288, 1408] + - [19, 12263.0] + - - [2944, 1024, 1, 128, 2944, 2944, 2944, 1024] + - [30, 10950.0] + - - [4288, 5056, 1, 1280, 4288, 4288, 4288, 5056] + - [1, 12411.0] + - - [5888, 6784, 1, 1280, 5888, 5888, 5888, 6784] + - [4, 12801.0] + - - [6784, 5056, 1, 128, 6784, 6784, 6784, 5056] + - [30, 12025.0] + - - [5888, 1408, 1, 3328, 5888, 5888, 5888, 1408] + - [4, 12392.0] + - - [256, 5056, 1, 256, 256, 256, 256, 5056] + - [15, 10800.0] + - - [448, 3584, 1, 3328, 448, 448, 448, 3584] + - [5, 10043.0] + - - [704, 2368, 1, 128, 704, 704, 704, 2368] + - [15, 9119.0] + - - [5888, 256, 1, 128, 5888, 5888, 5888, 256] + - [36, 9986.0] + - - [3584, 1856, 1, 128, 3584, 3584, 3584, 1856] + - [23, 11603.0] + - - [4288, 4288, 1, 128, 4288, 4288, 4288, 4288] + - [30, 11715.0] + - - [1856, 1024, 1, 3328, 1856, 1856, 1856, 1024] + - [5, 11837.0] + - - [1024, 5056, 1, 256, 1024, 1024, 1024, 5056] + - [20, 11783.0] + - - [2368, 1408, 1, 3328, 2368, 2368, 2368, 1408] + - [18, 11691.0] + - - [5888, 448, 1, 256, 5888, 5888, 5888, 448] + - [30, 11059.0] + - - [5888, 6784, 1, 128, 5888, 5888, 5888, 6784] + - [1, 12273.0] + - - [6784, 5056, 1, 1280, 6784, 6784, 6784, 5056] + - [19, 12638.0] + - - [5056, 704, 1, 1280, 5056, 5056, 5056, 704] + - [15, 11786.0] + - - [4288, 6784, 1, 1280, 4288, 4288, 4288, 6784] + - [35, 12485.0] + - - [6784, 1408, 1, 256, 6784, 6784, 6784, 1408] + - [1, 11991.0] + - - [3584, 5888, 1, 128, 3584, 3584, 3584, 5888] + - [19, 12098.0] + - - [5056, 5888, 1, 256, 5056, 5056, 5056, 5888] + - [31, 12357.0] + - - [2368, 1024, 1, 256, 2368, 2368, 2368, 1024] + - [10, 10651.0] + - - [2944, 1856, 1, 256, 2944, 2944, 2944, 1856] + - [15, 11649.0] + - - [1856, 6784, 1, 1280, 1856, 1856, 1856, 6784] + - [31, 12198.0] + - - [4288, 3584, 1, 256, 4288, 4288, 4288, 3584] + - [1, 12096.0] + - - [5056, 1856, 1, 1280, 5056, 5056, 5056, 1856] + - [19, 12085.0] + - - [1408, 1024, 1, 3328, 1408, 1408, 1408, 1024] + - [3, 11499.0] + - - [5888, 3584, 1, 1280, 5888, 5888, 5888, 3584] + - [4, 12667.0] + - - [1856, 3584, 1, 3328, 1856, 1856, 1856, 3584] + - [4, 12222.0] + - - [1024, 2944, 1, 256, 1024, 1024, 1024, 2944] + - [0, 11280.0] + - - [448, 6784, 1, 1280, 448, 448, 448, 6784] + - [16, 10460.0] + - - [704, 5056, 1, 256, 704, 704, 704, 5056] + - [20, 10695.0] + - - [3584, 1024, 1, 3328, 3584, 3584, 3584, 1024] + - [30, 11801.0] + - - [2944, 1856, 1, 1280, 2944, 2944, 2944, 1856] + - [18, 11899.0] + - - [5056, 256, 1, 256, 5056, 5056, 5056, 256] + - [0, 10661.0] + - - [2368, 3584, 1, 3328, 2368, 2368, 2368, 3584] + - [19, 12186.0] + - - [2944, 704, 1, 1280, 2944, 2944, 2944, 704] + - [30, 11520.0] + - - [2944, 3584, 1, 1280, 2944, 2944, 2944, 3584] + - [31, 12419.0] + - - [1856, 5888, 1, 1280, 1856, 1856, 1856, 5888] + - [16, 12094.0] + - - [5056, 448, 1, 3328, 5056, 5056, 5056, 448] + - [3, 11774.0] + - - [4288, 1408, 1, 256, 4288, 4288, 4288, 1408] + - [13, 11709.0] + - - [5888, 1408, 1, 128, 5888, 5888, 5888, 1408] + - [30, 11713.0] + - - [4288, 2368, 1, 1280, 4288, 4288, 4288, 2368] + - [18, 12017.0] + - - [6784, 2368, 1, 256, 6784, 6784, 6784, 2368] + - [30, 12038.0] + - - [4288, 1856, 1, 256, 4288, 4288, 4288, 1856] + - [15, 11606.0] + - - [1856, 2944, 1, 256, 1856, 1856, 1856, 2944] + - [23, 11264.0] + - - [5056, 1024, 1, 128, 5056, 5056, 5056, 1024] + - [5, 11410.0] + - - [1760, 7133, 1, 1760, 1760, 1760, 1760, 7133] + - [4, 12200.0] + - - [6784, 256, 1, 128, 6784, 6784, 6784, 256] + - [15, 10188.0] + - - [5888, 704, 1, 128, 5888, 5888, 5888, 704] + - [15, 11236.0] + - - [1024, 4288, 1, 1280, 1024, 1024, 1024, 4288] + - [41, 11976.0] + - - [2368, 5056, 1, 3328, 2368, 2368, 2368, 5056] + - [4, 12252.0] + - - [4288, 1024, 1, 3328, 4288, 4288, 4288, 1024] + - [16, 12021.0] + - - [1024, 5056, 1, 3328, 1024, 1024, 1024, 5056] + - [27, 12404.0] + - - [1024, 1856, 1, 3328, 1024, 1024, 1024, 1856] + - [36, 11774.0] + - - [704, 6784, 1, 128, 704, 704, 704, 6784] + - [17, 10421.0] + - - [4288, 6784, 1, 256, 4288, 4288, 4288, 6784] + - [9, 12254.0] + - - [3584, 2944, 1, 3328, 3584, 3584, 3584, 2944] + - [35, 12458.0] + - - [5888, 2944, 1, 256, 5888, 5888, 5888, 2944] + - [1, 12380.0] + - - [2368, 6784, 1, 128, 2368, 2368, 2368, 6784] + - [16, 11632.0] + - - [448, 4288, 1, 1280, 448, 448, 448, 4288] + - [5, 10159.0] + - - [5056, 4288, 1, 256, 5056, 5056, 5056, 4288] + - [12, 12064.0] + - - [1024, 3584, 1, 256, 1024, 1024, 1024, 3584] + - [30, 11469.0] + - - [1856, 5056, 1, 128, 1856, 1856, 1856, 5056] + - [8, 11293.0] + - - [6784, 6784, 1, 3328, 6784, 6784, 6784, 6784] + - [4, 12787.0] + - - [448, 5888, 1, 1280, 448, 448, 448, 5888] + - [30, 10149.0] + - - [5056, 448, 1, 128, 5056, 5056, 5056, 448] + - [8, 10691.0] + - - [3584, 2944, 1, 128, 3584, 3584, 3584, 2944] + - [0, 11748.0] + - - [6784, 256, 1, 1280, 6784, 6784, 6784, 256] + - [30, 11374.0] + - - [2368, 5888, 1, 3328, 2368, 2368, 2368, 5888] + - [1, 12345.0] + - - [2368, 1856, 1, 1280, 2368, 2368, 2368, 1856] + - [30, 11620.0] + - - [3584, 4288, 1, 128, 3584, 3584, 3584, 4288] + - [0, 11964.0] + - - [5888, 4288, 1, 3328, 5888, 5888, 5888, 4288] + - [4, 12489.0] + - - [2368, 704, 1, 256, 2368, 2368, 2368, 704] + - [0, 10264.0] + - - [3584, 1408, 1, 128, 3584, 3584, 3584, 1408] + - [0, 11489.0] + - - [1856, 5056, 1, 1280, 1856, 1856, 1856, 5056] + - [4, 12063.0] + - - [2944, 1024, 1, 1280, 2944, 2944, 2944, 1024] + - [30, 11618.0] + - - [3584, 5888, 1, 3328, 3584, 3584, 3584, 5888] + - [19, 12683.0] + - - [2368, 4288, 1, 256, 2368, 2368, 2368, 4288] + - [0, 11605.0] + - - [1024, 2368, 1, 3328, 1024, 1024, 1024, 2368] + - [34, 11828.0] + - - [6784, 1856, 1, 3328, 6784, 6784, 6784, 1856] + - [19, 12247.0] + - - [1024, 2944, 1, 128, 1024, 1024, 1024, 2944] + - [15, 10821.0] + - - [1024, 3584, 1, 1280, 1024, 1024, 1024, 3584] + - [34, 11755.0] + - - [4288, 5888, 1, 3328, 4288, 4288, 4288, 5888] + - [35, 12489.0] + - - [1024, 2944, 1, 3328, 1024, 1024, 1024, 2944] + - [34, 11676.0] + - - [3584, 6784, 1, 256, 3584, 3584, 3584, 6784] + - [31, 12397.0] + - - [256, 6784, 1, 1280, 256, 256, 256, 6784] + - [15, 11374.0] + - - [1856, 3584, 1, 256, 1856, 1856, 1856, 3584] + - [31, 11692.0] + - - [6784, 1856, 1, 128, 6784, 6784, 6784, 1856] + - [0, 11843.0] + - - [2944, 704, 1, 128, 2944, 2944, 2944, 704] + - [30, 10758.0] + - - [256, 5888, 1, 1280, 256, 256, 256, 5888] + - [36, 11146.0] + - - [4288, 6784, 1, 3328, 4288, 4288, 4288, 6784] + - [35, 12500.0] + - - [7680, 5481, 1, 2560, 7680, 7680, 7680, 5481] + - [19, 12754.0] + - - [2368, 1408, 1, 128, 2368, 2368, 2368, 1408] + - [0, 10723.0] + - - [1408, 1024, 1, 128, 1408, 1408, 1408, 1024] + - [0, 10264.0] + - - [6784, 3584, 1, 3328, 6784, 6784, 6784, 3584] + - [19, 12674.0] + - - [2368, 5056, 1, 1280, 2368, 2368, 2368, 5056] + - [16, 12185.0] + - - [1408, 2368, 1, 1280, 1408, 1408, 1408, 2368] + - [34, 11643.0] + - - [2944, 4288, 1, 128, 2944, 2944, 2944, 4288] + - [0, 11805.0] + - - [2944, 2944, 1, 256, 2944, 2944, 2944, 2944] + - [24, 12076.0] + - - [6784, 256, 1, 256, 6784, 6784, 6784, 256] + - [15, 10929.0] + - - [256, 5056, 1, 3328, 256, 256, 256, 5056] + - [5, 11873.0] + - - [5056, 1856, 1, 128, 5056, 5056, 5056, 1856] + - [23, 11490.0] + - - [5888, 1408, 1, 256, 5888, 5888, 5888, 1408] + - [12, 11963.0] + - - [4288, 3584, 1, 3328, 4288, 4288, 4288, 3584] + - [4, 12469.0] + - - [1024, 2368, 1, 1280, 1024, 1024, 1024, 2368] + - [30, 11714.0] + - - [5888, 6784, 1, 3328, 5888, 5888, 5888, 6784] + - [4, 12809.0] + - - [704, 4288, 1, 1280, 704, 704, 704, 4288] + - [0, 10540.0] + - - [6784, 448, 1, 3328, 6784, 6784, 6784, 448] + - [18, 11761.0] + - - [4288, 1024, 1, 128, 4288, 4288, 4288, 1024] + - [5, 10986.0] + - - [196, 256, 256, 1024, 196, 196, 196, 256] + - [4, 9525.0] + - - [784, 512, 256, 128, 784, 784, 784, 512] + - [4, 10709.0] + - - [784, 128, 128, 512, 784, 784, 784, 128] + - [4, 10871.0] + - - [3136, 256, 256, 64, 3136, 3136, 3136, 256] + - [0, 8245.0] + - - [784, 128, 256, 512, 784, 784, 784, 128] + - [4, 10991.0] + - - [196, 256, 128, 1024, 196, 196, 196, 256] + - [4, 9420.0] + - - [3136, 256, 128, 64, 3136, 3136, 3136, 256] + - [0, 9064.0] + - - [784, 512, 128, 128, 784, 784, 784, 512] + - [36, 10603.0] + - - [196, 1024, 128, 256, 196, 196, 196, 1024] + - [4, 9277.0] + - - [196, 1024, 256, 256, 196, 196, 196, 1024] + - [20, 9371.0] + - - [5329, 160, 64, 64, 5329, 5329, 5329, 160] + - [34, 7070.0] + - - [1225, 384, 64, 192, 1225, 1225, 1225, 384] + - [5, 11544.0] + - - [289, 1024, 64, 256, 289, 289, 289, 1024] + - [36, 9180.0] + - - [1225, 384, 64, 64, 1225, 1225, 1225, 384] + - [0, 10415.0] + - - [1225, 384, 64, 96, 1225, 1225, 1225, 384] + - [3, 10383.0] + - - [289, 1024, 64, 384, 289, 289, 289, 1024] + - [16, 9357.0] + - - [289, 1024, 64, 192, 289, 289, 289, 1024] + - [1, 9070.0] + - - [289, 1024, 64, 128, 289, 289, 289, 1024] + - [0, 8872.0] + - - [4096, 1024, 1, 2984, 4096, 4096, 4096, 1024] + - [31, 12341.0] + - - [1024, 4096, 1, 3437, 1024, 1024, 1024, 4096] + - [19, 12342.0] + - - [1024, 4096, 1, 3235, 1024, 1024, 1024, 4096] + - [19, 12356.0] + - - [4096, 1024, 1, 4032, 4096, 4096, 4096, 1024] + - [19, 12354.0] + - - [1024, 4096, 1, 3334, 1024, 1024, 1024, 4096] + - [16, 12345.0] + - - [4096, 1024, 1, 3288, 4096, 4096, 4096, 1024] + - [19, 12340.0] + - - [1024, 4096, 1, 3515, 1024, 1024, 1024, 4096] + - [19, 12344.0] + - - [4096, 1024, 1, 3437, 4096, 4096, 4096, 1024] + - [19, 12346.0] + - - [1024, 4096, 1, 3259, 1024, 1024, 1024, 4096] + - [16, 12337.0] + - - [1024, 4096, 1, 3384, 1024, 1024, 1024, 4096] + - [19, 12363.0] + - - [4096, 1024, 1, 3458, 4096, 4096, 4096, 1024] + - [12, 12344.0] + - - [1024, 4096, 1, 3412, 1024, 1024, 1024, 4096] + - [4, 12346.0] + - - [1024, 4096, 1, 3529, 1024, 1024, 1024, 4096] + - [19, 12346.0] + - - [1024, 4096, 1, 4032, 1024, 1024, 1024, 4096] + - [4, 12363.0] + - - [4096, 1024, 1, 3999, 4096, 4096, 4096, 1024] + - [19, 12355.0] + - - [1024, 4096, 1, 3079, 1024, 1024, 1024, 4096] + - [19, 12332.0] + - - [1024, 4096, 1, 3876, 1024, 1024, 1024, 4096] + - [19, 12368.0] + - - [1024, 4096, 1, 3450, 1024, 1024, 1024, 4096] + - [4, 12345.0] + - - [1024, 4096, 1, 3256, 1024, 1024, 1024, 4096] + - [16, 12365.0] + - - [4096, 1024, 1, 3403, 4096, 4096, 4096, 1024] + - [4, 12341.0] + - - [1024, 4096, 1, 3359, 1024, 1024, 1024, 4096] + - [16, 12357.0] + - - [4096, 1024, 1, 3549, 4096, 4096, 4096, 1024] + - [27, 12353.0] + - - [4096, 1024, 1, 3176, 4096, 4096, 4096, 1024] + - [19, 12356.0] + - - [1024, 4096, 1, 3504, 1024, 1024, 1024, 4096] + - [19, 12349.0] + - - [4096, 1024, 1, 3314, 4096, 4096, 4096, 1024] + - [16, 12354.0] + - - [4096, 1024, 1, 3183, 4096, 4096, 4096, 1024] + - [35, 12372.0] + - - [1024, 4096, 1, 3209, 1024, 1024, 1024, 4096] + - [4, 12358.0] + - - [1024, 4096, 1, 3720, 1024, 1024, 1024, 4096] + - [16, 12364.0] + - - [1024, 4096, 1, 3859, 1024, 1024, 1024, 4096] + - [16, 12361.0] + - - [1024, 33708, 1, 4059, 1024, 1024, 1024, 33708] + - [4, 12728.0] + - - [4096, 1024, 1, 3477, 4096, 4096, 4096, 1024] + - [4, 12342.0] + - - [4096, 1024, 1, 3233, 4096, 4096, 4096, 1024] + - [19, 12342.0] + - - [4096, 1024, 1, 3409, 4096, 4096, 4096, 1024] + - [16, 12347.0] + - - [4096, 1024, 1, 3564, 4096, 4096, 4096, 1024] + - [16, 12367.0] + - - [4096, 1024, 1, 3190, 4096, 4096, 4096, 1024] + - [1, 12351.0] + - - [1024, 4096, 1, 3288, 1024, 1024, 1024, 4096] + - [35, 12340.0] + - - [4096, 1024, 1, 3451, 4096, 4096, 4096, 1024] + - [1, 12353.0] + - - [1024, 4096, 1, 3348, 1024, 1024, 1024, 4096] + - [19, 12340.0] + - - [1024, 4096, 1, 3465, 1024, 1024, 1024, 4096] + - [19, 12345.0] + - - [1024, 33708, 1, 4032, 1024, 1024, 1024, 33708] + - [19, 12731.0] + - - [1024, 33708, 1, 3840, 1024, 1024, 1024, 33708] + - [19, 12734.0] + - - [4096, 1024, 1, 3391, 4096, 4096, 4096, 1024] + - [19, 12341.0] + - - [1024, 4096, 1, 3530, 1024, 1024, 1024, 4096] + - [19, 12346.0] + - - [4096, 1024, 1, 3209, 4096, 4096, 4096, 1024] + - [1, 12338.0] + - - [1024, 4096, 1, 3457, 1024, 1024, 1024, 4096] + - [4, 12346.0] + - - [1024, 4096, 1, 3386, 1024, 1024, 1024, 4096] + - [16, 12341.0] + - - [4096, 1024, 1, 3350, 4096, 4096, 4096, 1024] + - [19, 12343.0] + - - [1024, 4096, 1, 3184, 1024, 1024, 1024, 4096] + - [19, 12345.0] + - - [1024, 4096, 1, 3093, 1024, 1024, 1024, 4096] + - [1, 12357.0] + - - [1024, 4096, 1, 3400, 1024, 1024, 1024, 4096] + - [4, 12357.0] + - - [1024, 4096, 1, 3214, 1024, 1024, 1024, 4096] + - [4, 12359.0] + - - [4096, 1024, 1, 3406, 4096, 4096, 4096, 1024] + - [16, 12342.0] + - - [1024, 4096, 1, 3565, 1024, 1024, 1024, 4096] + - [12, 12350.0] + - - [4096, 1024, 1, 3536, 4096, 4096, 4096, 1024] + - [19, 12352.0] + - - [1024, 4096, 1, 3183, 1024, 1024, 1024, 4096] + - [16, 12340.0] + - - [1024, 4096, 1, 3462, 1024, 1024, 1024, 4096] + - [4, 12342.0] + - - [4096, 1024, 1, 3130, 4096, 4096, 4096, 1024] + - [16, 12337.0] + - - [4096, 1024, 1, 3381, 4096, 4096, 4096, 1024] + - [16, 12344.0] + - - [4096, 1024, 1, 3298, 4096, 4096, 4096, 1024] + - [1, 12342.0] + - - [1024, 4096, 1, 3292, 1024, 1024, 1024, 4096] + - [16, 12339.0] + - - [4096, 1024, 1, 3289, 4096, 4096, 4096, 1024] + - [16, 12344.0] + - - [1024, 4096, 1, 3379, 1024, 1024, 1024, 4096] + - [19, 12345.0] + - - [1024, 4096, 1, 3990, 1024, 1024, 1024, 4096] + - [19, 12356.0] + - - [1024, 4096, 1, 3540, 1024, 1024, 1024, 4096] + - [19, 12356.0] + - - [4096, 1024, 1, 3412, 4096, 4096, 4096, 1024] + - [16, 12348.0] + - - [1024, 4096, 1, 3555, 1024, 1024, 1024, 4096] + - [4, 12347.0] + - - [1024, 4096, 1, 3518, 1024, 1024, 1024, 4096] + - [16, 12353.0] + - - [4096, 1024, 1, 3189, 4096, 4096, 4096, 1024] + - [19, 12336.0] + - - [1024, 4096, 1, 3298, 1024, 1024, 1024, 4096] + - [4, 12342.0] + - - [4096, 1024, 1, 3072, 4096, 4096, 4096, 1024] + - [1, 12343.0] + - - [1024, 4096, 1, 3393, 1024, 1024, 1024, 4096] + - [16, 12346.0] + - - [1024, 4096, 1, 3207, 1024, 1024, 1024, 4096] + - [1, 12359.0] + - - [4096, 1024, 1, 3487, 4096, 4096, 4096, 1024] + - [19, 12345.0] + - - [4096, 1024, 1, 3431, 4096, 4096, 4096, 1024] + - [27, 12352.0] + - - [4096, 1024, 1, 3378, 4096, 4096, 4096, 1024] + - [1, 12344.0] + - - [4096, 1024, 1, 3529, 4096, 4096, 4096, 1024] + - [19, 12350.0] + - - [4096, 1024, 1, 3460, 4096, 4096, 4096, 1024] + - [19, 12342.0] + - - [1024, 4096, 1, 3336, 1024, 1024, 1024, 4096] + - [19, 12339.0] + - - [1024, 4096, 1, 3501, 1024, 1024, 1024, 4096] + - [19, 12361.0] + - - [1024, 4096, 1, 3584, 1024, 1024, 1024, 4096] + - [4, 12360.0] + - - [4096, 1024, 1, 2499, 4096, 4096, 4096, 1024] + - [16, 12323.0] + - - [4096, 1024, 1, 3352, 4096, 4096, 4096, 1024] + - [4, 12355.0] + - - [1024, 4096, 1, 3543, 1024, 1024, 1024, 4096] + - [4, 12347.0] + - - [1024, 4096, 1, 3476, 1024, 1024, 1024, 4096] + - [16, 12345.0] + - - [1024, 33708, 1, 3822, 1024, 1024, 1024, 33708] + - [4, 12731.0] + - - [1024, 4096, 1, 3436, 1024, 1024, 1024, 4096] + - [4, 12340.0] + - - [1024, 4096, 1, 3594, 1024, 1024, 1024, 4096] + - [16, 12352.0] + - - [4096, 1024, 1, 3514, 4096, 4096, 4096, 1024] + - [4, 12338.0] + - - [1024, 4096, 1, 3064, 1024, 1024, 1024, 4096] + - [19, 12334.0] + - - [4096, 1024, 1, 3371, 4096, 4096, 4096, 1024] + - [4, 12338.0] + - - [4096, 1024, 1, 3558, 4096, 4096, 4096, 1024] + - [19, 12346.0] + - - [4096, 1024, 1, 3517, 4096, 4096, 4096, 1024] + - [19, 12348.0] + - - [4096, 1024, 1, 3144, 4096, 4096, 4096, 1024] + - [16, 12342.0] + - - [1024, 4096, 1, 3312, 1024, 1024, 1024, 4096] + - [19, 12348.0] + - - [4096, 1024, 1, 3079, 4096, 4096, 4096, 1024] + - [1, 12333.0] + - - [1024, 4096, 1, 3415, 1024, 1024, 1024, 4096] + - [16, 12343.0] + - - [1024, 4096, 1, 3221, 1024, 1024, 1024, 4096] + - [19, 12350.0] + - - [1024, 4096, 1, 3978, 1024, 1024, 1024, 4096] + - [16, 12347.0] + - - [4096, 1024, 1, 3876, 4096, 4096, 4096, 1024] + - [19, 12356.0] + - - [1024, 4096, 1, 3528, 1024, 1024, 1024, 4096] + - [16, 12347.0] + - - [1024, 4096, 1, 3181, 1024, 1024, 1024, 4096] + - [1, 12339.0] + - - [4096, 1024, 1, 3445, 4096, 4096, 4096, 1024] + - [1, 12342.0] + - - [4096, 1024, 1, 3450, 4096, 4096, 4096, 1024] + - [1, 12343.0] + - - [4096, 1024, 1, 3377, 4096, 4096, 4096, 1024] + - [19, 12339.0] + - - [1024, 4096, 1, 3532, 1024, 1024, 1024, 4096] + - [16, 12344.0] + - - [1024, 33708, 1, 3944, 1024, 1024, 1024, 33708] + - [4, 12728.0] + - - [4096, 1024, 1, 3483, 4096, 4096, 4096, 1024] + - [4, 12341.0] + - - [1024, 4096, 1, 3358, 1024, 1024, 1024, 4096] + - [4, 12341.0] + - - [4096, 1024, 1, 3464, 4096, 4096, 4096, 1024] + - [16, 12348.0] + - - [4096, 1024, 1, 3282, 4096, 4096, 4096, 1024] + - [4, 12342.0] + - - [4096, 1024, 1, 3256, 4096, 4096, 4096, 1024] + - [27, 12354.0] + - - [1024, 4096, 1, 3057, 1024, 1024, 1024, 4096] + - [16, 12336.0] + - - [4096, 1024, 1, 3481, 4096, 4096, 4096, 1024] + - [19, 12345.0] + - - [4096, 1024, 1, 3340, 4096, 4096, 4096, 1024] + - [19, 12340.0] + - - [1024, 4096, 1, 3273, 1024, 1024, 1024, 4096] + - [16, 12343.0] + - - [4096, 1024, 1, 3392, 4096, 4096, 4096, 1024] + - [4, 12354.0] + - - [4096, 1024, 1, 3337, 4096, 4096, 4096, 1024] + - [19, 12342.0] + - - [4096, 1024, 1, 3359, 4096, 4096, 4096, 1024] + - [1, 12342.0] + - - [4096, 1024, 1, 3498, 4096, 4096, 4096, 1024] + - [19, 12351.0] + - - [4096, 1024, 1, 3169, 4096, 4096, 4096, 1024] + - [19, 12340.0] + - - [1024, 33708, 1, 3859, 1024, 1024, 1024, 33708] + - [35, 12730.0] + - - [1024, 4096, 1, 3103, 1024, 1024, 1024, 4096] + - [16, 12337.0] + - - [4096, 1024, 1, 3900, 4096, 4096, 4096, 1024] + - [19, 12353.0] + - - [1024, 4096, 1, 3442, 1024, 1024, 1024, 4096] + - [19, 12343.0] + - - [1024, 4096, 1, 3248, 1024, 1024, 1024, 4096] + - [4, 12347.0] + - - [1024, 4096, 1, 3351, 1024, 1024, 1024, 4096] + - [4, 12364.0] + - - [4096, 1024, 1, 3593, 4096, 4096, 4096, 1024] + - [19, 12345.0] + - - [1024, 4096, 1, 3780, 1024, 1024, 1024, 4096] + - [19, 12354.0] + - - [1024, 33708, 1, 3681, 1024, 1024, 1024, 33708] + - [4, 12728.0] + - - [4096, 1024, 1, 3374, 4096, 4096, 4096, 1024] + - [19, 12344.0] + - - [1024, 4096, 1, 3557, 1024, 1024, 1024, 4096] + - [4, 12347.0] + - - [4096, 1024, 1, 3906, 4096, 4096, 4096, 1024] + - [35, 12341.0] + - - [4096, 1024, 1, 3504, 4096, 4096, 4096, 1024] + - [19, 12359.0] + - - [1024, 4096, 1, 3270, 1024, 1024, 1024, 4096] + - [16, 12344.0] + - - [4096, 1024, 1, 3098, 4096, 4096, 4096, 1024] + - [16, 12339.0] + - - [4096, 1024, 1, 3216, 4096, 4096, 4096, 1024] + - [12, 12350.0] + - - [1024, 4096, 1, 3550, 1024, 1024, 1024, 4096] + - [4, 12345.0] + - - [4096, 1024, 1, 3449, 4096, 4096, 4096, 1024] + - [19, 12351.0] + - - [1024, 4096, 1, 3403, 1024, 1024, 1024, 4096] + - [16, 12342.0] + - - [1024, 4096, 1, 3523, 1024, 1024, 1024, 4096] + - [16, 12347.0] + - - [1024, 4096, 1, 3486, 1024, 1024, 1024, 4096] + - [19, 12342.0] + - - [1024, 4096, 1, 3564, 1024, 1024, 1024, 4096] + - [19, 12347.0] + - - [1024, 33708, 1, 4005, 1024, 1024, 1024, 33708] + - [4, 12731.0] + - - [4096, 1024, 1, 3296, 4096, 4096, 4096, 1024] + - [4, 12348.0] + - - [1024, 4096, 1, 3263, 1024, 1024, 1024, 4096] + - [16, 12342.0] + - - [1024, 4096, 1, 3130, 1024, 1024, 1024, 4096] + - [16, 12340.0] + - - [1024, 4096, 1, 3295, 1024, 1024, 1024, 4096] + - [1, 12336.0] + - - [1024, 33708, 1, 3925, 1024, 1024, 1024, 33708] + - [4, 12731.0] + - - [1024, 4096, 1, 3378, 1024, 1024, 1024, 4096] + - [27, 12330.0] + - - [4096, 1024, 1, 3720, 4096, 4096, 4096, 1024] + - [1, 12352.0] + - - [4096, 1024, 1, 3399, 4096, 4096, 4096, 1024] + - [35, 12343.0] + - - [4096, 1024, 1, 3543, 4096, 4096, 4096, 1024] + - [19, 12361.0] + - - [4096, 1024, 1, 3497, 4096, 4096, 4096, 1024] + - [19, 12339.0] + - - [4096, 1024, 1, 3594, 4096, 4096, 4096, 1024] + - [4, 12348.0] + - - [1024, 4096, 1, 3144, 1024, 1024, 1024, 4096] + - [4, 12338.0] + - - [1024, 4096, 1, 3975, 1024, 1024, 1024, 4096] + - [19, 12354.0] + - - [4096, 1024, 1, 3205, 4096, 4096, 4096, 1024] + - [19, 12337.0] + - - [1024, 33708, 1, 3995, 1024, 1024, 1024, 33708] + - [19, 12730.0] + - - [1024, 4096, 1, 3392, 1024, 1024, 1024, 4096] + - [4, 12353.0] + - - [1024, 4096, 1, 3055, 1024, 1024, 1024, 4096] + - [1, 12346.0] + - - [1024, 4096, 1, 4026, 1024, 1024, 1024, 4096] + - [4, 12360.0] + - - [4096, 1024, 1, 3557, 4096, 4096, 4096, 1024] + - [19, 12351.0] + - - [4096, 1024, 1, 3515, 4096, 4096, 4096, 1024] + - [19, 12344.0] + - - [4096, 1024, 1, 3486, 4096, 4096, 4096, 1024] + - [35, 12363.0] + - - [4096, 1024, 1, 3457, 4096, 4096, 4096, 1024] + - [19, 12345.0] + - - [1024, 4096, 1, 3511, 1024, 1024, 1024, 4096] + - [4, 12351.0] + - - [4096, 1024, 1, 3138, 4096, 4096, 4096, 1024] + - [16, 12338.0] + - - [1024, 4096, 1, 3339, 1024, 1024, 1024, 4096] + - [19, 12343.0] + - - [1024, 4096, 1, 3939, 1024, 1024, 1024, 4096] + - [19, 12354.0] + - - [4096, 1024, 1, 3500, 4096, 4096, 4096, 1024] + - [4, 12346.0] + - - [4096, 1024, 1, 3395, 4096, 4096, 4096, 1024] + - [12, 12346.0] + - - [4096, 1024, 1, 4020, 4096, 4096, 4096, 1024] + - [19, 12354.0] + - - [4096, 1024, 1, 3942, 4096, 4096, 4096, 1024] + - [19, 12349.0] + - - [4096, 1024, 1, 3349, 4096, 4096, 4096, 1024] + - [19, 12348.0] + - - [1024, 4096, 1, 3322, 1024, 1024, 1024, 4096] + - [16, 12348.0] + - - [4096, 1024, 1, 3452, 4096, 4096, 4096, 1024] + - [19, 12343.0] + - - [1024, 4096, 1, 3417, 1024, 1024, 1024, 4096] + - [16, 12349.0] + - - [1024, 4096, 1, 3526, 1024, 1024, 1024, 4096] + - [16, 12348.0] + - - [4096, 1024, 1, 3485, 4096, 4096, 4096, 1024] + - [19, 12366.0] + - - [4096, 1024, 1, 3303, 4096, 4096, 4096, 1024] + - [4, 12360.0] + - - [4096, 1024, 1, 3344, 4096, 4096, 4096, 1024] + - [12, 12350.0] + - - [1024, 4096, 1, 3479, 1024, 1024, 1024, 4096] + - [16, 12346.0] + - - [4096, 1024, 1, 3300, 4096, 4096, 4096, 1024] + - [19, 12350.0] + - - [1024, 4096, 1, 3439, 1024, 1024, 1024, 4096] + - [16, 12340.0] + - - [4096, 1024, 1, 3280, 4096, 4096, 4096, 1024] + - [19, 12347.0] + - - [1024, 4096, 1, 3245, 1024, 1024, 1024, 4096] + - [1, 12333.0] + - - [1024, 4096, 1, 3328, 1024, 1024, 1024, 4096] + - [19, 12349.0] + - - [4096, 1024, 1, 3418, 4096, 4096, 4096, 1024] + - [19, 12345.0] + - - [1024, 4096, 1, 3493, 1024, 1024, 1024, 4096] + - [4, 12346.0] + - - [1024, 4096, 1, 3500, 1024, 1024, 1024, 4096] + - [16, 12350.0] + - - [1024, 4096, 1, 3166, 1024, 1024, 1024, 4096] + - [16, 12342.0] + - - [4096, 1024, 1, 3126, 4096, 4096, 4096, 1024] + - [19, 12335.0] + - - [1024, 4096, 1, 3277, 1024, 1024, 1024, 4096] + - [4, 12341.0] + - - [1024, 4096, 1, 3315, 1024, 1024, 1024, 4096] + - [1, 12363.0] + - - [1024, 4096, 1, 3414, 1024, 1024, 1024, 4096] + - [16, 12343.0] + - - [4096, 1024, 1, 3531, 4096, 4096, 4096, 1024] + - [19, 12342.0] + - - [4096, 1024, 1, 3484, 4096, 4096, 4096, 1024] + - [4, 12353.0] + - - [1024, 4096, 1, 3180, 1024, 1024, 1024, 4096] + - [19, 12348.0] + - - [4096, 1024, 1, 3360, 4096, 4096, 4096, 1024] + - [1, 12343.0] + - - [1024, 33708, 1, 3990, 1024, 1024, 1024, 33708] + - [4, 12728.0] + - - [4096, 1024, 1, 3466, 4096, 4096, 4096, 1024] + - [4, 12341.0] + - - [1024, 4096, 1, 3428, 1024, 1024, 1024, 4096] + - [4, 12340.0] + - - [1024, 4096, 1, 3137, 1024, 1024, 1024, 4096] + - [1, 12337.0] + - - [4096, 1024, 1, 4059, 4096, 4096, 4096, 1024] + - [19, 12353.0] + - - [1024, 4096, 1, 3353, 1024, 1024, 1024, 4096] + - [4, 12346.0] + - - [1024, 4096, 1, 3942, 1024, 1024, 1024, 4096] + - [4, 12356.0] + - - [4096, 1024, 1, 3506, 4096, 4096, 4096, 1024] + - [19, 12344.0] + - - [4096, 1024, 1, 3508, 4096, 4096, 4096, 1024] + - [4, 12342.0] + - - [4096, 1024, 1, 3956, 4096, 4096, 4096, 1024] + - [4, 12352.0] + - - [1024, 4096, 1, 3272, 1024, 1024, 1024, 4096] + - [16, 12343.0] + - - [1024, 4096, 1, 3443, 1024, 1024, 1024, 4096] + - [1, 12346.0] + - - [1024, 4096, 1, 3375, 1024, 1024, 1024, 4096] + - [16, 12345.0] + - - [1024, 4096, 1, 3525, 1024, 1024, 1024, 4096] + - [4, 12348.0] + - - [4096, 1024, 1, 3472, 4096, 4096, 4096, 1024] + - [4, 12348.0] + - - [1024, 4096, 1, 3520, 1024, 1024, 1024, 4096] + - [19, 12347.0] + - - [4096, 1024, 1, 3322, 4096, 4096, 4096, 1024] + - [19, 12345.0] + - - [4096, 1024, 1, 3387, 4096, 4096, 4096, 1024] + - [19, 12338.0] + - - [1024, 33708, 1, 3939, 1024, 1024, 1024, 33708] + - [4, 12731.0] + - - [4096, 1024, 1, 3345, 4096, 4096, 4096, 1024] + - [1, 12344.0] + - - [4096, 1024, 1, 2967, 4096, 4096, 4096, 1024] + - [16, 12333.0] + - - [1024, 4096, 1, 3453, 1024, 1024, 1024, 4096] + - [16, 12345.0] + - - [1024, 4096, 1, 3640, 1024, 1024, 1024, 4096] + - [16, 12354.0] + - - [4096, 1024, 1, 3291, 4096, 4096, 4096, 1024] + - [19, 12342.0] + - - [1024, 4096, 1, 3350, 1024, 1024, 1024, 4096] + - [19, 12341.0] + - - [4096, 1024, 1, 3417, 4096, 4096, 4096, 1024] + - [1, 12338.0] + - - [1024, 4096, 1, 3467, 1024, 1024, 1024, 4096] + - [16, 12343.0] + - - [1024, 4096, 1, 3491, 1024, 1024, 1024, 4096] + - [1, 12342.0] + - - [1024, 4096, 1, 3822, 1024, 1024, 1024, 4096] + - [19, 12350.0] + - - [4096, 1024, 1, 3292, 4096, 4096, 4096, 1024] + - [1, 12336.0] + - - [1024, 4096, 1, 3231, 1024, 1024, 1024, 4096] + - [16, 12339.0] + - - [1024, 4096, 1, 3364, 1024, 1024, 1024, 4096] + - [16, 12346.0] + - - [1024, 4096, 1, 3995, 1024, 1024, 1024, 4096] + - [4, 12370.0] + - - [1024, 4096, 1, 3545, 1024, 1024, 1024, 4096] + - [19, 12348.0] + - - [1024, 4096, 1, 3186, 1024, 1024, 1024, 4096] + - [1, 12344.0] + - - [4096, 1024, 1, 3432, 4096, 4096, 4096, 1024] + - [1, 12344.0] + - - [4096, 1024, 1, 3367, 4096, 4096, 4096, 1024] + - [1, 12338.0] + - - [4096, 1024, 1, 3503, 4096, 4096, 4096, 1024] + - [27, 12336.0] + - - [1024, 4096, 1, 3095, 1024, 1024, 1024, 4096] + - [4, 12340.0] + - - [4096, 1024, 1, 3465, 4096, 4096, 4096, 1024] + - [35, 12352.0] + - - [1024, 4096, 1, 3402, 1024, 1024, 1024, 4096] + - [16, 12344.0] + - - [4096, 1024, 1, 3140, 4096, 4096, 4096, 1024] + - [16, 12341.0] + - - [4096, 1024, 1, 3424, 4096, 4096, 4096, 1024] + - [19, 12352.0] + - - [4096, 1024, 1, 3257, 4096, 4096, 4096, 1024] + - [16, 12340.0] + - - [4096, 1024, 1, 2917, 4096, 4096, 4096, 1024] + - [16, 12352.0] + - - [1024, 33708, 1, 3640, 1024, 1024, 1024, 33708] + - [4, 12731.0] + - - [1024, 4096, 1, 3456, 1024, 1024, 1024, 4096] + - [4, 12352.0] + - - [1024, 4096, 1, 3014, 1024, 1024, 1024, 4096] + - [4, 12335.0] + - - [4096, 1024, 1, 3372, 4096, 4096, 4096, 1024] + - [19, 12355.0] + - - [1024, 4096, 1, 3294, 1024, 1024, 1024, 4096] + - [16, 12340.0] + - - [4096, 1024, 1, 3446, 4096, 4096, 4096, 1024] + - [19, 12344.0] + - - [1024, 4096, 1, 3389, 1024, 1024, 1024, 4096] + - [16, 12342.0] + - - [4096, 1024, 1, 3259, 4096, 4096, 4096, 1024] + - [16, 12343.0] + - - [4096, 1024, 1, 3544, 4096, 4096, 4096, 1024] + - [19, 12346.0] + - - [4096, 1024, 1, 3479, 4096, 4096, 4096, 1024] + - [1, 12340.0] + - - [4096, 1024, 1, 3542, 4096, 4096, 4096, 1024] + - [19, 12347.0] + - - [4096, 1024, 1, 3321, 4096, 4096, 4096, 1024] + - [1, 12343.0] + - - [1024, 4096, 1, 3147, 1024, 1024, 1024, 4096] + - [19, 12333.0] + - - [1024, 4096, 1, 3944, 1024, 1024, 1024, 4096] + - [4, 12352.0] + - - [4096, 1024, 1, 3870, 4096, 4096, 4096, 1024] + - [19, 12351.0] + - - [1024, 4096, 1, 3308, 1024, 1024, 1024, 4096] + - [4, 12340.0] + - - [4096, 1024, 1, 3401, 4096, 4096, 4096, 1024] + - [19, 12356.0] + - - [1024, 4096, 1, 3395, 1024, 1024, 1024, 4096] + - [4, 12348.0] + - - [1024, 4096, 1, 3563, 1024, 1024, 1024, 4096] + - [19, 12339.0] + - - [1024, 33708, 1, 3870, 1024, 1024, 1024, 33708] + - [4, 12730.0] + - - [4096, 1024, 1, 3494, 4096, 4096, 4096, 1024] + - [19, 12339.0] + - - [1024, 4096, 1, 3271, 1024, 1024, 1024, 4096] + - [4, 12344.0] + - - [1024, 33708, 1, 3910, 1024, 1024, 1024, 33708] + - [19, 12732.0] + - - [1024, 4096, 1, 3287, 1024, 1024, 1024, 4096] + - [4, 12341.0] + - - [1024, 33708, 1, 3860, 1024, 1024, 1024, 33708] + - [4, 12730.0] + - - [4096, 1024, 1, 3341, 4096, 4096, 4096, 1024] + - [19, 12338.0] + - - [1024, 4096, 1, 3136, 1024, 1024, 1024, 4096] + - [16, 12343.0] + - - [4096, 1024, 1, 3439, 4096, 4096, 4096, 1024] + - [4, 12344.0] + - - [1024, 4096, 1, 3751, 1024, 1024, 1024, 4096] + - [4, 12350.0] + - - [1024, 4096, 1, 3301, 1024, 1024, 1024, 4096] + - [16, 12343.0] + - - [4096, 1024, 1, 3468, 4096, 4096, 4096, 1024] + - [19, 12343.0] + - - [1024, 4096, 1, 3416, 1024, 1024, 1024, 4096] + - [16, 12345.0] + - - [4096, 1024, 1, 3163, 4096, 4096, 4096, 1024] + - [19, 12333.0] + - - [1024, 4096, 1, 3230, 1024, 1024, 1024, 4096] + - [16, 12336.0] + - - [1024, 4096, 1, 3581, 1024, 1024, 1024, 4096] + - [12, 12348.0] + - - [4096, 1024, 1, 3463, 4096, 4096, 4096, 1024] + - [19, 12341.0] + - - [1024, 4096, 1, 3478, 1024, 1024, 1024, 4096] + - [19, 12346.0] + - - [4096, 1024, 1, 3262, 4096, 4096, 4096, 1024] + - [16, 12338.0] + - - [1024, 4096, 1, 3438, 1024, 1024, 1024, 4096] + - [4, 12337.0] + - - [1024, 4096, 1, 3244, 1024, 1024, 1024, 4096] + - [4, 12343.0] + - - [1024, 4096, 1, 3445, 1024, 1024, 1024, 4096] + - [4, 12347.0] + - - [4096, 1024, 1, 3328, 4096, 4096, 4096, 1024] + - [16, 12340.0] + - - [1024, 4096, 1, 3492, 1024, 1024, 1024, 4096] + - [4, 12346.0] + - - [4096, 1024, 1, 3211, 4096, 4096, 4096, 1024] + - [1, 12331.0] + - - [1024, 4096, 1, 3910, 1024, 1024, 1024, 4096] + - [19, 12354.0] + - - [1024, 4096, 1, 3314, 1024, 1024, 1024, 4096] + - [19, 12340.0] + - - [4096, 1024, 1, 3859, 4096, 4096, 4096, 1024] + - [4, 12348.0] + - - [4096, 1024, 1, 3383, 4096, 4096, 4096, 1024] + - [19, 12343.0] + - - [1024, 4096, 1, 3409, 1024, 1024, 1024, 4096] + - [4, 12339.0] + - - [1024, 4096, 1, 4020, 1024, 1024, 1024, 4096] + - [19, 12356.0] + - - [4096, 1024, 1, 3530, 4096, 4096, 4096, 1024] + - [19, 12343.0] + - - [4096, 1024, 1, 3411, 4096, 4096, 4096, 1024] + - [19, 12340.0] + - - [1024, 4096, 1, 3566, 1024, 1024, 1024, 4096] + - [4, 12348.0] + - - [4096, 1024, 1, 3493, 4096, 4096, 4096, 1024] + - [4, 12348.0] + - - [4096, 1024, 1, 3184, 4096, 4096, 4096, 1024] + - [12, 12344.0] + - - [1024, 4096, 1, 3072, 1024, 1024, 1024, 4096] + - [16, 12344.0] + - - [1024, 4096, 1, 3431, 1024, 1024, 1024, 4096] + - [19, 12348.0] + - - [4096, 1024, 1, 3306, 4096, 4096, 4096, 1024] + - [27, 12345.0] + - - [1024, 4096, 1, 3352, 1024, 1024, 1024, 4096] + - [16, 12349.0] + - - [4096, 1024, 1, 3295, 4096, 4096, 4096, 1024] + - [4, 12338.0] + - - [1024, 4096, 1, 3517, 1024, 1024, 1024, 4096] + - [16, 12327.0] + - - [4096, 1024, 1, 3426, 4096, 4096, 4096, 1024] + - [19, 12343.0] + - - [4096, 1024, 1, 3385, 4096, 4096, 4096, 1024] + - [19, 12347.0] + - - [4096, 1024, 1, 3572, 4096, 4096, 4096, 1024] + - [19, 12350.0] + - - [4096, 1024, 1, 3459, 4096, 4096, 4096, 1024] + - [19, 12340.0] + - - [1024, 4096, 1, 3374, 1024, 1024, 1024, 4096] + - [4, 12341.0] + - - [4096, 1024, 1, 3166, 4096, 4096, 4096, 1024] + - [16, 12333.0] + - - [4096, 1024, 1, 3093, 4096, 4096, 4096, 1024] + - [1, 12331.0] + - - [4096, 1024, 1, 3523, 4096, 4096, 4096, 1024] + - [19, 12343.0] + - - [4096, 1024, 1, 3413, 4096, 4096, 4096, 1024] + - [19, 12341.0] + - - [1024, 4096, 1, 3996, 1024, 1024, 1024, 4096] + - [19, 12352.0] + - - [1024, 4096, 1, 3452, 1024, 1024, 1024, 4096] + - [19, 12344.0] + - - [4096, 1024, 1, 3232, 4096, 4096, 4096, 1024] + - [19, 12348.0] + - - [4096, 1024, 1, 3400, 4096, 4096, 4096, 1024] + - [1, 12346.0] + - - [4096, 1024, 1, 3334, 4096, 4096, 4096, 1024] + - [16, 12350.0] + - - [1024, 4096, 1, 3345, 1024, 1024, 1024, 4096] + - [16, 12345.0] + - - [1024, 4096, 1, 3538, 1024, 1024, 1024, 4096] + - [16, 12344.0] + - - [1024, 4096, 1, 3466, 1024, 1024, 1024, 4096] + - [19, 12345.0] + - - [4096, 1024, 1, 3315, 4096, 4096, 4096, 1024] + - [35, 12339.0] + - - [4096, 1024, 1, 3214, 4096, 4096, 4096, 1024] + - [4, 12340.0] + - - [1024, 33708, 1, 3900, 1024, 1024, 1024, 33708] + - [4, 12732.0] + - - [1024, 4096, 1, 3367, 1024, 1024, 1024, 4096] + - [4, 12345.0] + - - [1024, 4096, 1, 2917, 1024, 1024, 1024, 4096] + - [1, 12333.0] + - - [1024, 4096, 1, 3544, 1024, 1024, 1024, 4096] + - [16, 12351.0] + - - [4096, 1024, 1, 3414, 4096, 4096, 4096, 1024] + - [4, 12343.0] + - - [4096, 1024, 1, 3565, 4096, 4096, 4096, 1024] + - [19, 12346.0] + - - [1024, 4096, 1, 3512, 1024, 1024, 1024, 4096] + - [16, 12347.0] + - - [1024, 4096, 1, 3191, 1024, 1024, 1024, 4096] + - [16, 12338.0] + - - [1024, 4096, 1, 3289, 1024, 1024, 1024, 4096] + - [16, 12341.0] + - - [4096, 1024, 1, 3290, 4096, 4096, 4096, 1024] + - [19, 12337.0] + - - [1024, 4096, 1, 3211, 1024, 1024, 1024, 4096] + - [16, 12340.0] + - - [1024, 33708, 1, 3969, 1024, 1024, 1024, 33708] + - [4, 12729.0] + - - [4096, 1024, 1, 3566, 4096, 4096, 4096, 1024] + - [19, 12343.0] + - - [1024, 4096, 1, 3459, 1024, 1024, 1024, 4096] + - [4, 12343.0] + - - [1024, 4096, 1, 3372, 1024, 1024, 1024, 4096] + - [19, 12362.0] + - - [4096, 1024, 1, 3339, 4096, 4096, 4096, 1024] + - [19, 12339.0] + - - [4096, 1024, 1, 3425, 4096, 4096, 4096, 1024] + - [19, 12346.0] + - - [4096, 1024, 1, 3388, 4096, 4096, 4096, 1024] + - [35, 12352.0] + - - [1024, 4096, 1, 3531, 1024, 1024, 1024, 4096] + - [16, 12345.0] + - - [4096, 1024, 1, 3286, 4096, 4096, 4096, 1024] + - [19, 12340.0] + - - [4096, 1024, 1, 3462, 4096, 4096, 4096, 1024] + - [19, 12343.0] + - - [1024, 4096, 1, 3388, 1024, 1024, 1024, 4096] + - [4, 12345.0] + - - [4096, 1024, 1, 3165, 4096, 4096, 4096, 1024] + - [19, 12335.0] + - - [4096, 1024, 1, 3304, 4096, 4096, 4096, 1024] + - [19, 12365.0] + - - [1024, 4096, 1, 2736, 1024, 1024, 1024, 4096] + - [4, 12336.0] + - - [4096, 1024, 1, 3397, 4096, 4096, 4096, 1024] + - [19, 12344.0] + - - [1024, 4096, 1, 3311, 1024, 1024, 1024, 4096] + - [4, 12344.0] + - - [1024, 4096, 1, 3394, 1024, 1024, 1024, 4096] + - [1, 12350.0] + - - [4096, 1024, 1, 2736, 4096, 4096, 4096, 1024] + - [4, 12331.0] + - - [1024, 4096, 1, 3559, 1024, 1024, 1024, 4096] + - [19, 12346.0] + - - [4096, 1024, 1, 3180, 4096, 4096, 4096, 1024] + - [19, 12338.0] + - - [1024, 4096, 1, 3480, 1024, 1024, 1024, 4096] + - [16, 12352.0] + - - [4096, 1024, 1, 3318, 4096, 4096, 4096, 1024] + - [19, 12343.0] + - - [4096, 1024, 1, 3213, 4096, 4096, 4096, 1024] + - [4, 12337.0] + - - [1024, 4096, 1, 3286, 1024, 1024, 1024, 4096] + - [19, 12341.0] + - - [4096, 1024, 1, 3471, 4096, 4096, 4096, 1024] + - [4, 12347.0] + - - [1024, 4096, 1, 3381, 1024, 1024, 1024, 4096] + - [4, 12343.0] + - - [4096, 1024, 1, 3502, 4096, 4096, 4096, 1024] + - [16, 12339.0] + - - [1024, 4096, 1, 3552, 1024, 1024, 1024, 4096] + - [4, 12353.0] + - - [4096, 1024, 1, 3519, 4096, 4096, 4096, 1024] + - [4, 12343.0] + - - [1024, 4096, 1, 3300, 1024, 1024, 1024, 4096] + - [19, 12341.0] + - - [1024, 4096, 1, 3419, 1024, 1024, 1024, 4096] + - [16, 12346.0] + - - [4096, 1024, 1, 4030, 4096, 4096, 4096, 1024] + - [19, 12354.0] + - - [4096, 1024, 1, 3976, 4096, 4096, 4096, 1024] + - [19, 12355.0] + - - [1024, 4096, 1, 3473, 1024, 1024, 1024, 4096] + - [4, 12346.0] + - - [4096, 1024, 1, 3428, 4096, 4096, 4096, 1024] + - [16, 12339.0] + - - [1024, 4096, 1, 3433, 1024, 1024, 1024, 4096] + - [4, 12344.0] + - - [4096, 1024, 1, 3534, 4096, 4096, 4096, 1024] + - [19, 12350.0] + - - [4096, 1024, 1, 3461, 4096, 4096, 4096, 1024] + - [19, 12350.0] + - - [4096, 1024, 1, 3681, 4096, 4096, 4096, 1024] + - [4, 12343.0] + - - [4096, 1024, 1, 3495, 4096, 4096, 4096, 1024] + - [19, 12350.0] + - - [4096, 1024, 1, 3351, 4096, 4096, 4096, 1024] + - [19, 12340.0] + - - [1024, 4096, 1, 4059, 1024, 1024, 1024, 4096] + - [4, 12354.0] + - - [4096, 1024, 1, 3990, 4096, 4096, 4096, 1024] + - [27, 12347.0] + - - [1024, 4096, 1, 3325, 1024, 1024, 1024, 4096] + - [1, 12343.0] + - - [1024, 4096, 1, 3408, 1024, 1024, 1024, 4096] + - [4, 12357.0] + - - [4096, 1024, 1, 3394, 4096, 4096, 4096, 1024] + - [19, 12344.0] + - - [1024, 4096, 1, 3573, 1024, 1024, 1024, 4096] + - [4, 12351.0] + - - [4096, 1024, 1, 3386, 4096, 4096, 4096, 1024] + - [4, 12344.0] + - - [4096, 1024, 1, 3540, 4096, 4096, 4096, 1024] + - [19, 12346.0] + - - [1024, 4096, 1, 3182, 1024, 1024, 1024, 4096] + - [16, 12342.0] + - - [1024, 4096, 1, 3430, 1024, 1024, 1024, 4096] + - [16, 12346.0] + - - [1024, 4096, 1, 3236, 1024, 1024, 1024, 4096] + - [1, 12344.0] + - - [4096, 1024, 1, 2977, 4096, 4096, 4096, 1024] + - [1, 12335.0] + - - [1024, 4096, 1, 3355, 1024, 1024, 1024, 4096] + - [4, 12351.0] + - - [4096, 1024, 1, 3139, 4096, 4096, 4096, 1024] + - [16, 12343.0] + - - [4096, 1024, 1, 3516, 4096, 4096, 4096, 1024] + - [19, 12360.0] + - - [4096, 1024, 1, 3368, 4096, 4096, 4096, 1024] + - [27, 12347.0] + - - [4096, 1024, 1, 3559, 4096, 4096, 4096, 1024] + - [35, 12342.0] + - - [1024, 4096, 1, 3506, 1024, 1024, 1024, 4096] + - [16, 12351.0] + - - [1024, 4096, 1, 3145, 1024, 1024, 1024, 4096] + - [16, 12343.0] + - - [1024, 4096, 1, 3369, 1024, 1024, 1024, 4096] + - [19, 12347.0] + - - [4096, 1024, 1, 3522, 4096, 4096, 4096, 1024] + - [19, 12347.0] + - - [1024, 33708, 1, 3894, 1024, 1024, 1024, 33708] + - [4, 12731.0] + - - [4096, 1024, 1, 3336, 4096, 4096, 4096, 1024] + - [19, 12344.0] + - - [1024, 4096, 1, 3382, 1024, 1024, 1024, 4096] + - [4, 12348.0] + - - [4096, 1024, 1, 3533, 4096, 4096, 4096, 1024] + - [19, 12343.0] + - - [4096, 1024, 1, 4050, 4096, 4096, 4096, 1024] + - [19, 12364.0] + - - [4096, 1024, 1, 3480, 4096, 4096, 4096, 1024] + - [19, 12343.0] + - - [1024, 4096, 1, 3344, 1024, 1024, 1024, 4096] + - [16, 12349.0] + - - [1024, 4096, 1, 3509, 1024, 1024, 1024, 4096] + - [4, 12347.0] + - - [1024, 4096, 1, 3956, 1024, 1024, 1024, 4096] + - [4, 12352.0] + - - [4096, 1024, 1, 3616, 4096, 4096, 4096, 1024] + - [19, 12353.0] + - - [1024, 4096, 1, 3366, 1024, 1024, 1024, 4096] + - [4, 12341.0] + - - [4096, 1024, 1, 2935, 4096, 4096, 4096, 1024] + - [16, 12335.0] + - - [4096, 1024, 1, 3393, 4096, 4096, 4096, 1024] + - [19, 12349.0] + - - [4096, 1024, 1, 3547, 4096, 4096, 4096, 1024] + - [1, 12344.0] + - - [1024, 4096, 1, 3499, 1024, 1024, 1024, 4096] + - [4, 12345.0] + - - [4096, 1024, 1, 3357, 4096, 4096, 4096, 1024] + - [19, 12342.0] + - - [4096, 1024, 1, 3272, 4096, 4096, 4096, 1024] + - [1, 12342.0] + - - [4096, 1024, 1, 3207, 4096, 4096, 4096, 1024] + - [4, 12338.0] + - - [4096, 1024, 1, 3894, 4096, 4096, 4096, 1024] + - [4, 12349.0] + - - [1024, 4096, 1, 3444, 1024, 1024, 1024, 4096] + - [16, 12345.0] + - - [4096, 1024, 1, 3561, 4096, 4096, 4096, 1024] + - [19, 12347.0] + - - [4096, 1024, 1, 3376, 4096, 4096, 4096, 1024] + - [4, 12346.0] + - - [1024, 4096, 1, 3458, 1024, 1024, 1024, 4096] + - [19, 12349.0] + - - [4096, 1024, 1, 3231, 4096, 4096, 4096, 1024] + - [19, 12335.0] + - - [1024, 4096, 1, 3505, 1024, 1024, 1024, 4096] + - [4, 12345.0] + - - [4096, 1024, 1, 3277, 4096, 4096, 4096, 1024] + - [19, 12336.0] + - - [1024, 4096, 1, 3391, 1024, 1024, 1024, 4096] + - [16, 12342.0] + - - [1024, 4096, 1, 3536, 1024, 1024, 1024, 4096] + - [16, 12355.0] + - - [1024, 4096, 1, 3063, 1024, 1024, 1024, 4096] + - [1, 12335.0] + - - [1024, 4096, 1, 3189, 1024, 1024, 1024, 4096] + - [16, 12338.0] + - - [1024, 4096, 1, 2505, 1024, 1024, 1024, 4096] + - [1, 12321.0] + - - [4096, 1024, 1, 3454, 4096, 4096, 4096, 1024] + - [19, 12348.0] + - - [1024, 4096, 1, 3405, 1024, 1024, 1024, 4096] + - [4, 12342.0] + - - [1024, 33708, 1, 4050, 1024, 1024, 1024, 33708] + - [4, 12730.0] + - - [4096, 1024, 1, 3520, 4096, 4096, 4096, 1024] + - [19, 12350.0] + - - [1024, 4096, 1, 3487, 1024, 1024, 1024, 4096] + - [4, 12343.0] + - - [1024, 4096, 1, 3558, 1024, 1024, 1024, 4096] + - [19, 12344.0] + - - [4096, 1024, 1, 3297, 4096, 4096, 4096, 1024] + - [19, 12338.0] + - - [1024, 4096, 1, 3483, 1024, 1024, 1024, 4096] + - [16, 12344.0] + - - [1024, 33708, 1, 3751, 1024, 1024, 1024, 33708] + - [19, 12730.0] + - - [4096, 1024, 1, 3380, 4096, 4096, 4096, 1024] + - [19, 12347.0] + - - [1024, 4096, 1, 3380, 1024, 1024, 1024, 4096] + - [4, 12343.0] + - - [1024, 4096, 1, 3396, 1024, 1024, 1024, 4096] + - [4, 12347.0] + - - [1024, 4096, 1, 3497, 1024, 1024, 1024, 4096] + - [4, 12351.0] + - - [1024, 4096, 1, 3502, 1024, 1024, 1024, 4096] + - [16, 12346.0] + - - [1024, 4096, 1, 3138, 1024, 1024, 1024, 4096] + - [19, 12340.0] + - - [4096, 1024, 1, 3939, 4096, 4096, 4096, 1024] + - [19, 12354.0] + - - [1024, 4096, 1, 3303, 1024, 1024, 1024, 4096] + - [19, 12342.0] + - - [1024, 4096, 1, 3418, 1024, 1024, 1024, 4096] + - [4, 12345.0] + - - [1024, 4096, 1, 3224, 1024, 1024, 1024, 4096] + - [19, 12348.0] + - - [4096, 1024, 1, 3978, 4096, 4096, 4096, 1024] + - [4, 12350.0] + - - [1024, 4096, 1, 3472, 1024, 1024, 1024, 4096] + - [4, 12349.0] + - - [4096, 1024, 1, 3353, 4096, 4096, 4096, 1024] + - [19, 12340.0] + - - [4096, 1024, 1, 3362, 4096, 4096, 4096, 1024] + - [19, 12343.0] + - - [1024, 33708, 1, 3978, 1024, 1024, 1024, 33708] + - [19, 12731.0] + - - [1024, 4096, 1, 3432, 1024, 1024, 1024, 4096] + - [16, 12346.0] + - - [1024, 4096, 1, 3139, 1024, 1024, 1024, 4096] + - [4, 12346.0] + - - [1024, 4096, 1, 3341, 1024, 1024, 1024, 4096] + - [12, 12342.0] + - - [1024, 4096, 1, 3494, 1024, 1024, 1024, 4096] + - [19, 12347.0] + - - [1024, 4096, 1, 3969, 1024, 1024, 1024, 4096] + - [4, 12355.0] + - - [1024, 4096, 1, 3163, 1024, 1024, 1024, 4096] + - [16, 12338.0] + - - [4096, 1024, 1, 3405, 4096, 4096, 4096, 1024] + - [4, 12344.0] + - - [4096, 1024, 1, 3453, 4096, 4096, 4096, 1024] + - [35, 12352.0] + - - [1024, 4096, 1, 3411, 1024, 1024, 1024, 4096] + - [19, 12346.0] + - - [1024, 4096, 1, 3527, 1024, 1024, 1024, 4096] + - [4, 12346.0] + - - [4096, 1024, 1, 3474, 4096, 4096, 4096, 1024] + - [19, 12344.0] + - - [1024, 4096, 1, 3572, 1024, 1024, 1024, 4096] + - [16, 12346.0] + - - [4096, 1024, 1, 3293, 4096, 4096, 4096, 1024] + - [19, 12338.0] + - - [4096, 1024, 1, 3247, 4096, 4096, 4096, 1024] + - [12, 12339.0] + - - [1024, 4096, 1, 3425, 1024, 1024, 1024, 4096] + - [4, 12343.0] + - - [1024, 4096, 1, 3354, 1024, 1024, 1024, 4096] + - [4, 12342.0] + - - [4096, 1024, 1, 3382, 4096, 4096, 4096, 1024] + - [19, 12344.0] + - - [4096, 1024, 1, 3236, 4096, 4096, 4096, 1024] + - [35, 12348.0] + - - [1024, 4096, 1, 3519, 1024, 1024, 1024, 4096] + - [4, 12345.0] + - - [4096, 1024, 1, 3354, 4096, 4096, 4096, 1024] + - [19, 12340.0] + - - [4096, 1024, 1, 3501, 4096, 4096, 4096, 1024] + - [19, 12343.0] + - - [4096, 1024, 1, 3266, 4096, 4096, 4096, 1024] + - [1, 12337.0] + - - [1024, 4096, 1, 3368, 1024, 1024, 1024, 4096] + - [16, 12352.0] + - - [1024, 4096, 1, 4030, 1024, 1024, 1024, 4096] + - [19, 12356.0] + - - [1024, 4096, 1, 3533, 1024, 1024, 1024, 4096] + - [4, 12347.0] + - - [4096, 1024, 1, 3332, 4096, 4096, 4096, 1024] + - [4, 12340.0] + - - [4096, 1024, 1, 3584, 4096, 4096, 4096, 1024] + - [4, 12349.0] + - - [1024, 4096, 1, 3616, 1024, 1024, 1024, 4096] + - [4, 12358.0] + - - [4096, 1024, 1, 3265, 4096, 4096, 4096, 1024] + - [19, 12346.0] + - - [4096, 1024, 1, 3361, 4096, 4096, 4096, 1024] + - [35, 12356.0] + - - [4096, 1024, 1, 3467, 4096, 4096, 4096, 1024] + - [1, 12341.0] + - - [1024, 4096, 1, 3454, 1024, 1024, 1024, 4096] + - [4, 12348.0] + - - [1024, 4096, 1, 3101, 1024, 1024, 1024, 4096] + - [19, 12338.0] + - - [1024, 4096, 1, 3508, 1024, 1024, 1024, 4096] + - [4, 12354.0] + - - [4096, 1024, 1, 3267, 4096, 4096, 4096, 1024] + - [19, 12342.0] + - - [4096, 1024, 1, 3419, 4096, 4096, 4096, 1024] + - [16, 12343.0] + - - [4096, 1024, 1, 3822, 4096, 4096, 4096, 1024] + - [4, 12345.0] + - - [1024, 4096, 1, 3266, 1024, 1024, 1024, 4096] + - [4, 12342.0] + - - [4096, 1024, 1, 3440, 4096, 4096, 4096, 1024] + - [4, 12347.0] + - - [1024, 4096, 1, 3361, 1024, 1024, 1024, 4096] + - [4, 12346.0] + - - [1024, 4096, 1, 3546, 1024, 1024, 1024, 4096] + - [4, 12346.0] + - - [4096, 1024, 1, 3473, 4096, 4096, 4096, 1024] + - [19, 12349.0] + - - [4096, 1024, 1, 3546, 4096, 4096, 4096, 1024] + - [4, 12345.0] + - - [1024, 4096, 1, 3088, 1024, 1024, 1024, 4096] + - [16, 12342.0] + - - [1024, 4096, 1, 3535, 1024, 1024, 1024, 4096] + - [4, 12350.0] + - - [1024, 4096, 1, 3447, 1024, 1024, 1024, 4096] + - [1, 12347.0] + - - [1024, 4096, 1, 3560, 1024, 1024, 1024, 4096] + - [4, 12347.0] + - - [1024, 4096, 1, 3422, 1024, 1024, 1024, 4096] + - [16, 12342.0] + - - [1024, 4096, 1, 3469, 1024, 1024, 1024, 4096] + - [19, 12347.0] + - - [4096, 1024, 1, 3488, 4096, 4096, 4096, 1024] + - [19, 12354.0] + - - [1024, 4096, 1, 3110, 1024, 1024, 1024, 4096] + - [4, 12335.0] + - - [1024, 4096, 1, 3265, 1024, 1024, 1024, 4096] + - [16, 12345.0] + - - [1024, 4096, 1, 3291, 1024, 1024, 1024, 4096] + - [16, 12339.0] + - - [1024, 4096, 1, 3390, 1024, 1024, 1024, 4096] + - [4, 12348.0] + - - [4096, 1024, 1, 3046, 4096, 4096, 4096, 1024] + - [16, 12335.0] + - - [1024, 4096, 1, 3539, 1024, 1024, 1024, 4096] + - [16, 12344.0] + - - [4096, 1024, 1, 3221, 4096, 4096, 4096, 1024] + - [19, 12336.0] + - - [4096, 1024, 1, 3433, 4096, 4096, 4096, 1024] + - [4, 12341.0] + - - [4096, 1024, 1, 3364, 4096, 4096, 4096, 1024] + - [35, 12351.0] + - - [4096, 1024, 1, 3470, 4096, 4096, 4096, 1024] + - [4, 12342.0] + - - [1024, 4096, 1, 3404, 1024, 1024, 1024, 4096] + - [16, 12340.0] + - - [1024, 33708, 1, 3968, 1024, 1024, 1024, 33708] + - [19, 12737.0] + - - [4096, 1024, 1, 3088, 4096, 4096, 4096, 1024] + - [16, 12352.0] + - - [1024, 4096, 1, 3247, 1024, 1024, 1024, 4096] + - [4, 12343.0] + - - [1024, 33708, 1, 3996, 1024, 1024, 1024, 33708] + - [4, 12731.0] + - - [4096, 1024, 1, 3482, 4096, 4096, 4096, 1024] + - [19, 12341.0] + - - [4096, 1024, 1, 3995, 4096, 4096, 4096, 1024] + - [19, 12354.0] + - - [1024, 4096, 1, 3280, 1024, 1024, 1024, 4096] + - [4, 12346.0] + - - [4096, 1024, 1, 3271, 4096, 4096, 4096, 1024] + - [19, 12347.0] + - - [4096, 1024, 1, 3545, 4096, 4096, 4096, 1024] + - [4, 12347.0] + - - [4096, 1024, 1, 3476, 4096, 4096, 4096, 1024] + - [19, 12344.0] + - - [4096, 1024, 1, 3496, 4096, 4096, 4096, 1024] + - [1, 12340.0] + - - [4096, 1024, 1, 3191, 4096, 4096, 4096, 1024] + - [19, 12342.0] + - - [4096, 1024, 1, 3311, 4096, 4096, 4096, 1024] + - [19, 12340.0] + - - [1024, 4096, 1, 3302, 1024, 1024, 1024, 4096] + - [16, 12342.0] + - - [1024, 4096, 1, 3681, 1024, 1024, 1024, 4096] + - [4, 12350.0] + - - [4096, 1024, 1, 3582, 4096, 4096, 4096, 1024] + - [19, 12341.0] + - - [4096, 1024, 1, 3421, 4096, 4096, 4096, 1024] + - [1, 12344.0] + - - [4096, 1024, 1, 3560, 4096, 4096, 4096, 1024] + - [16, 12347.0] + - - [1024, 4096, 1, 3495, 1024, 1024, 1024, 4096] + - [19, 12343.0] + - - [4096, 1024, 1, 3186, 4096, 4096, 4096, 1024] + - [16, 12338.0] + - - [4096, 1024, 1, 3925, 4096, 4096, 4096, 1024] + - [19, 12353.0] + - - [1024, 4096, 1, 3435, 1024, 1024, 1024, 4096] + - [16, 12343.0] + - - [4096, 1024, 1, 3434, 4096, 4096, 4096, 1024] + - [16, 12342.0] + - - [1024, 33708, 1, 4012, 1024, 1024, 1024, 33708] + - [19, 12729.0] + - - [1024, 4096, 1, 3340, 1024, 1024, 1024, 4096] + - [16, 12338.0] + - - [4096, 1024, 1, 3489, 4096, 4096, 4096, 1024] + - [4, 12341.0] + - - [1024, 4096, 1, 3162, 1024, 1024, 1024, 4096] + - [16, 12338.0] + - - [4096, 1024, 1, 3436, 4096, 4096, 4096, 1024] + - [19, 12336.0] + - - [4096, 1024, 1, 3574, 4096, 4096, 4096, 1024] + - [19, 12348.0] + - - [4096, 1024, 1, 3469, 4096, 4096, 4096, 1024] + - [19, 12345.0] + - - [1024, 4096, 1, 3410, 1024, 1024, 1024, 4096] + - [4, 12348.0] + - - [1024, 4096, 1, 3216, 1024, 1024, 1024, 4096] + - [16, 12356.0] + - - [4096, 1024, 1, 3095, 4096, 4096, 4096, 1024] + - [16, 12341.0] + - - [4096, 1024, 1, 3448, 4096, 4096, 4096, 1024] + - [19, 12345.0] + - - [1024, 4096, 1, 3176, 1024, 1024, 1024, 4096] + - [16, 12341.0] + - - [4096, 1024, 1, 2918, 4096, 4096, 4096, 1024] + - [16, 12332.0] + - - [1024, 4096, 1, 3424, 1024, 1024, 1024, 4096] + - [4, 12350.0] + - - [4096, 1024, 1, 3402, 4096, 4096, 4096, 1024] + - [4, 12339.0] + - - [4096, 1024, 1, 3145, 4096, 4096, 4096, 1024] + - [19, 12351.0] + - - [1024, 33708, 1, 3976, 1024, 1024, 1024, 33708] + - [4, 12731.0] + - - [4096, 1024, 1, 3518, 4096, 4096, 4096, 1024] + - [19, 12347.0] + - - [4096, 1024, 1, 3110, 4096, 4096, 4096, 1024] + - [27, 12339.0] + - - [4096, 1024, 1, 3325, 4096, 4096, 4096, 1024] + - [35, 12350.0] + - - [1024, 33708, 1, 3999, 1024, 1024, 1024, 33708] + - [19, 12732.0] + - - [4096, 1024, 1, 2985, 4096, 4096, 4096, 1024] + - [16, 12335.0] + - - [1024, 4096, 1, 3371, 1024, 1024, 1024, 4096] + - [4, 12344.0] + - - [4096, 1024, 1, 3342, 4096, 4096, 4096, 1024] + - [19, 12345.0] + - - [4096, 1024, 1, 3141, 4096, 4096, 4096, 1024] + - [4, 12338.0] + - - [4096, 1024, 1, 3532, 4096, 4096, 4096, 1024] + - [19, 12341.0] + - - [1024, 4096, 1, 3169, 1024, 1024, 1024, 4096] + - [4, 12344.0] + - - [1024, 4096, 1, 3514, 1024, 1024, 1024, 4096] + - [16, 12345.0] + - - [4096, 1024, 1, 3780, 4096, 4096, 4096, 1024] + - [19, 12351.0] + - - [1024, 4096, 1, 3098, 1024, 1024, 1024, 4096] + - [16, 12339.0] + - - [1024, 4096, 1, 3449, 1024, 1024, 1024, 4096] + - [19, 12344.0] + - - [1024, 4096, 1, 3222, 1024, 1024, 1024, 4096] + - [16, 12340.0] + - - [1024, 4096, 1, 3346, 1024, 1024, 1024, 4096] + - [4, 12357.0] + - - [4096, 1024, 1, 3064, 4096, 4096, 4096, 1024] + - [1, 12342.0] + - - [4096, 1024, 1, 3511, 4096, 4096, 4096, 1024] + - [19, 12356.0] + - - [4096, 1024, 1, 3384, 4096, 4096, 4096, 1024] + - [16, 12346.0] + - - [4096, 1024, 1, 3356, 4096, 4096, 4096, 1024] + - [27, 12343.0] + - - [1024, 4096, 1, 3796, 1024, 1024, 1024, 4096] + - [4, 12348.0] + - - [4096, 1024, 1, 3427, 4096, 4096, 4096, 1024] + - [19, 12343.0] + - - [4096, 1024, 1, 3390, 4096, 4096, 4096, 1024] + - [19, 12342.0] + - - [4096, 1024, 1, 3573, 4096, 4096, 4096, 1024] + - [4, 12347.0] + - - [4096, 1024, 1, 3456, 4096, 4096, 4096, 1024] + - [19, 12351.0] + - - [1024, 4096, 1, 3360, 1024, 1024, 1024, 4096] + - [19, 12351.0] + - - [1024, 33708, 1, 3977, 1024, 1024, 1024, 33708] + - [4, 12732.0] + - - [1024, 4096, 1, 2918, 1024, 1024, 1024, 4096] + - [1, 12331.0] + - - [4096, 1024, 1, 3975, 4096, 4096, 4096, 1024] + - [19, 12350.0] + - - [4096, 1024, 1, 3525, 4096, 4096, 4096, 1024] + - [19, 12349.0] + - - [4096, 1024, 1, 3398, 4096, 4096, 4096, 1024] + - [16, 12343.0] + - - [4096, 1024, 1, 3640, 4096, 4096, 4096, 1024] + - [19, 12357.0] + - - [4096, 1024, 1, 3014, 4096, 4096, 4096, 1024] + - [16, 12337.0] + - - [1024, 4096, 1, 3446, 1024, 1024, 1024, 4096] + - [19, 12343.0] + - - [1024, 33708, 1, 3796, 1024, 1024, 1024, 33708] + - [4, 12729.0] + - - [4096, 1024, 1, 3101, 4096, 4096, 4096, 1024] + - [16, 12340.0] + - - [4096, 1024, 1, 3563, 4096, 4096, 4096, 1024] + - [19, 12348.0] + - - [4096, 1024, 1, 3539, 4096, 4096, 4096, 1024] + - [19, 12348.0] + - - [4096, 1024, 1, 3182, 4096, 4096, 4096, 1024] + - [16, 12338.0] + - - [1024, 4096, 1, 3468, 1024, 1024, 1024, 4096] + - [16, 12344.0] + - - [4096, 1024, 1, 3312, 4096, 4096, 4096, 1024] + - [19, 12346.0] + - - [4096, 1024, 1, 3215, 4096, 4096, 4096, 1024] + - [19, 12338.0] + - - [4096, 1024, 1, 3910, 4096, 4096, 4096, 1024] + - [19, 12355.0] + - - [1024, 33708, 1, 3780, 1024, 1024, 1024, 33708] + - [19, 12732.0] + - - [1024, 4096, 1, 3290, 1024, 1024, 1024, 4096] + - [19, 12350.0] + - - [1024, 4096, 1, 4012, 1024, 1024, 1024, 4096] + - [4, 12354.0] + - - [1024, 4096, 1, 3385, 1024, 1024, 1024, 4096] + - [4, 12342.0] + - - [1024, 33708, 1, 3975, 1024, 1024, 1024, 33708] + - [4, 12731.0] + - - [4096, 1024, 1, 3996, 4096, 4096, 4096, 1024] + - [19, 12350.0] + - - [4096, 1024, 1, 2765, 4096, 4096, 4096, 1024] + - [16, 12324.0] + - - [4096, 1024, 1, 3538, 4096, 4096, 4096, 1024] + - [19, 12355.0] + - - [4096, 1024, 1, 3415, 4096, 4096, 4096, 1024] + - [19, 12352.0] + - - [1024, 4096, 1, 3554, 1024, 1024, 1024, 4096] + - [19, 12351.0] + - - [4096, 1024, 1, 3513, 4096, 4096, 4096, 1024] + - [16, 12343.0] + - - [1024, 4096, 1, 3304, 1024, 1024, 1024, 4096] + - [4, 12346.0] + - - [4096, 1024, 1, 3294, 4096, 4096, 4096, 1024] + - [16, 12340.0] + - - [4096, 1024, 1, 3396, 4096, 4096, 4096, 1024] + - [1, 12344.0] + - - [1024, 4096, 1, 3213, 1024, 1024, 1024, 4096] + - [19, 12336.0] + - - [4096, 1024, 1, 3137, 4096, 4096, 4096, 1024] + - [16, 12342.0] + - - [4096, 1024, 1, 3552, 4096, 4096, 4096, 1024] + - [19, 12369.0] + - - [1024, 4096, 1, 3461, 1024, 1024, 1024, 4096] + - [19, 12344.0] + - - [4096, 1024, 1, 3263, 4096, 4096, 4096, 1024] + - [16, 12341.0] + - - [4096, 1024, 1, 3430, 4096, 4096, 4096, 1024] + - [19, 12344.0] + - - [4096, 1024, 1, 3389, 4096, 4096, 4096, 1024] + - [16, 12343.0] + - - [4096, 1024, 1, 3528, 4096, 4096, 4096, 1024] + - [1, 12345.0] + - - [1024, 4096, 1, 3463, 1024, 1024, 1024, 4096] + - [4, 12344.0] + - - [4096, 1024, 1, 3526, 4096, 4096, 4096, 1024] + - [19, 12347.0] + - - [4096, 1024, 1, 3154, 4096, 4096, 4096, 1024] + - [16, 12342.0] + - - [4096, 1024, 1, 3499, 4096, 4096, 4096, 1024] + - [19, 12351.0] + - - [4096, 1024, 1, 3955, 4096, 4096, 4096, 1024] + - [19, 12352.0] + - - [1024, 4096, 1, 3297, 1024, 1024, 1024, 4096] + - [4, 12338.0] + - - [1024, 4096, 1, 3233, 1024, 1024, 1024, 4096] + - [16, 12343.0] + - - [1024, 4096, 1, 3226, 1024, 1024, 1024, 4096] + - [27, 12344.0] + - - [4096, 1024, 1, 3404, 4096, 4096, 4096, 1024] + - [19, 12341.0] + - - [4096, 1024, 1, 3355, 4096, 4096, 4096, 1024] + - [1, 12339.0] + - - [1024, 4096, 1, 3542, 1024, 1024, 1024, 4096] + - [4, 12349.0] + - - [4096, 1024, 1, 3181, 4096, 4096, 4096, 1024] + - [16, 12339.0] + - - [1024, 4096, 1, 3474, 1024, 1024, 1024, 4096] + - [19, 12349.0] + - - [4096, 1024, 1, 3319, 4096, 4096, 4096, 1024] + - [4, 12340.0] + - - [1024, 4096, 1, 3434, 1024, 1024, 1024, 4096] + - [4, 12344.0] + - - [1024, 4096, 1, 3860, 1024, 1024, 1024, 4096] + - [4, 12350.0] + - - [1024, 4096, 1, 3343, 1024, 1024, 1024, 4096] + - [4, 12343.0] + - - [1024, 4096, 1, 3488, 1024, 1024, 1024, 4096] + - [19, 12352.0] + - - [1024, 4096, 1, 3046, 1024, 1024, 1024, 4096] + - [16, 12337.0] + - - [1024, 4096, 1, 3141, 1024, 1024, 1024, 4096] + - [19, 12334.0] + - - [1024, 4096, 1, 3516, 1024, 1024, 1024, 4096] + - [1, 12344.0] + - - [4096, 1024, 1, 3147, 4096, 4096, 4096, 1024] + - [16, 12345.0] + - - [1024, 4096, 1, 3421, 1024, 1024, 1024, 4096] + - [4, 12343.0] + - - [4096, 1024, 1, 3944, 4096, 4096, 4096, 1024] + - [19, 12350.0] + - - [1024, 4096, 1, 3574, 1024, 1024, 1024, 4096] + - [4, 12349.0] + - - [1024, 4096, 1, 3977, 1024, 1024, 1024, 4096] + - [19, 12358.0] + - - [1024, 4096, 1, 2985, 1024, 1024, 1024, 4096] + - [1, 12341.0] + - - [1024, 4096, 1, 3427, 1024, 1024, 1024, 4096] + - [27, 12350.0] + - - [1024, 4096, 1, 3482, 1024, 1024, 1024, 4096] + - [1, 12347.0] + - - [1024, 4096, 1, 3332, 1024, 1024, 1024, 4096] + - [4, 12343.0] + - - [4096, 1024, 1, 3308, 4096, 4096, 4096, 1024] + - [19, 12342.0] + - - [1024, 4096, 1, 3513, 1024, 1024, 1024, 4096] + - [4, 12345.0] + - - [1024, 4096, 1, 3154, 1024, 1024, 1024, 4096] + - [4, 12351.0] + - - [1024, 4096, 1, 3955, 1024, 1024, 1024, 4096] + - [4, 12356.0] + - - [1024, 4096, 1, 2967, 1024, 1024, 1024, 4096] + - [1, 12336.0] + - - [1024, 33708, 1, 3942, 1024, 1024, 1024, 33708] + - [19, 12731.0] + - - [1024, 4096, 1, 3319, 1024, 1024, 1024, 4096] + - [4, 12343.0] + - - [4096, 1024, 1, 3860, 4096, 4096, 4096, 1024] + - [19, 12351.0] + - - [1024, 4096, 1, 3548, 1024, 1024, 1024, 4096] + - [4, 12349.0] + - - [4096, 1024, 1, 3977, 4096, 4096, 4096, 1024] + - [19, 12352.0] + - - [4096, 1024, 1, 3535, 4096, 4096, 4096, 1024] + - [19, 12342.0] + - - [1024, 4096, 1, 3541, 1024, 1024, 1024, 4096] + - [4, 12344.0] + - - [1024, 33708, 1, 3584, 1024, 1024, 1024, 33708] + - [4, 12731.0] + - - [1024, 4096, 1, 3168, 1024, 1024, 1024, 4096] + - [16, 12343.0] + - - [1024, 4096, 1, 3448, 1024, 1024, 1024, 4096] + - [4, 12348.0] + - - [4096, 1024, 1, 3343, 4096, 4096, 4096, 1024] + - [16, 12339.0] + - - [1024, 4096, 1, 3357, 1024, 1024, 1024, 4096] + - [19, 12339.0] + - - [4096, 1024, 1, 3510, 4096, 4096, 4096, 1024] + - [19, 12348.0] + - - [4096, 1024, 1, 3369, 4096, 4096, 4096, 1024] + - [19, 12342.0] + - - [4096, 1024, 1, 3379, 4096, 4096, 4096, 1024] + - [19, 12343.0] + - - [1024, 4096, 1, 3276, 1024, 1024, 1024, 4096] + - [4, 12340.0] + - - [1024, 4096, 1, 3363, 1024, 1024, 1024, 4096] + - [16, 12350.0] + - - [4096, 1024, 1, 3055, 4096, 4096, 4096, 1024] + - [1, 12339.0] + - - [1024, 4096, 1, 3524, 1024, 1024, 1024, 4096] + - [19, 12345.0] + - - [4096, 1024, 1, 3057, 4096, 4096, 4096, 1024] + - [16, 12343.0] + - - [1024, 33708, 1, 3720, 1024, 1024, 1024, 33708] + - [19, 12728.0] + - - [1024, 4096, 1, 3383, 1024, 1024, 1024, 4096] + - [16, 12344.0] + - - [1024, 4096, 1, 3522, 1024, 1024, 1024, 4096] + - [19, 12354.0] + - - [1024, 33708, 1, 3956, 1024, 1024, 1024, 33708] + - [4, 12731.0] + - - [1024, 4096, 1, 3481, 1024, 1024, 1024, 4096] + - [19, 12348.0] + - - [4096, 1024, 1, 3562, 4096, 4096, 4096, 1024] + - [19, 12347.0] + - - [4096, 1024, 1, 3299, 4096, 4096, 4096, 1024] + - [35, 12354.0] + - - [1024, 4096, 1, 3262, 1024, 1024, 1024, 4096] + - [4, 12344.0] + - - [1024, 33708, 1, 4026, 1024, 1024, 1024, 33708] + - [4, 12734.0] + - - [4096, 1024, 1, 3168, 4096, 4096, 4096, 1024] + - [16, 12348.0] + - - [1024, 4096, 1, 3999, 1024, 1024, 1024, 4096] + - [19, 12350.0] + - - [1024, 4096, 1, 3549, 1024, 1024, 1024, 4096] + - [19, 12353.0] + - - [4096, 1024, 1, 3375, 4096, 4096, 4096, 1024] + - [19, 12345.0] + - - [1024, 4096, 1, 3496, 1024, 1024, 1024, 4096] + - [16, 12349.0] + - - [1024, 4096, 1, 3190, 1024, 1024, 1024, 4096] + - [16, 12341.0] + - - [4096, 1024, 1, 3273, 4096, 4096, 4096, 1024] + - [4, 12339.0] + - - [1024, 4096, 1, 3406, 1024, 1024, 1024, 4096] + - [4, 12342.0] + - - [4096, 1024, 1, 4005, 4096, 4096, 4096, 1024] + - [4, 12353.0] + - - [4096, 1024, 1, 3555, 4096, 4096, 4096, 1024] + - [19, 12351.0] + - - [4096, 1024, 1, 2505, 4096, 4096, 4096, 1024] + - [35, 12332.0] + - - [1024, 4096, 1, 3460, 1024, 1024, 1024, 4096] + - [19, 12346.0] + - - [1024, 4096, 1, 3579, 1024, 1024, 1024, 4096] + - [4, 12347.0] + - - [1024, 33708, 1, 4030, 1024, 1024, 1024, 33708] + - [4, 12734.0] + - - [1024, 4096, 1, 3510, 1024, 1024, 1024, 4096] + - [4, 12349.0] + - - [1024, 4096, 1, 3282, 1024, 1024, 1024, 4096] + - [1, 12347.0] + - - [1024, 4096, 1, 3377, 1024, 1024, 1024, 4096] + - [4, 12362.0] + - - [1024, 4096, 1, 2935, 1024, 1024, 1024, 4096] + - [1, 12333.0] + - - [1024, 4096, 1, 3498, 1024, 1024, 1024, 4096] + - [16, 12348.0] + - - [1024, 4096, 1, 3593, 1024, 1024, 1024, 4096] + - [16, 12347.0] + - - [4096, 1024, 1, 3226, 4096, 4096, 4096, 1024] + - [16, 12341.0] + - - [1024, 4096, 1, 2499, 1024, 1024, 1024, 4096] + - [16, 12325.0] + - - [1024, 4096, 1, 3296, 1024, 1024, 1024, 4096] + - [19, 12351.0] + - - [1024, 4096, 1, 3455, 1024, 1024, 1024, 4096] + - [4, 12355.0] + - - [1024, 4096, 1, 3399, 1024, 1024, 1024, 4096] + - [12, 12351.0] + - - [1024, 4096, 1, 3205, 1024, 1024, 1024, 4096] + - [16, 12352.0] + - - [4096, 1024, 1, 4026, 4096, 4096, 4096, 1024] + - [19, 12356.0] + - - [1024, 4096, 1, 3484, 1024, 1024, 1024, 4096] + - [16, 12353.0] + - - [4096, 1024, 1, 3302, 4096, 4096, 4096, 1024] + - [19, 12346.0] + - - [1024, 4096, 1, 3485, 1024, 1024, 1024, 4096] + - [4, 12347.0] + - - [1024, 4096, 1, 3126, 1024, 1024, 1024, 4096] + - [4, 12344.0] + - - [1024, 4096, 1, 4050, 1024, 1024, 1024, 4096] + - [19, 12355.0] + - - [4096, 1024, 1, 3235, 4096, 4096, 4096, 1024] + - [19, 12345.0] + - - [1024, 33708, 1, 3955, 1024, 1024, 1024, 33708] + - [4, 12733.0] + - - [1024, 4096, 1, 3342, 1024, 1024, 1024, 4096] + - [4, 12344.0] + - - [1024, 4096, 1, 3397, 1024, 1024, 1024, 4096] + - [4, 12347.0] + - - [4096, 1024, 1, 3491, 4096, 4096, 4096, 1024] + - [19, 12349.0] + - - [1024, 4096, 1, 3503, 1024, 1024, 1024, 4096] + - [4, 12340.0] + - - [1024, 4096, 1, 3140, 1024, 1024, 1024, 4096] + - [19, 12345.0] + - - [4096, 1024, 1, 3121, 4096, 4096, 4096, 1024] + - [4, 12334.0] + - - [4096, 1024, 1, 3276, 4096, 4096, 4096, 1024] + - [19, 12340.0] + - - [1024, 4096, 1, 3321, 1024, 1024, 1024, 4096] + - [4, 12346.0] + - - [1024, 4096, 1, 3870, 1024, 1024, 1024, 4096] + - [19, 12346.0] + - - [4096, 1024, 1, 3475, 4096, 4096, 4096, 1024] + - [19, 12346.0] + - - [1024, 4096, 1, 2984, 1024, 1024, 1024, 4096] + - [16, 12339.0] + - - [4096, 1024, 1, 3363, 4096, 4096, 4096, 1024] + - [4, 12345.0] + - - [1024, 4096, 1, 3582, 1024, 1024, 1024, 4096] + - [16, 12347.0] + - - [4096, 1024, 1, 3509, 4096, 4096, 4096, 1024] + - [19, 12352.0] + - - [1024, 4096, 1, 3426, 1024, 1024, 1024, 4096] + - [4, 12345.0] + - - [4096, 1024, 1, 3136, 4096, 4096, 4096, 1024] + - [16, 12355.0] + - - [1024, 4096, 1, 3232, 1024, 1024, 1024, 4096] + - [4, 12348.0] + - - [4096, 1024, 1, 3103, 4096, 4096, 4096, 1024] + - [16, 12341.0] + - - [1024, 4096, 1, 3335, 1024, 1024, 1024, 4096] + - [19, 12344.0] + - - [1024, 4096, 1, 3900, 1024, 1024, 1024, 4096] + - [19, 12352.0] + - - [4096, 1024, 1, 3512, 4096, 4096, 4096, 1024] + - [19, 12348.0] + - - [4096, 1024, 1, 3222, 4096, 4096, 4096, 1024] + - [19, 12341.0] + - - [1024, 4096, 1, 3165, 1024, 1024, 1024, 4096] + - [19, 12343.0] + - - [4096, 1024, 1, 3408, 4096, 4096, 4096, 1024] + - [4, 12346.0] + - - [4096, 1024, 1, 3751, 4096, 4096, 4096, 1024] + - [19, 12349.0] + - - [1024, 4096, 1, 3318, 1024, 1024, 1024, 4096] + - [19, 12339.0] + - - [4096, 1024, 1, 3442, 4096, 4096, 4096, 1024] + - [19, 12343.0] + - - [1024, 4096, 1, 3413, 1024, 1024, 1024, 4096] + - [4, 12346.0] + - - [4096, 1024, 1, 3524, 4096, 4096, 4096, 1024] + - [19, 12345.0] + - - [1024, 4096, 1, 3976, 1024, 1024, 1024, 4096] + - [19, 12353.0] + - - [1024, 4096, 1, 3475, 1024, 1024, 1024, 4096] + - [4, 12347.0] + - - [1024, 4096, 1, 3534, 1024, 1024, 1024, 4096] + - [19, 12346.0] + - - [4096, 1024, 1, 3301, 4096, 4096, 4096, 1024] + - [19, 12342.0] + - - [4096, 1024, 1, 3248, 4096, 4096, 4096, 1024] + - [12, 12352.0] + - - [1024, 4096, 1, 2977, 1024, 1024, 1024, 4096] + - [16, 12332.0] + - - [4096, 1024, 1, 3346, 4096, 4096, 4096, 1024] + - [16, 12342.0] + - - [1024, 4096, 1, 3451, 1024, 1024, 1024, 4096] + - [16, 12353.0] + - - [1024, 4096, 1, 3257, 1024, 1024, 1024, 4096] + - [16, 12341.0] + - - [1024, 4096, 1, 3356, 1024, 1024, 1024, 4096] + - [19, 12346.0] + - - [4096, 1024, 1, 3348, 4096, 4096, 4096, 1024] + - [16, 12341.0] + - - [4096, 1024, 1, 3335, 4096, 4096, 4096, 1024] + - [4, 12343.0] + - - [4096, 1024, 1, 3505, 4096, 4096, 4096, 1024] + - [19, 12348.0] + - - [1024, 4096, 1, 3490, 1024, 1024, 1024, 4096] + - [4, 12348.0] + - - [4096, 1024, 1, 3447, 4096, 4096, 4096, 1024] + - [19, 12350.0] + - - [1024, 4096, 1, 3267, 1024, 1024, 1024, 4096] + - [4, 12341.0] + - - [4096, 1024, 1, 3230, 4096, 4096, 4096, 1024] + - [19, 12344.0] + - - [4096, 1024, 1, 3455, 4096, 4096, 4096, 1024] + - [19, 12345.0] + - - [1024, 4096, 1, 3925, 1024, 1024, 1024, 4096] + - [4, 12356.0] + - - [1024, 4096, 1, 3362, 1024, 1024, 1024, 4096] + - [4, 12346.0] + - - [4096, 1024, 1, 3969, 4096, 4096, 4096, 1024] + - [19, 12355.0] + - - [4096, 1024, 1, 3527, 4096, 4096, 4096, 1024] + - [19, 12350.0] + - - [1024, 4096, 1, 3585, 1024, 1024, 1024, 4096] + - [19, 12348.0] + - - [4096, 1024, 1, 3063, 4096, 4096, 4096, 1024] + - [16, 12340.0] + - - [4096, 1024, 1, 3435, 4096, 4096, 4096, 1024] + - [4, 12345.0] + - - [4096, 1024, 1, 3366, 4096, 4096, 4096, 1024] + - [19, 12344.0] + - - [4096, 1024, 1, 3581, 4096, 4096, 4096, 1024] + - [19, 12360.0] + - - [1024, 33708, 1, 3906, 1024, 1024, 1024, 33708] + - [4, 12730.0] + - - [1024, 4096, 1, 3464, 1024, 1024, 1024, 4096] + - [16, 12349.0] + - - [1024, 4096, 1, 3440, 1024, 1024, 1024, 4096] + - [4, 12351.0] + - - [4096, 1024, 1, 3143, 4096, 4096, 4096, 1024] + - [16, 12333.0] + - - [1024, 4096, 1, 3349, 1024, 1024, 1024, 4096] + - [4, 12342.0] + - - [4096, 1024, 1, 3416, 4096, 4096, 4096, 1024] + - [19, 12347.0] + - - [4096, 1024, 1, 3365, 4096, 4096, 4096, 1024] + - [16, 12346.0] + - - [1024, 4096, 1, 3470, 1024, 1024, 1024, 4096] + - [19, 12346.0] + - - [4096, 1024, 1, 3287, 4096, 4096, 4096, 1024] + - [19, 12339.0] + - - [1024, 4096, 1, 3441, 1024, 1024, 1024, 4096] + - [19, 12348.0] + - - [4096, 1024, 1, 3224, 4096, 4096, 4096, 1024] + - [16, 12343.0] + - - [1024, 4096, 1, 3387, 1024, 1024, 1024, 4096] + - [4, 12342.0] + - - [1024, 4096, 1, 3547, 1024, 1024, 1024, 4096] + - [19, 12349.0] + - - [4096, 1024, 1, 3478, 4096, 4096, 4096, 1024] + - [4, 12344.0] + - - [4096, 1024, 1, 3548, 4096, 4096, 4096, 1024] + - [19, 12344.0] + - - [1024, 33708, 1, 4020, 1024, 1024, 1024, 33708] + - [4, 12731.0] + - - [4096, 1024, 1, 3320, 4096, 4096, 4096, 1024] + - [16, 12345.0] + - - [1024, 4096, 1, 3906, 1024, 1024, 1024, 4096] + - [4, 12353.0] + - - [4096, 1024, 1, 3796, 4096, 4096, 4096, 1024] + - [4, 12345.0] + - - [1024, 4096, 1, 3306, 1024, 1024, 1024, 4096] + - [4, 12336.0] + - - [1024, 4096, 1, 3401, 1024, 1024, 1024, 4096] + - [16, 12342.0] + - - [1024, 4096, 1, 3215, 1024, 1024, 1024, 4096] + - [16, 12336.0] + - - [4096, 1024, 1, 4012, 4096, 4096, 4096, 1024] + - [19, 12355.0] + - - [1024, 4096, 1, 2765, 1024, 1024, 1024, 4096] + - [19, 12326.0] + - - [4096, 1024, 1, 3554, 4096, 4096, 4096, 1024] + - [19, 12339.0] + - - [4096, 1024, 1, 3423, 4096, 4096, 4096, 1024] + - [19, 12341.0] + - - [1024, 4096, 1, 3562, 1024, 1024, 1024, 4096] + - [19, 12341.0] + - - [1024, 4096, 1, 3489, 1024, 1024, 1024, 4096] + - [16, 12341.0] + - - [4096, 1024, 1, 3358, 4096, 4096, 4096, 1024] + - [19, 12342.0] + - - [4096, 1024, 1, 3270, 4096, 4096, 4096, 1024] + - [16, 12346.0] + - - [1024, 4096, 1, 3293, 1024, 1024, 1024, 4096] + - [1, 12337.0] + - - [1024, 4096, 1, 3376, 1024, 1024, 1024, 4096] + - [1, 12348.0] + - - [4096, 1024, 1, 3245, 4096, 4096, 4096, 1024] + - [4, 12339.0] + - - [4096, 1024, 1, 3541, 4096, 4096, 4096, 1024] + - [19, 12344.0] + - - [4096, 1024, 1, 3443, 4096, 4096, 4096, 1024] + - [27, 12350.0] + - - [4096, 1024, 1, 3438, 4096, 4096, 4096, 1024] + - [1, 12340.0] + - - [4096, 1024, 1, 3244, 4096, 4096, 4096, 1024] + - [35, 12353.0] + - - [1024, 4096, 1, 3365, 1024, 1024, 1024, 4096] + - [19, 12344.0] + - - [1024, 4096, 1, 3299, 1024, 1024, 1024, 4096] + - [16, 12339.0] + - - [1024, 4096, 1, 3471, 1024, 1024, 1024, 4096] + - [4, 12344.0] + - - [1024, 4096, 1, 3398, 1024, 1024, 1024, 4096] + - [16, 12343.0] + - - [4096, 1024, 1, 3162, 4096, 4096, 4096, 1024] + - [19, 12343.0] + - - [1024, 4096, 1, 4005, 1024, 1024, 1024, 4096] + - [19, 12358.0] + - - [4096, 1024, 1, 3579, 4096, 4096, 4096, 1024] + - [19, 12351.0] + - - [1024, 4096, 1, 3121, 1024, 1024, 1024, 4096] + - [4, 12336.0] + - - [4096, 1024, 1, 3441, 4096, 4096, 4096, 1024] + - [19, 12347.0] + - - [4096, 1024, 1, 3422, 4096, 4096, 4096, 1024] + - [12, 12346.0] + - - [4096, 1024, 1, 3444, 4096, 4096, 4096, 1024] + - [27, 12349.0] + - - [1024, 4096, 1, 3337, 1024, 1024, 1024, 4096] + - [4, 12343.0] + - - [4096, 1024, 1, 3550, 4096, 4096, 4096, 1024] + - [19, 12348.0] + - - [1024, 4096, 1, 3477, 1024, 1024, 1024, 4096] + - [16, 12346.0] + - - [4096, 1024, 1, 3490, 4096, 4096, 4096, 1024] + - [19, 12347.0] + - - [4096, 1024, 1, 3585, 4096, 4096, 4096, 1024] + - [4, 12343.0] + - - [1024, 4096, 1, 3143, 1024, 1024, 1024, 4096] + - [16, 12338.0] + - - [1024, 33708, 1, 3876, 1024, 1024, 1024, 33708] + - [4, 12733.0] + - - [1024, 4096, 1, 3320, 1024, 1024, 1024, 4096] + - [16, 12345.0] + - - [1024, 4096, 1, 3423, 1024, 1024, 1024, 4096] + - [16, 12360.0] + - - [1024, 4096, 1, 3894, 1024, 1024, 1024, 4096] + - [19, 12357.0] + - - [4096, 1024, 1, 3410, 4096, 4096, 4096, 1024] + - [19, 12344.0] + - - [1024, 4096, 1, 3561, 1024, 1024, 1024, 4096] + - [4, 12344.0] + - - [4096, 1024, 1, 3492, 4096, 4096, 4096, 1024] + - [19, 12345.0] + - - [36548, 1024, 1, 3712, 36548, 36548, 36548, 1024] + - [35, 12722.0] + - - [4096, 2048, 1, 128, 4096, 4096, 4096, 2048] + - [30, 11561.0] + - - [4096, 3072, 1, 128, 4096, 4096, 4096, 3072] + - [4, 11874.0] + - - [768, 3072, 1, 4096, 768, 768, 768, 3072] + - [15, 11543.0] + - - [768, 30522, 1, 1280, 768, 768, 768, 30522] + - [19, 12665.0] + - - [768, 30522, 1, 320, 768, 768, 768, 30522] + - [35, 12437.0] + - - [768, 30522, 1, 640, 768, 768, 768, 30522] + - [19, 12587.0] + - - [256, 512, 36, 98, 256, 256, 256, 512] + - [30, 10958.0] + - - [256, 256, 64, 56, 256, 256, 256, 256] + - [0, 10448.0] + - - [512, 486, 36, 800, 512, 512, 512, 486] + - [31, 11749.0] + - - [512, 512, 36, 1568, 512, 512, 512, 512] + - [35, 12582.0] + - - [256, 384, 36, 4096, 256, 256, 256, 384] + - [35, 12257.0] + - - [128, 256, 64, 32, 128, 128, 128, 256] + - [15, 5301.0] + - - [128, 256, 64, 9, 128, 128, 128, 256] + - [36, 1884.0] + - - [256, 512, 36, 784, 256, 256, 256, 512] + - [39, 11881.0] + - - [256, 324, 36, 32, 256, 256, 256, 324] + - [0, 6017.0] + - - [512, 512, 36, 33, 512, 512, 512, 512] + - [0, 7577.0] + - - [192, 384, 64, 128, 192, 192, 192, 384] + - [15, 8294.0] + - - [512, 512, 64, 72, 512, 512, 512, 512] + - [15, 11689.0] + - - [512, 512, 36, 128, 512, 512, 512, 512] + - [43, 11748.0] + - - [192, 384, 64, 2304, 192, 192, 192, 384] + - [4, 9076.0] + - - [384, 256, 64, 450, 384, 384, 384, 256] + - [0, 11798.0] + - - [384, 256, 64, 2304, 384, 384, 384, 256] + - [4, 12132.0] + - - [512, 512, 64, 144, 512, 512, 512, 512] + - [36, 11968.0] + - - [256, 256, 36, 6272, 256, 256, 256, 256] + - [18, 11550.0] + - - [256, 384, 64, 2304, 256, 256, 256, 384] + - [19, 12122.0] + - - [512, 512, 36, 66, 512, 512, 512, 512] + - [23, 10832.0] + - - [128, 256, 64, 800, 128, 128, 128, 256] + - [3, 11435.0] + - - [192, 256, 36, 512, 192, 192, 192, 256] + - [30, 8410.0] + - - [256, 512, 64, 200, 256, 256, 256, 512] + - [31, 11879.0] + - - [256, 512, 64, 25, 256, 256, 256, 512] + - [15, 5575.0] + - - [128, 256, 36, 1568, 128, 128, 128, 256] + - [36, 10831.0] + - - [128, 256, 64, 288, 128, 128, 128, 256] + - [15, 11239.0] + - - [256, 384, 64, 1152, 256, 256, 256, 384] + - [4, 12054.0] + - - [160, 320, 64, 288, 160, 160, 160, 320] + - [15, 7179.0] + - - [128, 256, 36, 128, 128, 128, 128, 256] + - [5, 9497.0] + - - [512, 512, 36, 16, 512, 512, 512, 512] + - [29, 4108.0] + - - [384, 256, 36, 800, 384, 384, 384, 256] + - [20, 11992.0] + - - [192, 384, 36, 4096, 192, 192, 192, 384] + - [19, 9168.0] + - - [256, 384, 64, 576, 256, 256, 256, 384] + - [0, 11932.0] + - - [512, 512, 64, 14, 512, 512, 512, 512] + - [0, 3873.0] + - - [512, 512, 36, 8, 512, 512, 512, 512] + - [8, 1943.0] + - - [512, 486, 64, 128, 512, 512, 512, 486] + - [0, 11185.0] + - - [256, 256, 36, 128, 256, 256, 256, 256] + - [15, 10732.0] + - - [256, 256, 36, 32, 256, 256, 256, 256] + - [15, 6826.0] + - - [192, 256, 64, 288, 192, 192, 192, 256] + - [5, 8710.0] + - - [256, 256, 36, 16, 256, 256, 256, 256] + - [0, 3637.0] + - - [128, 256, 36, 3200, 128, 128, 128, 256] + - [20, 10887.0] + - - [160, 320, 64, 512, 160, 160, 160, 320] + - [18, 7369.0] + - - [160, 320, 36, 512, 160, 160, 160, 320] + - [3, 7213.0] + - - [256, 512, 36, 4, 256, 256, 256, 512] + - [12, 1008.0] + - - [256, 324, 64, 1568, 256, 256, 256, 324] + - [19, 10171.0] + - - [256, 256, 36, 3200, 256, 256, 256, 256] + - [3, 11513.0] + - - [256, 256, 36, 210, 256, 256, 256, 256] + - [15, 10884.0] + - - [192, 384, 64, 576, 192, 192, 192, 384] + - [30, 8883.0] + - - [512, 512, 64, 800, 512, 512, 512, 512] + - [4, 12511.0] + - - [256, 256, 64, 1152, 256, 256, 256, 256] + - [4, 12168.0] + - - [512, 486, 64, 512, 512, 512, 512, 486] + - [16, 11684.0] + - - [256, 512, 64, 1600, 256, 256, 256, 512] + - [19, 12455.0] + - - [512, 512, 64, 9, 512, 512, 512, 512] + - [0, 2479.0] + - - [256, 512, 36, 1568, 256, 256, 256, 512] + - [4, 12013.0] + - - [128, 256, 64, 3200, 128, 128, 128, 256] + - [3, 11714.0] + - - [256, 512, 64, 4, 256, 256, 256, 512] + - [0, 1078.0] + - - [256, 256, 64, 450, 256, 256, 256, 256] + - [9, 11884.0] + - - [256, 256, 64, 72, 256, 256, 256, 256] + - [30, 10413.0] + - - [128, 256, 36, 3136, 128, 128, 128, 256] + - [43, 10862.0] + - - [160, 320, 64, 242, 160, 160, 160, 320] + - [15, 7131.0] + - - [512, 512, 36, 512, 512, 512, 512, 512] + - [4, 12391.0] + - - [512, 512, 36, 256, 512, 512, 512, 512] + - [40, 12157.0] + - - [512, 512, 36, 1024, 512, 512, 512, 512] + - [4, 12545.0] + - - [256, 256, 36, 4096, 256, 256, 256, 256] + - [34, 11527.0] + - - [256, 256, 64, 896, 256, 256, 256, 256] + - [4, 12115.0] + - - [128, 256, 64, 242, 128, 128, 128, 256] + - [30, 11023.0] + - - [192, 384, 36, 1024, 192, 192, 192, 384] + - [5, 9016.0] + - - [128, 256, 64, 100, 128, 128, 128, 256] + - [30, 9763.0] + - - [384, 256, 64, 1152, 384, 384, 384, 256] + - [35, 12059.0] + - - [192, 384, 36, 128, 192, 192, 192, 384] + - [30, 7960.0] + - - [128, 256, 64, 1568, 128, 128, 128, 256] + - [3, 11558.0] + - - [128, 256, 64, 72, 128, 128, 128, 256] + - [23, 9085.0] + - - [256, 256, 36, 12544, 256, 256, 256, 256] + - [3, 11558.0] + - - [256, 256, 36, 105, 256, 256, 256, 256] + - [30, 10120.0] + - - [128, 256, 36, 392, 128, 128, 128, 256] + - [5, 10364.0] + - - [384, 256, 36, 1024, 384, 384, 384, 256] + - [4, 12073.0] + - - [128, 256, 64, 1152, 128, 128, 128, 256] + - [3, 11487.0] + - - [256, 324, 64, 32, 256, 256, 256, 324] + - [0, 7031.0] + - - [256, 384, 36, 800, 256, 256, 256, 384] + - [19, 12047.0] + - - [512, 512, 64, 4, 512, 512, 512, 512] + - [28, 1214.0] + - - [192, 320, 36, 128, 192, 192, 192, 320] + - [30, 7685.0] + - - [192, 384, 64, 242, 192, 192, 192, 384] + - [15, 8690.0] + - - [256, 486, 64, 32, 256, 256, 256, 486] + - [0, 7145.0] + - - [512, 512, 64, 64, 512, 512, 512, 512] + - [0, 11585.0] + - - [128, 256, 36, 512, 128, 128, 128, 256] + - [5, 10578.0] + - - [512, 512, 64, 576, 512, 512, 512, 512] + - [4, 12457.0] + - - [256, 256, 64, 9, 256, 256, 256, 256] + - [4, 2582.0] + - - [128, 256, 36, 12544, 128, 128, 128, 256] + - [37, 11044.0] + - - [256, 512, 36, 3136, 256, 256, 256, 512] + - [35, 12083.0] + - - [144, 288, 36, 512, 144, 144, 144, 288] + - [0, 5894.0] + - - [384, 384, 36, 800, 384, 384, 384, 384] + - [4, 11854.0] + - - [512, 512, 64, 1600, 512, 512, 512, 512] + - [19, 12565.0] + - - [512, 512, 36, 4, 512, 512, 512, 512] + - [0, 1042.0] + - - [192, 384, 64, 450, 192, 192, 192, 384] + - [15, 8830.0] + - - [256, 256, 36, 1024, 256, 256, 256, 256] + - [34, 11367.0] + - - [256, 512, 64, 400, 256, 256, 256, 512] + - [4, 12184.0] + - - [128, 256, 36, 6272, 128, 128, 128, 256] + - [37, 10979.0] + - - [256, 256, 36, 512, 256, 256, 256, 256] + - [30, 11256.0] + - - [256, 256, 64, 112, 256, 256, 256, 256] + - [15, 11276.0] + - - [512, 512, 64, 18, 512, 512, 512, 512] + - [1, 4402.0] + - - [256, 256, 64, 18, 256, 256, 256, 256] + - [28, 4007.0] + - - [256, 256, 64, 1568, 256, 256, 256, 256] + - [35, 12223.0] + - - [384, 256, 36, 4096, 384, 384, 384, 256] + - [4, 12263.0] + - - [256, 512, 64, 800, 256, 256, 256, 512] + - [4, 12365.0] + - - [256, 384, 36, 2048, 256, 256, 256, 384] + - [35, 12181.0] + - - [384, 384, 64, 2304, 384, 384, 384, 384] + - [4, 12607.0] + - - [160, 320, 64, 128, 160, 160, 160, 320] + - [15, 6903.0] + - - [512, 512, 36, 528, 512, 512, 512, 512] + - [35, 12390.0] + - - [160, 320, 36, 128, 160, 160, 160, 320] + - [39, 6510.0] + - - [256, 512, 36, 49, 256, 256, 256, 512] + - [39, 9132.0] + - - [384, 384, 64, 450, 384, 384, 384, 384] + - [4, 12292.0] + - - [256, 256, 64, 3200, 256, 256, 256, 256] + - [4, 12337.0] + - - [512, 512, 64, 8, 512, 512, 512, 512] + - [0, 2150.0] + - - [512, 512, 64, 288, 512, 512, 512, 512] + - [4, 12289.0] + - - [384, 384, 36, 1024, 384, 384, 384, 384] + - [4, 11904.0] + - - [128, 256, 36, 16, 128, 128, 128, 256] + - [8, 2477.0] + - - [256, 256, 64, 288, 256, 256, 256, 256] + - [1, 11852.0] + - - [256, 384, 36, 1024, 256, 256, 256, 384] + - [35, 12034.0] + - - [256, 324, 36, 3200, 256, 256, 256, 324] + - [4, 10285.0] + - - [192, 384, 64, 512, 192, 192, 192, 384] + - [1, 8859.0] + - - [128, 256, 64, 1600, 128, 128, 128, 256] + - [34, 11563.0] + - - [512, 512, 36, 32, 512, 512, 512, 512] + - [0, 8229.0] + - - [512, 512, 36, 3136, 512, 512, 512, 512] + - [19, 12624.0] + - - [128, 256, 64, 6400, 128, 128, 128, 256] + - [3, 11727.0] + - - [256, 256, 36, 2048, 256, 256, 256, 256] + - [3, 11449.0] + - - [256, 256, 64, 6400, 256, 256, 256, 256] + - [4, 12372.0] + - - [256, 256, 36, 1680, 256, 256, 256, 256] + - [3, 11421.0] + - - [192, 384, 36, 2048, 192, 192, 192, 384] + - [4, 9094.0] + - - [256, 256, 64, 144, 256, 256, 256, 256] + - [20, 11281.0] + - - [384, 384, 36, 4096, 384, 384, 384, 384] + - [19, 12047.0] + - - [160, 320, 64, 1152, 160, 160, 160, 320] + - [0, 7510.0] + - - [384, 256, 36, 2048, 384, 384, 384, 256] + - [4, 12184.0] + - - [256, 512, 36, 392, 256, 256, 256, 512] + - [15, 11758.0] + - - [256, 512, 64, 50, 256, 256, 256, 512] + - [32, 9763.0] + - - [384, 384, 36, 2048, 384, 384, 384, 384] + - [4, 11999.0] + - - [256, 384, 64, 450, 256, 256, 256, 384] + - [1, 11810.0] + - - [192, 320, 64, 128, 192, 192, 192, 320] + - [8, 8163.0] + - - [128, 256, 36, 32, 128, 128, 128, 256] + - [25, 5668.0] + - - [512, 512, 64, 256, 512, 512, 512, 512] + - [4, 12230.0] + - - [256, 512, 64, 32, 256, 256, 256, 512] + - [0, 7481.0] + - - [384, 384, 64, 576, 384, 384, 384, 384] + - [35, 12404.0] + - - [512, 486, 36, 288, 512, 512, 512, 486] + - [16, 11307.0] + - - [144, 288, 64, 242, 144, 144, 144, 288] + - [0, 5787.0] + - - [384, 256, 64, 576, 384, 384, 384, 256] + - [35, 11901.0] + - - [512, 512, 36, 64, 512, 512, 512, 512] + - [0, 11050.0] + - - [448, 384, 64, 128, 448, 448, 448, 384] + - [23, 10175.0] + - - [144, 288, 64, 288, 144, 144, 144, 288] + - [0, 5832.0] + - - [512, 512, 64, 224, 512, 512, 512, 512] + - [1, 12167.0] + - - [384, 384, 64, 1152, 384, 384, 384, 384] + - [19, 12547.0] + - - [448, 384, 36, 128, 448, 448, 448, 384] + - [15, 9872.0] + - - [256, 486, 36, 128, 256, 256, 256, 486] + - [8, 10424.0] + - - [256, 256, 36, 800, 256, 256, 256, 256] + - [30, 11381.0] + - - [192, 384, 36, 800, 192, 192, 192, 384] + - [20, 8976.0] + - - [256, 256, 36, 256, 256, 256, 256, 256] + - [15, 11139.0] + - - [192, 384, 64, 1152, 192, 192, 192, 384] + - [4, 8998.0] + - - [128, 256, 64, 200, 128, 128, 128, 256] + - [30, 10940.0] + - - [512, 512, 64, 28, 512, 512, 512, 512] + - [1, 6496.0] + - - [144, 288, 64, 1152, 144, 144, 144, 288] + - [3, 6081.0] + - - [256, 256, 64, 576, 256, 256, 256, 256] + - [36, 11992.0] + - - [256, 256, 64, 2304, 256, 256, 256, 256] + - [4, 12298.0] + - - [192, 384, 36, 512, 192, 192, 192, 384] + - [36, 8830.0] + - - [256, 512, 36, 32, 256, 256, 256, 512] + - [15, 6814.0] + - - [512, 512, 64, 128, 512, 512, 512, 512] + - [36, 11901.0] + - - [512, 512, 64, 32, 512, 512, 512, 512] + - [0, 8676.0] + - - [128, 256, 36, 196, 128, 128, 128, 256] + - [2, 9578.0] + - - [196, 528, 32, 32, 196, 196, 196, 528] + - [0, 3925.0] + - - [196, 512, 32, 24, 196, 196, 196, 512] + - [39, 3296.0] + - - [1225, 192, 32, 32, 1225, 1225, 1225, 192] + - [23, 7943.0] + - - [1001, 1536, 1, 32, 1001, 1001, 1001, 1536] + - [23, 5336.0] + - - [196, 480, 32, 64, 196, 196, 196, 480] + - [8, 5914.0] + - - [289, 1024, 32, 384, 289, 289, 289, 1024] + - [1, 9171.0] + - - [784, 192, 32, 96, 784, 784, 784, 192] + - [39, 9814.0] + - - [50176, 256, 1, 128, 50176, 50176, 50176, 256] + - [43, 11858.0] + - - [289, 1024, 32, 256, 289, 289, 289, 1024] + - [20, 9024.0] + - - [289, 1024, 32, 192, 289, 289, 289, 1024] + - [16, 8912.0] + - - [12544, 512, 1, 256, 12544, 12544, 12544, 512] + - [31, 11932.0] + - - [1225, 1728, 1, 192, 1225, 1225, 1225, 1728] + - [15, 10294.0] + - - [196, 480, 32, 96, 196, 196, 196, 480] + - [8, 7066.0] + - - [196, 512, 32, 144, 196, 196, 196, 512] + - [23, 8170.0] + - - [289, 768, 32, 128, 289, 289, 289, 768] + - [30, 8627.0] + - - [5329, 576, 1, 96, 5329, 5329, 5329, 576] + - [30, 10607.0] + - - [196, 528, 32, 128, 196, 196, 196, 528] + - [23, 7494.0] + - - [5329, 448, 1, 64, 5329, 5329, 5329, 448] + - [0, 9385.0] + - - [784, 256, 32, 64, 784, 784, 784, 256] + - [39, 9708.0] + - - [784, 192, 32, 32, 784, 784, 784, 192] + - [15, 7762.0] + - - [21609, 288, 1, 32, 21609, 21609, 21609, 288] + - [39, 7425.0] + - - [784, 256, 32, 32, 784, 784, 784, 256] + - [15, 8143.0] + - - [5041, 720, 1, 192, 5041, 5041, 5041, 720] + - [0, 10565.0] + - - [196, 512, 32, 128, 196, 196, 196, 512] + - [23, 8022.0] + - - [289, 768, 32, 160, 289, 289, 289, 768] + - [15, 8807.0] + - - [1001, 4096, 1, 512, 1001, 1001, 1001, 4096] + - [1, 11691.0] + - - [1225, 192, 32, 64, 1225, 1225, 1225, 192] + - [15, 10337.0] + - - [784, 192, 32, 16, 784, 784, 784, 192] + - [23, 4940.0] + - - [3136, 1024, 1, 2048, 3136, 3136, 3136, 1024] + - [1, 12144.0] + - - [784, 256, 32, 128, 784, 784, 784, 256] + - [36, 10097.0] + - - [196, 512, 32, 32, 196, 196, 196, 512] + - [0, 4127.0] + - - [1225, 384, 32, 96, 1225, 1225, 1225, 384] + - [8, 11036.0] + - - [5041, 576, 1, 96, 5041, 5041, 5041, 576] + - [30, 10705.0] + - - [5329, 160, 32, 64, 5329, 5329, 5329, 160] + - [0, 8122.0] + - - [1225, 288, 32, 48, 1225, 1225, 1225, 288] + - [30, 8193.0] + - - [4096, 9216, 1, 512, 4096, 4096, 4096, 9216] + - [4, 12637.0] + - - [196, 480, 32, 192, 196, 196, 196, 480] + - [15, 7971.0] + - - [3136, 1024, 1, 512, 3136, 3136, 3136, 1024] + - [5, 11689.0] + - - [784, 192, 32, 64, 784, 784, 784, 192] + - [0, 9474.0] + - - [289, 1024, 32, 128, 289, 289, 289, 1024] + - [0, 8738.0] + - - [289, 768, 32, 192, 289, 289, 289, 768] + - [36, 8930.0] + - - [196, 512, 32, 112, 196, 196, 196, 512] + - [30, 7975.0] + - - [1001, 2048, 1, 32, 1001, 1001, 1001, 2048] + - [23, 5953.0] + - - [1225, 288, 32, 64, 1225, 1225, 1225, 288] + - [8, 9132.0] + - - [1225, 384, 32, 192, 1225, 1225, 1225, 384] + - [1, 11452.0] + - - [50176, 256, 1, 512, 50176, 50176, 50176, 256] + - [4, 12309.0] + - - [196, 512, 32, 160, 196, 196, 196, 512] + - [39, 8303.0] + - - [4096, 4096, 1, 512, 4096, 4096, 4096, 4096] + - [19, 12469.0] + - - [1225, 256, 32, 64, 1225, 1225, 1225, 256] + - [30, 10518.0] + - - [196, 480, 32, 16, 196, 196, 196, 480] + - [0, 2234.0] + - - [1225, 256, 32, 48, 1225, 1225, 1225, 256] + - [15, 9887.0] + - - [1225, 1200, 1, 64, 1225, 1225, 1225, 1200] + - [30, 7408.0] + - - [1225, 384, 32, 64, 1225, 1225, 1225, 384] + - [8, 10704.0] + - - [12544, 512, 1, 1024, 12544, 12544, 12544, 512] + - [16, 12294.0] + - - [196, 512, 32, 64, 196, 196, 196, 512] + - [39, 6266.0] + - - [196, 528, 32, 256, 196, 196, 196, 528] + - [30, 7923.0] + - - [196, 528, 32, 160, 196, 196, 196, 528] + - [0, 7785.0] + - - [1225, 192, 32, 48, 1225, 1225, 1225, 192] + - [15, 9578.0] + - - [1001, 2048, 1, 64, 1001, 1001, 1001, 2048] + - [15, 8655.0] + - - [289, 768, 128, 128, 289, 289, 289, 768] + - [0, 8948.0] + - - [1225, 192, 128, 64, 1225, 1225, 1225, 192] + - [0, 7620.0] + - - [1225, 288, 128, 48, 1225, 1225, 1225, 288] + - [0, 5358.0] + - - [289, 768, 128, 192, 289, 289, 289, 768] + - [1, 9120.0] + - - [289, 768, 128, 160, 289, 289, 289, 768] + - [30, 9018.0] + - - [1225, 256, 128, 48, 1225, 1225, 1225, 256] + - [0, 7124.0] + - - [1225, 192, 128, 48, 1225, 1225, 1225, 192] + - [12, 5813.0] + - - [1225, 288, 128, 64, 1225, 1225, 1225, 288] + - [3, 7009.0] + - - [1225, 256, 128, 64, 1225, 1225, 1225, 256] + - [3, 6290.0] + - - [1001, 2048, 1, 128, 1001, 1001, 1001, 2048] + - [39, 9872.0] + - - [1225, 192, 128, 32, 1225, 1225, 1225, 192] + - [0, 5560.0] + - - [1001, 1536, 1, 64, 1001, 1001, 1001, 1536] + - [15, 7388.0] + - - [1024, 4096, 1, 64, 1024, 1024, 1024, 4096] + - [8, 10627.0] + - - [1024, 4096, 1, 6336, 1024, 1024, 1024, 4096] + - [4, 12382.0] + - - [512, 33708, 1, 3780, 512, 512, 512, 33708] + - [4, 12688.0] + - - [512, 33708, 1, 3968, 512, 512, 512, 33708] + - [4, 12680.0] + - - [512, 33708, 1, 4030, 512, 512, 512, 33708] + - [4, 12685.0] + - - [196, 256, 64, 1024, 196, 196, 196, 256] + - [36, 9218.0] + - - [196, 1024, 64, 256, 196, 196, 196, 1024] + - [30, 9127.0] + - - [289, 768, 64, 128, 289, 289, 289, 768] + - [0, 8846.0] + - - [289, 768, 64, 160, 289, 289, 289, 768] + - [0, 8940.0] + - - [289, 768, 64, 192, 289, 289, 289, 768] + - [5, 9017.0] + - - [784, 128, 64, 512, 784, 784, 784, 128] + - [20, 10468.0] + - - [784, 512, 64, 128, 784, 784, 784, 512] + - [5, 10515.0] + - - [1225, 192, 64, 32, 1225, 1225, 1225, 192] + - [30, 8156.0] + - - [1225, 192, 64, 48, 1225, 1225, 1225, 192] + - [30, 10088.0] + - - [1225, 192, 64, 64, 1225, 1225, 1225, 192] + - [30, 10723.0] + - - [1225, 256, 64, 48, 1225, 1225, 1225, 256] + - [0, 10096.0] + - - [1225, 256, 64, 64, 1225, 1225, 1225, 256] + - [0, 10546.0] + - - [1225, 288, 64, 48, 1225, 1225, 1225, 288] + - [0, 8170.0] + - - [1225, 288, 64, 64, 1225, 1225, 1225, 288] + - [0, 8943.0] + - - [3136, 256, 64, 64, 3136, 3136, 3136, 256] + - [0, 9240.0] + - - [256, 44505, 1, 8976, 256, 256, 256, 44505] + - [4, 12654.0] + - - [512, 33708, 1, 3796, 512, 512, 512, 33708] + - [4, 12691.0] + - - [512, 33708, 1, 3822, 512, 512, 512, 33708] + - [19, 12691.0] + - - [512, 33708, 1, 3840, 512, 512, 512, 33708] + - [35, 12687.0] + - - [512, 33708, 1, 3859, 512, 512, 512, 33708] + - [35, 12690.0] + - - [512, 33708, 1, 3870, 512, 512, 512, 33708] + - [19, 12689.0] + - - [512, 33708, 1, 3876, 512, 512, 512, 33708] + - [4, 12689.0] + - - [512, 33708, 1, 3906, 512, 512, 512, 33708] + - [4, 12691.0] + - - [512, 33708, 1, 3910, 512, 512, 512, 33708] + - [4, 12691.0] + - - [512, 33708, 1, 3925, 512, 512, 512, 33708] + - [35, 12687.0] + - - [512, 33708, 1, 3942, 512, 512, 512, 33708] + - [35, 12693.0] + - - [512, 33708, 1, 3944, 512, 512, 512, 33708] + - [4, 12693.0] + - - [512, 33708, 1, 3955, 512, 512, 512, 33708] + - [4, 12693.0] + - - [512, 33708, 1, 3969, 512, 512, 512, 33708] + - [19, 12696.0] + - - [512, 33708, 1, 3976, 512, 512, 512, 33708] + - [19, 12693.0] + - - [512, 33708, 1, 3977, 512, 512, 512, 33708] + - [35, 12691.0] + - - [512, 33708, 1, 3978, 512, 512, 512, 33708] + - [4, 12691.0] + - - [512, 33708, 1, 3990, 512, 512, 512, 33708] + - [4, 12694.0] + - - [512, 33708, 1, 3995, 512, 512, 512, 33708] + - [4, 12693.0] + - - [512, 33708, 1, 3996, 512, 512, 512, 33708] + - [4, 12691.0] + - - [512, 33708, 1, 3999, 512, 512, 512, 33708] + - [4, 12692.0] + - - [512, 33708, 1, 4005, 512, 512, 512, 33708] + - [19, 12693.0] + - - [512, 33708, 1, 4012, 512, 512, 512, 33708] + - [4, 12692.0] + - - [512, 33708, 1, 4020, 512, 512, 512, 33708] + - [4, 12690.0] + - - [512, 33708, 1, 4026, 512, 512, 512, 33708] + - [35, 12692.0] + - - [512, 33708, 1, 4032, 512, 512, 512, 33708] + - [35, 12691.0] + - - [1024, 3072, 1, 2048, 1024, 1024, 1024, 3072] + - [16, 11925.0] + - - [1024, 3072, 1, 3072, 1024, 1024, 1024, 3072] + - [1, 11986.0] + - - [1024, 30522, 1, 20, 1024, 1024, 1024, 30522] + - [0, 4965.0] + - - [1024, 30522, 1, 80, 1024, 1024, 1024, 30522] + - [23, 10833.0] + - - [1024, 30522, 1, 120, 1024, 1024, 1024, 30522] + - [8, 11827.0] + - - [1024, 4096, 1, 3840, 1024, 1024, 1024, 4096] + - [4, 12361.0] + - - [1024, 4096, 1, 3968, 1024, 1024, 1024, 4096] + - [4, 12359.0] + - - [1024, 4096, 1, 7200, 1024, 1024, 1024, 4096] + - [4, 12387.0] + - - [1024, 4096, 1, 8160, 1024, 1024, 1024, 4096] + - [4, 12393.0] + - - [1024, 4096, 1, 9520, 1024, 1024, 1024, 4096] + - [19, 12394.0] + - - [1024, 4096, 1, 10200, 1024, 1024, 1024, 4096] + - [35, 12393.0] + - - [1024, 42720, 1, 3968, 1024, 1024, 1024, 42720] + - [19, 12781.0] + - - [1024, 42720, 1, 7200, 1024, 1024, 1024, 42720] + - [4, 12784.0] + - - [1024, 42720, 1, 9520, 1024, 1024, 1024, 42720] + - [19, 12783.0] + - - [4096, 1024, 1, 3840, 4096, 4096, 4096, 1024] + - [4, 12354.0] + - - [4096, 1024, 1, 3968, 4096, 4096, 4096, 1024] + - [4, 12359.0] + - - [4096, 1024, 1, 7200, 4096, 4096, 4096, 1024] + - [35, 12382.0] + - - [4096, 1024, 1, 8160, 4096, 4096, 4096, 1024] + - [4, 12385.0] + - - [4096, 1024, 1, 9520, 4096, 4096, 4096, 1024] + - [35, 12391.0] + - - [4096, 1024, 1, 10200, 4096, 4096, 4096, 1024] + - [19, 12391.0] + - - [5760, 5760, 1, 5760, 5760, 5760, 5760, 5760] + - [19, 12735.0] + - - [7744, 7744, 1, 7744, 7744, 7744, 7744, 7744] + - [19, 12573.0] + - - [1152, 1152, 1, 384, 1152, 1152, 1152, 1152] + - [30, 10143.0] + - - [1536, 1536, 1, 384, 1536, 1536, 1536, 1536] + - [0, 11246.0] + - - [1920, 1920, 1, 384, 1920, 1920, 1920, 1920] + - [30, 11601.0] + - - [2304, 2304, 1, 384, 2304, 2304, 2304, 2304] + - [15, 11748.0] + - - [2688, 2688, 1, 384, 2688, 2688, 2688, 2688] + - [31, 11929.0] + - - [3072, 3072, 1, 384, 3072, 3072, 3072, 3072] + - [19, 12374.0] + - - [3456, 3456, 1, 384, 3456, 3456, 3456, 3456] + - [4, 12370.0] + - - [3840, 3840, 1, 384, 3840, 3840, 3840, 3840] + - [4, 12563.0] + - - [4224, 4224, 1, 384, 4224, 4224, 4224, 4224] + - [35, 12478.0] + - - [4608, 4608, 1, 384, 4608, 4608, 4608, 4608] + - [4, 12570.0] + - - [4992, 4992, 1, 384, 4992, 4992, 4992, 4992] + - [19, 12502.0] + - - [5376, 5376, 1, 384, 5376, 5376, 5376, 5376] + - [35, 12555.0] + - - [5760, 5760, 1, 384, 5760, 5760, 5760, 5760] + - [19, 12597.0] + - - [6144, 6144, 1, 384, 6144, 6144, 6144, 6144] + - [19, 12604.0] + - - [6528, 6528, 1, 384, 6528, 6528, 6528, 6528] + - [19, 12617.0] + - - [6912, 6912, 1, 384, 6912, 6912, 6912, 6912] + - [4, 12678.0] + - - [7296, 7296, 1, 384, 7296, 7296, 7296, 7296] + - [19, 12674.0] + - - [7680, 7680, 1, 384, 7680, 7680, 7680, 7680] + - [35, 12706.0] + - - [1536, 768, 1, 384, 1536, 1536, 1536, 768] + - [20, 10239.0] + - - [1920, 960, 1, 384, 1920, 1920, 1920, 960] + - [15, 10896.0] + - - [2304, 1152, 1, 384, 2304, 2304, 2304, 1152] + - [15, 11225.0] + - - [2688, 1344, 1, 384, 2688, 2688, 2688, 1344] + - [15, 11467.0] + - - [3072, 1536, 1, 384, 3072, 3072, 3072, 1536] + - [30, 11793.0] + - - [3456, 1728, 1, 384, 3456, 3456, 3456, 1728] + - [15, 11810.0] + - - [3840, 1920, 1, 384, 3840, 3840, 3840, 1920] + - [4, 12138.0] + - - [4224, 2112, 1, 384, 4224, 4224, 4224, 2112] + - [30, 11935.0] + - - [4608, 2304, 1, 384, 4608, 4608, 4608, 2304] + - [4, 12287.0] + - - [4992, 2496, 1, 384, 4992, 4992, 4992, 2496] + - [4, 12124.0] + - - [5376, 2688, 1, 384, 5376, 5376, 5376, 2688] + - [4, 12350.0] + - - [5760, 2880, 1, 384, 5760, 5760, 5760, 2880] + - [4, 12234.0] + - - [6144, 3072, 1, 384, 6144, 6144, 6144, 3072] + - [27, 12495.0] + - - [6528, 3264, 1, 384, 6528, 6528, 6528, 3264] + - [4, 12242.0] + - - [6912, 3456, 1, 384, 6912, 6912, 6912, 3456] + - [35, 12601.0] + - - [7296, 3648, 1, 384, 7296, 7296, 7296, 3648] + - [19, 12362.0] + - - [7680, 3840, 1, 384, 7680, 7680, 7680, 3840] + - [4, 12643.0] + - - [768, 1536, 1, 384, 768, 768, 768, 1536] + - [20, 10290.0] + - - [1152, 2304, 1, 384, 1152, 1152, 1152, 2304] + - [15, 11227.0] + - - [1536, 3072, 1, 384, 1536, 1536, 1536, 3072] + - [15, 11815.0] + - - [1920, 3840, 1, 384, 1920, 1920, 1920, 3840] + - [4, 12155.0] + - - [2304, 4608, 1, 384, 2304, 2304, 2304, 4608] + - [19, 12298.0] + - - [2688, 5376, 1, 384, 2688, 2688, 2688, 5376] + - [4, 12364.0] + - - [3072, 6144, 1, 384, 3072, 3072, 3072, 6144] + - [12, 12495.0] + - - [3456, 6912, 1, 384, 3456, 3456, 3456, 6912] + - [19, 12610.0] + - - [3840, 7680, 1, 384, 3840, 3840, 3840, 7680] + - [4, 12644.0] + - - [4224, 8448, 1, 384, 4224, 4224, 4224, 8448] + - [4, 12664.0] + - - [4608, 9216, 1, 384, 4608, 4608, 4608, 9216] + - [35, 12653.0] + - - [4992, 9984, 1, 384, 4992, 4992, 4992, 9984] + - [35, 12651.0] + - - [5376, 10752, 1, 384, 5376, 5376, 5376, 10752] + - [19, 12666.0] + - - [5760, 11520, 1, 384, 5760, 5760, 5760, 11520] + - [35, 12695.0] + - - [6144, 12288, 1, 384, 6144, 6144, 6144, 12288] + - [4, 12694.0] + - - [6528, 13056, 1, 384, 6528, 6528, 6528, 13056] + - [4, 12700.0] + - - [6912, 13824, 1, 384, 6912, 6912, 6912, 13824] + - [4, 12715.0] + - - [7296, 14592, 1, 384, 7296, 7296, 7296, 14592] + - [4, 12736.0] + - - [7680, 15360, 1, 384, 7680, 7680, 7680, 15360] + - [19, 12734.0] + - - [2048, 2048, 1, 1024, 2048, 2048, 2048, 2048] + - [1, 12196.0] + - - [256, 10240, 1, 8976, 256, 256, 256, 10240] + - [37, 12454.0] + - - [256, 10496, 1, 8976, 256, 256, 256, 10496] + - [3, 11645.0] + - - [256, 11008, 1, 8976, 256, 256, 256, 11008] + - [19, 11920.0] + - - [256, 11264, 1, 8976, 256, 256, 256, 11264] + - [35, 12187.0] + - - [256, 11520, 1, 8976, 256, 256, 256, 11520] + - [4, 12429.0] + - - [256, 11776, 1, 8976, 256, 256, 256, 11776] + - [18, 11713.0] + - - [256, 12544, 1, 8976, 256, 256, 256, 12544] + - [37, 12271.0] + - - [256, 12800, 1, 8976, 256, 256, 256, 12800] + - [37, 12514.0] + - - [256, 13312, 1, 8976, 256, 256, 256, 13312] + - [3, 11959.0] + - - [256, 13568, 1, 8976, 256, 256, 256, 13568] + - [4, 12089.0] + - - [256, 14336, 1, 8976, 256, 256, 256, 14336] + - [3, 11814.0] + - - [256, 14848, 1, 8976, 256, 256, 256, 14848] + - [21, 12160.0] + - - [256, 15104, 1, 8976, 256, 256, 256, 15104] + - [37, 12357.0] + - - [256, 15872, 1, 8976, 256, 256, 256, 15872] + - [35, 12021.0] + - - [256, 16128, 1, 8976, 256, 256, 256, 16128] + - [4, 12206.0] + - - [256, 17152, 1, 8976, 256, 256, 256, 17152] + - [3, 12085.0] + - - [256, 17408, 1, 8976, 256, 256, 256, 17408] + - [4, 12251.0] + - - [256, 18688, 1, 8976, 256, 256, 256, 18688] + - [4, 12294.0] + - - [256, 19968, 1, 8976, 256, 256, 256, 19968] + - [4, 12322.0] + - - [256, 20480, 1, 8976, 256, 256, 256, 20480] + - [4, 12623.0] + - - [256, 20992, 1, 8976, 256, 256, 256, 20992] + - [4, 12217.0] + - - [256, 21248, 1, 8976, 256, 256, 256, 21248] + - [4, 12358.0] + - - [256, 22016, 1, 8976, 256, 256, 256, 22016] + - [4, 12119.0] + - - [256, 26112, 1, 8976, 256, 256, 256, 26112] + - [4, 12339.0] + - - [256, 32512, 1, 8976, 256, 256, 256, 32512] + - [4, 12439.0] + - - [256, 33536, 1, 8976, 256, 256, 256, 33536] + - [4, 12364.0] + - - [256, 4864, 1, 8976, 256, 256, 256, 4864] + - [21, 11612.0] + - - [256, 5120, 1, 8976, 256, 256, 256, 5120] + - [21, 12192.0] + - - [256, 5632, 1, 8976, 256, 256, 256, 5632] + - [18, 11523.0] + - - [256, 5888, 1, 8976, 256, 256, 256, 5888] + - [5, 11290.0] + - - [256, 6144, 1, 8976, 256, 256, 256, 6144] + - [20, 11753.0] + - - [256, 7168, 1, 8976, 256, 256, 256, 7168] + - [21, 11568.0] + - - [256, 8192, 1, 8976, 256, 256, 256, 8192] + - [18, 11761.0] + - - [256, 8960, 1, 8976, 256, 256, 256, 8960] + - [4, 12330.0] + - - [256, 9728, 1, 8976, 256, 256, 256, 9728] + - [21, 11865.0] + - - [256, 9984, 1, 8976, 256, 256, 256, 9984] + - [6, 12164.0] + - - [3200, 2048, 1, 1024, 3200, 3200, 3200, 2048] + - [1, 12536.0] + - - [4096, 4096, 1, 1024, 4096, 4096, 4096, 4096] + - [19, 12542.0] + - - [512, 3280, 1, 1600, 512, 512, 512, 3280] + - [30, 10963.0] + - - [512, 3280, 1, 200, 512, 512, 512, 3280] + - [39, 9890.0] + - - [768, 2048, 1, 256, 768, 768, 768, 2048] + - [20, 10960.0] + - - [1600, 1024, 1, 960, 1600, 1600, 1600, 1024] + - [0, 10668.0] + - - [2048, 2048, 1, 960, 2048, 2048, 2048, 2048] + - [4, 12219.0] + - - [1024, 3072, 1, 1024, 1024, 1024, 1024, 3072] + - [4, 11850.0] + - - [1024, 3072, 1, 512, 1024, 1024, 1024, 3072] + - [5, 11680.0] + - - [1024, 4096, 1, 2048, 1024, 1024, 1024, 4096] + - [4, 12338.0] + - - [1024, 30528, 1, 2048, 1024, 1024, 1024, 30528] + - [35, 12713.0] + - - [1024, 4096, 1, 4096, 1024, 1024, 1024, 4096] + - [4, 12367.0] + - - [1024, 30528, 1, 4096, 1024, 1024, 1024, 30528] + - [35, 12737.0] + - - [9216, 128, 1, 128, 9216, 9216, 9216, 128] + - [8, 8966.0] + - - [9600, 128, 1, 128, 9600, 9600, 9600, 128] + - [0, 9770.0] + - - [9984, 128, 1, 128, 9984, 9984, 9984, 128] + - [0, 9974.0] + - - [10368, 128, 1, 128, 10368, 10368, 10368, 128] + - [0, 9458.0] + - - [10752, 128, 1, 128, 10752, 10752, 10752, 128] + - [0, 9743.0] + - - [11136, 128, 1, 128, 11136, 11136, 11136, 128] + - [3, 9664.0] + - - [11520, 128, 1, 128, 11520, 11520, 11520, 128] + - [15, 9841.0] + - - [11904, 128, 1, 128, 11904, 11904, 11904, 128] + - [36, 10053.0] + - - [12288, 128, 1, 128, 12288, 12288, 12288, 128] + - [13, 10117.0] + - - [12672, 128, 1, 128, 12672, 12672, 12672, 128] + - [5, 10507.0] + - - [13056, 128, 1, 128, 13056, 13056, 13056, 128] + - [0, 10090.0] + - - [13440, 128, 1, 128, 13440, 13440, 13440, 128] + - [15, 10367.0] + - - [13824, 128, 1, 128, 13824, 13824, 13824, 128] + - [0, 10457.0] + - - [14208, 128, 1, 128, 14208, 14208, 14208, 128] + - [36, 10237.0] + - - [14592, 128, 1, 128, 14592, 14592, 14592, 128] + - [0, 10332.0] + - - [14976, 128, 1, 128, 14976, 14976, 14976, 128] + - [36, 10724.0] + - - [15360, 128, 1, 128, 15360, 15360, 15360, 128] + - [8, 10691.0] + - - [15744, 128, 1, 128, 15744, 15744, 15744, 128] + - [15, 10477.0] + - - [16128, 128, 1, 128, 16128, 16128, 16128, 128] + - [0, 10612.0] + - - [16512, 128, 1, 128, 16512, 16512, 16512, 128] + - [39, 10735.0] + - - [16896, 128, 1, 128, 16896, 16896, 16896, 128] + - [15, 10502.0] + - - [17280, 128, 1, 128, 17280, 17280, 17280, 128] + - [36, 10765.0] + - - [17664, 128, 1, 128, 17664, 17664, 17664, 128] + - [15, 10783.0] + - - [18048, 128, 1, 128, 18048, 18048, 18048, 128] + - [30, 10591.0] + - - [18432, 128, 1, 128, 18432, 18432, 18432, 128] + - [15, 10679.0] + - - [18816, 128, 1, 128, 18816, 18816, 18816, 128] + - [30, 10878.0] + - - [19200, 128, 1, 128, 19200, 19200, 19200, 128] + - [0, 10976.0] + - - [19584, 128, 1, 128, 19584, 19584, 19584, 128] + - [28, 10825.0] + - - [19968, 128, 1, 128, 19968, 19968, 19968, 128] + - [0, 10811.0] + - - [20352, 128, 1, 128, 20352, 20352, 20352, 128] + - [36, 11182.0] + - - [20736, 128, 1, 128, 20736, 20736, 20736, 128] + - [15, 10751.0] + - - [21120, 128, 1, 128, 21120, 21120, 21120, 128] + - [15, 10937.0] + - - [21504, 128, 1, 128, 21504, 21504, 21504, 128] + - [0, 10976.0] + - - [21888, 128, 1, 128, 21888, 21888, 21888, 128] + - [36, 10880.0] + - - [22272, 128, 1, 128, 22272, 22272, 22272, 128] + - [15, 10873.0] + - - [22656, 128, 1, 128, 22656, 22656, 22656, 128] + - [5, 11154.0] + - - [23040, 128, 1, 128, 23040, 23040, 23040, 128] + - [8, 11129.0] + - - [9216, 128, 1, 256, 9216, 9216, 9216, 128] + - [5, 10127.0] + - - [9600, 128, 1, 256, 9600, 9600, 9600, 128] + - [5, 10606.0] + - - [9984, 128, 1, 256, 9984, 9984, 9984, 128] + - [28, 10804.0] + - - [10368, 128, 1, 256, 10368, 10368, 10368, 128] + - [30, 10057.0] + - - [10752, 128, 1, 256, 10752, 10752, 10752, 128] + - [15, 10381.0] + - - [11136, 128, 1, 256, 11136, 11136, 11136, 128] + - [0, 10670.0] + - - [11520, 128, 1, 256, 11520, 11520, 11520, 128] + - [30, 10885.0] + - - [11904, 128, 1, 256, 11904, 11904, 11904, 128] + - [36, 10681.0] + - - [12288, 128, 1, 256, 12288, 12288, 12288, 128] + - [20, 10883.0] + - - [12672, 128, 1, 256, 12672, 12672, 12672, 128] + - [5, 11278.0] + - - [13056, 128, 1, 256, 13056, 13056, 13056, 128] + - [15, 10394.0] + - - [13440, 128, 1, 256, 13440, 13440, 13440, 128] + - [30, 10815.0] + - - [13824, 128, 1, 256, 13824, 13824, 13824, 128] + - [0, 11038.0] + - - [14208, 128, 1, 256, 14208, 14208, 14208, 128] + - [36, 10792.0] + - - [14592, 128, 1, 256, 14592, 14592, 14592, 128] + - [5, 10992.0] + - - [14976, 128, 1, 256, 14976, 14976, 14976, 128] + - [36, 11318.0] + - - [15360, 128, 1, 256, 15360, 15360, 15360, 128] + - [36, 11465.0] + - - [15744, 128, 1, 256, 15744, 15744, 15744, 128] + - [30, 10852.0] + - - [16128, 128, 1, 256, 16128, 16128, 16128, 128] + - [15, 11093.0] + - - [16512, 128, 1, 256, 16512, 16512, 16512, 128] + - [15, 11291.0] + - - [16896, 128, 1, 256, 16896, 16896, 16896, 128] + - [36, 11038.0] + - - [17280, 128, 1, 256, 17280, 17280, 17280, 128] + - [36, 11293.0] + - - [17664, 128, 1, 256, 17664, 17664, 17664, 128] + - [13, 11421.0] + - - [18048, 128, 1, 256, 18048, 18048, 18048, 128] + - [30, 10919.0] + - - [18432, 128, 1, 256, 18432, 18432, 18432, 128] + - [30, 11086.0] + - - [18816, 128, 1, 256, 18816, 18816, 18816, 128] + - [30, 11280.0] + - - [19200, 128, 1, 256, 19200, 19200, 19200, 128] + - [8, 11422.0] + - - [19584, 128, 1, 256, 19584, 19584, 19584, 128] + - [36, 11290.0] + - - [19968, 128, 1, 256, 19968, 19968, 19968, 128] + - [36, 11439.0] + - - [20352, 128, 1, 256, 20352, 20352, 20352, 128] + - [13, 11692.0] + - - [20736, 128, 1, 256, 20736, 20736, 20736, 128] + - [0, 11041.0] + - - [21120, 128, 1, 256, 21120, 21120, 21120, 128] + - [30, 11238.0] + - - [21504, 128, 1, 256, 21504, 21504, 21504, 128] + - [15, 11395.0] + - - [21888, 128, 1, 256, 21888, 21888, 21888, 128] + - [5, 11295.0] + - - [22272, 128, 1, 256, 22272, 22272, 22272, 128] + - [20, 11410.0] + - - [22656, 128, 1, 256, 22656, 22656, 22656, 128] + - [5, 11651.0] + - - [23040, 128, 1, 256, 23040, 23040, 23040, 128] + - [36, 11730.0] + - - [8064, 8064, 1, 384, 8064, 8064, 8064, 8064] + - [19, 12694.0] + - - [8448, 8448, 1, 384, 8448, 8448, 8448, 8448] + - [4, 12712.0] + - - [8832, 8832, 1, 384, 8832, 8832, 8832, 8832] + - [19, 12688.0] + - - [9216, 9216, 1, 384, 9216, 9216, 9216, 9216] + - [19, 12699.0] + - - [9600, 9600, 1, 384, 9600, 9600, 9600, 9600] + - [4, 12708.0] + - - [9984, 9984, 1, 384, 9984, 9984, 9984, 9984] + - [19, 12711.0] + - - [10368, 10368, 1, 384, 10368, 10368, 10368, 10368] + - [19, 12715.0] + - - [10752, 10752, 1, 384, 10752, 10752, 10752, 10752] + - [19, 12734.0] + - - [11136, 11136, 1, 384, 11136, 11136, 11136, 11136] + - [19, 12730.0] + - - [11520, 11520, 1, 384, 11520, 11520, 11520, 11520] + - [19, 12742.0] + - - [11904, 11904, 1, 384, 11904, 11904, 11904, 11904] + - [4, 12735.0] + - - [12288, 12288, 1, 384, 12288, 12288, 12288, 12288] + - [4, 12741.0] + - - [12672, 12672, 1, 384, 12672, 12672, 12672, 12672] + - [19, 12735.0] + - - [13056, 13056, 1, 384, 13056, 13056, 13056, 13056] + - [35, 12734.0] + - - [13440, 13440, 1, 384, 13440, 13440, 13440, 13440] + - [35, 12740.0] + - - [13824, 13824, 1, 384, 13824, 13824, 13824, 13824] + - [35, 12741.0] + - - [14208, 14208, 1, 384, 14208, 14208, 14208, 14208] + - [35, 12743.0] + - - [14592, 14592, 1, 384, 14592, 14592, 14592, 14592] + - [4, 12749.0] + - - [14976, 14976, 1, 384, 14976, 14976, 14976, 14976] + - [4, 12755.0] + - - [15360, 15360, 1, 384, 15360, 15360, 15360, 15360] + - [19, 12753.0] + - - [15744, 15744, 1, 384, 15744, 15744, 15744, 15744] + - [35, 12753.0] + - - [16128, 16128, 1, 384, 16128, 16128, 16128, 16128] + - [35, 12750.0] + - - [16512, 16512, 1, 384, 16512, 16512, 16512, 16512] + - [19, 12750.0] + - - [16896, 16896, 1, 384, 16896, 16896, 16896, 16896] + - [4, 12748.0] + - - [17280, 17280, 1, 384, 17280, 17280, 17280, 17280] + - [19, 12753.0] + - - [17664, 17664, 1, 384, 17664, 17664, 17664, 17664] + - [19, 12749.0] + - - [18048, 18048, 1, 384, 18048, 18048, 18048, 18048] + - [12, 12751.0] + - - [18432, 18432, 1, 384, 18432, 18432, 18432, 18432] + - [19, 12757.0] + - - [18816, 18816, 1, 384, 18816, 18816, 18816, 18816] + - [35, 12755.0] + - - [19200, 19200, 1, 384, 19200, 19200, 19200, 19200] + - [19, 12754.0] + - - [19584, 19584, 1, 384, 19584, 19584, 19584, 19584] + - [19, 12757.0] + - - [19968, 19968, 1, 384, 19968, 19968, 19968, 19968] + - [19, 12755.0] + - - [20352, 20352, 1, 384, 20352, 20352, 20352, 20352] + - [19, 12760.0] + - - [20736, 20736, 1, 384, 20736, 20736, 20736, 20736] + - [19, 12753.0] + - - [21120, 21120, 1, 384, 21120, 21120, 21120, 21120] + - [35, 12758.0] + - - [21504, 21504, 1, 384, 21504, 21504, 21504, 21504] + - [19, 12756.0] + - - [21888, 21888, 1, 384, 21888, 21888, 21888, 21888] + - [19, 12759.0] + - - [22272, 22272, 1, 384, 22272, 22272, 22272, 22272] + - [35, 12757.0] + - - [22656, 22656, 1, 384, 22656, 22656, 22656, 22656] + - [19, 12760.0] + - - [23040, 23040, 1, 384, 23040, 23040, 23040, 23040] + - [19, 12757.0] + - - [1152, 1152, 1, 1152, 1152, 1152, 1152, 1152] + - [30, 10503.0] + - - [1536, 1536, 1, 1536, 1536, 1536, 1536, 1536] + - [30, 11453.0] + - - [1920, 1920, 1, 1920, 1920, 1920, 1920, 1920] + - [30, 11823.0] + - - [2304, 2304, 1, 2304, 2304, 2304, 2304, 2304] + - [4, 12040.0] + - - [2688, 2688, 1, 2688, 2688, 2688, 2688, 2688] + - [19, 12171.0] + - - [3072, 3072, 1, 3072, 3072, 3072, 3072, 3072] + - [19, 12624.0] + - - [3456, 3456, 1, 3456, 3456, 3456, 3456, 3456] + - [4, 12559.0] + - - [3840, 3840, 1, 3840, 3840, 3840, 3840, 3840] + - [35, 12766.0] + - - [4224, 4224, 1, 4224, 4224, 4224, 4224, 4224] + - [19, 12664.0] + - - [4608, 4608, 1, 4608, 4608, 4608, 4608, 4608] + - [19, 12760.0] + - - [4992, 4992, 1, 4992, 4992, 4992, 4992, 4992] + - [19, 12663.0] + - - [5376, 5376, 1, 5376, 5376, 5376, 5376, 5376] + - [4, 12705.0] + - - [6144, 6144, 1, 6144, 6144, 6144, 6144, 6144] + - [19, 12744.0] + - - [6528, 6528, 1, 6528, 6528, 6528, 6528, 6528] + - [19, 12753.0] + - - [6912, 6912, 1, 6912, 6912, 6912, 6912, 6912] + - [19, 12813.0] + - - [7296, 7296, 1, 7296, 7296, 7296, 7296, 7296] + - [19, 12791.0] + - - [7680, 7680, 1, 7680, 7680, 7680, 7680, 7680] + - [19, 12835.0] + - - [8064, 4032, 1, 384, 8064, 8064, 8064, 4032] + - [4, 12426.0] + - - [8448, 4224, 1, 384, 8448, 8448, 8448, 4224] + - [19, 12653.0] + - - [8832, 4416, 1, 384, 8832, 8832, 8832, 4416] + - [4, 12456.0] + - - [9216, 4608, 1, 384, 9216, 9216, 9216, 4608] + - [4, 12645.0] + - - [9600, 4800, 1, 384, 9600, 9600, 9600, 4800] + - [4, 12475.0] + - - [9984, 4992, 1, 384, 9984, 9984, 9984, 4992] + - [4, 12643.0] + - - [10368, 5184, 1, 384, 10368, 10368, 10368, 5184] + - [35, 12489.0] + - - [10752, 5376, 1, 384, 10752, 10752, 10752, 5376] + - [35, 12675.0] + - - [11136, 5568, 1, 384, 11136, 11136, 11136, 5568] + - [4, 12519.0] + - - [11520, 5760, 1, 384, 11520, 11520, 11520, 5760] + - [35, 12686.0] + - - [11904, 5952, 1, 384, 11904, 11904, 11904, 5952] + - [19, 12552.0] + - - [12288, 6144, 1, 384, 12288, 12288, 12288, 6144] + - [4, 12695.0] + - - [12672, 6336, 1, 384, 12672, 12672, 12672, 6336] + - [4, 12568.0] + - - [13056, 6528, 1, 384, 13056, 13056, 13056, 6528] + - [19, 12692.0] + - - [13440, 6720, 1, 384, 13440, 13440, 13440, 6720] + - [19, 12576.0] + - - [13824, 6912, 1, 384, 13824, 13824, 13824, 6912] + - [35, 12716.0] + - - [14208, 7104, 1, 384, 14208, 14208, 14208, 7104] + - [4, 12607.0] + - - [14592, 7296, 1, 384, 14592, 14592, 14592, 7296] + - [19, 12731.0] + - - [14976, 7488, 1, 384, 14976, 14976, 14976, 7488] + - [4, 12606.0] + - - [15360, 7680, 1, 384, 15360, 15360, 15360, 7680] + - [4, 12736.0] + - - [15744, 7872, 1, 384, 15744, 15744, 15744, 7872] + - [35, 12614.0] + - - [16128, 8064, 1, 384, 16128, 16128, 16128, 8064] + - [19, 12742.0] + - - [16512, 8256, 1, 384, 16512, 16512, 16512, 8256] + - [4, 12625.0] + - - [16896, 8448, 1, 384, 16896, 16896, 16896, 8448] + - [4, 12731.0] + - - [17280, 8640, 1, 384, 17280, 17280, 17280, 8640] + - [4, 12642.0] + - - [17664, 8832, 1, 384, 17664, 17664, 17664, 8832] + - [4, 12730.0] + - - [18048, 9024, 1, 384, 18048, 18048, 18048, 9024] + - [4, 12643.0] + - - [18432, 9216, 1, 384, 18432, 18432, 18432, 9216] + - [4, 12735.0] + - - [18816, 9408, 1, 384, 18816, 18816, 18816, 9408] + - [4, 12652.0] + - - [19200, 9600, 1, 384, 19200, 19200, 19200, 9600] + - [4, 12738.0] + - - [19584, 9792, 1, 384, 19584, 19584, 19584, 9792] + - [19, 12662.0] + - - [19968, 9984, 1, 384, 19968, 19968, 19968, 9984] + - [35, 12741.0] + - - [20352, 10176, 1, 384, 20352, 20352, 20352, 10176] + - [4, 12663.0] + - - [20736, 10368, 1, 384, 20736, 20736, 20736, 10368] + - [12, 12739.0] + - - [21120, 10560, 1, 384, 21120, 21120, 21120, 10560] + - [4, 12668.0] + - - [21504, 10752, 1, 384, 21504, 21504, 21504, 10752] + - [19, 12747.0] + - - [21888, 10944, 1, 384, 21888, 21888, 21888, 10944] + - [4, 12669.0] + - - [22272, 11136, 1, 384, 22272, 22272, 22272, 11136] + - [19, 12751.0] + - - [22656, 11328, 1, 384, 22656, 22656, 22656, 11328] + - [19, 12678.0] + - - [23040, 11520, 1, 384, 23040, 23040, 23040, 11520] + - [4, 12752.0] + - - [8064, 16128, 1, 384, 8064, 8064, 8064, 16128] + - [19, 12743.0] + - - [8448, 16896, 1, 384, 8448, 8448, 8448, 16896] + - [19, 12735.0] + - - [8832, 17664, 1, 384, 8832, 8832, 8832, 17664] + - [19, 12735.0] + - - [9216, 18432, 1, 384, 9216, 9216, 9216, 18432] + - [19, 12738.0] + - - [9600, 19200, 1, 384, 9600, 9600, 9600, 19200] + - [19, 12745.0] + - - [9984, 19968, 1, 384, 9984, 9984, 9984, 19968] + - [19, 12743.0] + - - [10368, 20736, 1, 384, 10368, 10368, 10368, 20736] + - [35, 12752.0] + - - [10752, 21504, 1, 384, 10752, 10752, 10752, 21504] + - [35, 12746.0] + - - [11136, 22272, 1, 384, 11136, 11136, 11136, 22272] + - [35, 12758.0] + - - [11520, 23040, 1, 384, 11520, 11520, 11520, 23040] + - [4, 12751.0] + - - [11904, 23808, 1, 384, 11904, 11904, 11904, 23808] + - [35, 12759.0] + - - [12288, 24576, 1, 384, 12288, 12288, 12288, 24576] + - [4, 12752.0] + - - [12672, 25344, 1, 384, 12672, 12672, 12672, 25344] + - [4, 12755.0] + - - [13056, 26112, 1, 384, 13056, 13056, 13056, 26112] + - [19, 12754.0] + - - [13440, 26880, 1, 384, 13440, 13440, 13440, 26880] + - [19, 12757.0] + - - [13824, 27648, 1, 384, 13824, 13824, 13824, 27648] + - [4, 12753.0] + - - [14208, 28416, 1, 384, 14208, 14208, 14208, 28416] + - [35, 12762.0] + - - [14592, 29184, 1, 384, 14592, 14592, 14592, 29184] + - [4, 12753.0] + - - [14976, 29952, 1, 384, 14976, 14976, 14976, 29952] + - [35, 12760.0] + - - [15360, 30720, 1, 384, 15360, 15360, 15360, 30720] + - [19, 12756.0] + - - [15744, 31488, 1, 384, 15744, 15744, 15744, 31488] + - [19, 12760.0] + - - [16128, 32256, 1, 384, 16128, 16128, 16128, 32256] + - [4, 12754.0] + - - [16512, 33024, 1, 384, 16512, 16512, 16512, 33024] + - [19, 12763.0] + - - [16896, 33792, 1, 384, 16896, 16896, 16896, 33792] + - [19, 12758.0] + - - [17280, 34560, 1, 384, 17280, 17280, 17280, 34560] + - [1, 12730.0] + - - [17664, 35328, 1, 384, 17664, 17664, 17664, 35328] + - [27, 12230.0] + - - [18048, 36096, 1, 384, 18048, 18048, 18048, 36096] + - [27, 12678.0] + - - [18432, 36864, 1, 384, 18432, 18432, 18432, 36864] + - [14, 12336.0] + - - [18816, 37632, 1, 384, 18816, 18816, 18816, 37632] + - [19, 12290.0] + - - [19200, 38400, 1, 384, 19200, 19200, 19200, 38400] + - [9, 12290.0] + - - [19584, 39168, 1, 384, 19584, 19584, 19584, 39168] + - [6, 12460.0] + - - [19968, 39936, 1, 384, 19968, 19968, 19968, 39936] + - [13, 12399.0] + - - [20352, 40704, 1, 384, 20352, 20352, 20352, 40704] + - [37, 12388.0] + - - [20736, 41472, 1, 384, 20736, 20736, 20736, 41472] + - [27, 12625.0] + - - [21120, 42240, 1, 384, 21120, 21120, 21120, 42240] + - [16, 12341.0] + - - [21504, 43008, 1, 384, 21504, 21504, 21504, 43008] + - [42, 12370.0] + - - [21888, 43776, 1, 384, 21888, 21888, 21888, 43776] + - [35, 12364.0] + - - [22272, 44544, 1, 384, 22272, 22272, 22272, 44544] + - [42, 12394.0] + - - [22656, 45312, 1, 384, 22656, 22656, 22656, 45312] + - [40, 12361.0] + - - [23040, 46080, 1, 384, 23040, 23040, 23040, 46080] + - [42, 12428.0] + - - [1152, 1536, 1, 384, 1152, 1152, 1152, 1536] + - [30, 10952.0] + - - [1920, 1536, 1, 384, 1920, 1920, 1920, 1536] + - [36, 11556.0] + - - [2304, 1536, 1, 384, 2304, 2304, 2304, 1536] + - [28, 11595.0] + - - [2688, 1536, 1, 384, 2688, 2688, 2688, 1536] + - [16, 11470.0] + - - [3456, 1536, 1, 384, 3456, 3456, 3456, 1536] + - [35, 11475.0] + - - [3840, 1536, 1, 384, 3840, 3840, 3840, 1536] + - [42, 12022.0] + - - [4224, 1536, 1, 384, 4224, 4224, 4224, 1536] + - [36, 11992.0] + - - [4608, 1536, 1, 384, 4608, 4608, 4608, 1536] + - [31, 11974.0] + - - [4992, 1536, 1, 384, 4992, 4992, 4992, 1536] + - [31, 12078.0] + - - [5376, 1536, 1, 384, 5376, 5376, 5376, 1536] + - [4, 12030.0] + - - [5760, 1536, 1, 384, 5760, 5760, 5760, 1536] + - [19, 12401.0] + - - [6144, 1536, 1, 384, 6144, 6144, 6144, 1536] + - [19, 12330.0] + - - [6528, 1536, 1, 384, 6528, 6528, 6528, 1536] + - [1, 12276.0] + - - [6912, 1536, 1, 384, 6912, 6912, 6912, 1536] + - [19, 12237.0] + - - [7296, 1536, 1, 384, 7296, 7296, 7296, 1536] + - [35, 12216.0] + - - [7680, 1536, 1, 384, 7680, 7680, 7680, 1536] + - [19, 12482.0] + - - [8064, 1536, 1, 384, 8064, 8064, 8064, 1536] + - [42, 12455.0] + - - [8448, 1536, 1, 384, 8448, 8448, 8448, 1536] + - [35, 12422.0] + - - [8832, 1536, 1, 384, 8832, 8832, 8832, 1536] + - [19, 12404.0] + - - [9216, 1536, 1, 384, 9216, 9216, 9216, 1536] + - [4, 12304.0] + - - [9600, 1536, 1, 384, 9600, 9600, 9600, 1536] + - [19, 12579.0] + - - [9984, 1536, 1, 384, 9984, 9984, 9984, 1536] + - [4, 12536.0] + - - [10368, 1536, 1, 384, 10368, 10368, 10368, 1536] + - [35, 12485.0] + - - [10752, 1536, 1, 384, 10752, 10752, 10752, 1536] + - [19, 12465.0] + - - [11136, 1536, 1, 384, 11136, 11136, 11136, 1536] + - [4, 12439.0] + - - [11520, 1536, 1, 384, 11520, 11520, 11520, 1536] + - [19, 12603.0] + - - [11904, 1536, 1, 384, 11904, 11904, 11904, 1536] + - [4, 12573.0] + - - [12288, 1536, 1, 384, 12288, 12288, 12288, 1536] + - [27, 12501.0] + - - [12672, 1536, 1, 384, 12672, 12672, 12672, 1536] + - [12, 12487.0] + - - [13056, 1536, 1, 384, 13056, 13056, 13056, 1536] + - [35, 12463.0] + - - [13440, 1536, 1, 384, 13440, 13440, 13440, 1536] + - [35, 12621.0] + - - [13824, 1536, 1, 384, 13824, 13824, 13824, 1536] + - [42, 12562.0] + - - [14208, 1536, 1, 384, 14208, 14208, 14208, 1536] + - [19, 12554.0] + - - [14592, 1536, 1, 384, 14592, 14592, 14592, 1536] + - [35, 12531.0] + - - [14976, 1536, 1, 384, 14976, 14976, 14976, 1536] + - [16, 12487.0] + - - [15360, 1536, 1, 384, 15360, 15360, 15360, 1536] + - [19, 12616.0] + - - [15744, 1536, 1, 384, 15744, 15744, 15744, 1536] + - [35, 12602.0] + - - [16128, 1536, 1, 384, 16128, 16128, 16128, 1536] + - [19, 12564.0] + - - [16512, 1536, 1, 384, 16512, 16512, 16512, 1536] + - [1, 12536.0] + - - [16896, 1536, 1, 384, 16896, 16896, 16896, 1536] + - [4, 12491.0] + - - [17280, 1536, 1, 384, 17280, 17280, 17280, 1536] + - [4, 12616.0] + - - [17664, 1536, 1, 384, 17664, 17664, 17664, 1536] + - [4, 12585.0] + - - [18048, 1536, 1, 384, 18048, 18048, 18048, 1536] + - [42, 12574.0] + - - [18432, 1536, 1, 384, 18432, 18432, 18432, 1536] + - [4, 12568.0] + - - [18816, 1536, 1, 384, 18816, 18816, 18816, 1536] + - [19, 12559.0] + - - [19200, 1536, 1, 384, 19200, 19200, 19200, 1536] + - [35, 12652.0] + - - [19584, 1536, 1, 384, 19584, 19584, 19584, 1536] + - [35, 12632.0] + - - [19968, 1536, 1, 384, 19968, 19968, 19968, 1536] + - [42, 12590.0] + - - [20352, 1536, 1, 384, 20352, 20352, 20352, 1536] + - [35, 12596.0] + - - [20736, 1536, 1, 384, 20736, 20736, 20736, 1536] + - [19, 12575.0] + - - [21120, 1536, 1, 384, 21120, 21120, 21120, 1536] + - [4, 12639.0] + - - [21504, 1536, 1, 384, 21504, 21504, 21504, 1536] + - [4, 12609.0] + - - [21888, 1536, 1, 384, 21888, 21888, 21888, 1536] + - [19, 12591.0] + - - [22272, 1536, 1, 384, 22272, 22272, 22272, 1536] + - [19, 12567.0] + - - [22656, 1536, 1, 384, 22656, 22656, 22656, 1536] + - [35, 12563.0] + - - [23040, 1536, 1, 384, 23040, 23040, 23040, 1536] + - [4, 12633.0] + - - [768, 1920, 1, 384, 768, 768, 768, 1920] + - [3, 10627.0] + - - [1152, 1920, 1, 384, 1152, 1152, 1152, 1920] + - [20, 11398.0] + - - [1536, 1920, 1, 384, 1536, 1536, 1536, 1920] + - [30, 11613.0] + - - [2304, 1920, 1, 384, 2304, 2304, 2304, 1920] + - [31, 11606.0] + - - [2688, 1920, 1, 384, 2688, 2688, 2688, 1920] + - [19, 11777.0] + - - [3072, 1920, 1, 384, 3072, 3072, 3072, 1920] + - [20, 11982.0] + - - [3456, 1920, 1, 384, 3456, 3456, 3456, 1920] + - [30, 11864.0] + - - [4224, 1920, 1, 384, 4224, 4224, 4224, 1920] + - [35, 12263.0] + - - [4608, 1920, 1, 384, 4608, 4608, 4608, 1920] + - [4, 12387.0] + - - [4992, 1920, 1, 384, 4992, 4992, 4992, 1920] + - [4, 12159.0] + - - [5376, 1920, 1, 384, 5376, 5376, 5376, 1920] + - [19, 12275.0] + - - [5760, 1920, 1, 384, 5760, 5760, 5760, 1920] + - [42, 12397.0] + - - [6144, 1920, 1, 384, 6144, 6144, 6144, 1920] + - [4, 12515.0] + - - [6528, 1920, 1, 384, 6528, 6528, 6528, 1920] + - [35, 12337.0] + - - [6912, 1920, 1, 384, 6912, 6912, 6912, 1920] + - [4, 12399.0] + - - [7296, 1920, 1, 384, 7296, 7296, 7296, 1920] + - [35, 12502.0] + - - [7680, 1920, 1, 384, 7680, 7680, 7680, 1920] + - [4, 12572.0] + - - [8064, 1920, 1, 384, 8064, 8064, 8064, 1920] + - [1, 12391.0] + - - [8448, 1920, 1, 384, 8448, 8448, 8448, 1920] + - [4, 12485.0] + - - [8832, 1920, 1, 384, 8832, 8832, 8832, 1920] + - [19, 12562.0] + - - [9216, 1920, 1, 384, 9216, 9216, 9216, 1920] + - [27, 12578.0] + - - [9600, 1920, 1, 384, 9600, 9600, 9600, 1920] + - [35, 12458.0] + - - [9984, 1920, 1, 384, 9984, 9984, 9984, 1920] + - [19, 12488.0] + - - [10368, 1920, 1, 384, 10368, 10368, 10368, 1920] + - [35, 12566.0] + - - [10752, 1920, 1, 384, 10752, 10752, 10752, 1920] + - [42, 12605.0] + - - [11136, 1920, 1, 384, 11136, 11136, 11136, 1920] + - [35, 12500.0] + - - [11520, 1920, 1, 384, 11520, 11520, 11520, 1920] + - [42, 12537.0] + - - [11904, 1920, 1, 384, 11904, 11904, 11904, 1920] + - [4, 12587.0] + - - [12288, 1920, 1, 384, 12288, 12288, 12288, 1920] + - [4, 12606.0] + - - [12672, 1920, 1, 384, 12672, 12672, 12672, 1920] + - [4, 12519.0] + - - [13056, 1920, 1, 384, 13056, 13056, 13056, 1920] + - [4, 12547.0] + - - [13440, 1920, 1, 384, 13440, 13440, 13440, 1920] + - [4, 12604.0] + - - [13824, 1920, 1, 384, 13824, 13824, 13824, 1920] + - [4, 12611.0] + - - [14208, 1920, 1, 384, 14208, 14208, 14208, 1920] + - [4, 12525.0] + - - [14592, 1920, 1, 384, 14592, 14592, 14592, 1920] + - [4, 12546.0] + - - [14976, 1920, 1, 384, 14976, 14976, 14976, 1920] + - [4, 12591.0] + - - [15360, 1920, 1, 384, 15360, 15360, 15360, 1920] + - [4, 12652.0] + - - [15744, 1920, 1, 384, 15744, 15744, 15744, 1920] + - [4, 12554.0] + - - [16128, 1920, 1, 384, 16128, 16128, 16128, 1920] + - [35, 12588.0] + - - [16512, 1920, 1, 384, 16512, 16512, 16512, 1920] + - [35, 12629.0] + - - [16896, 1920, 1, 384, 16896, 16896, 16896, 1920] + - [4, 12629.0] + - - [17280, 1920, 1, 384, 17280, 17280, 17280, 1920] + - [19, 12559.0] + - - [17664, 1920, 1, 384, 17664, 17664, 17664, 1920] + - [4, 12582.0] + - - [18048, 1920, 1, 384, 18048, 18048, 18048, 1920] + - [4, 12617.0] + - - [18432, 1920, 1, 384, 18432, 18432, 18432, 1920] + - [19, 12581.0] + - - [18816, 1920, 1, 384, 18816, 18816, 18816, 1920] + - [27, 12555.0] + - - [19200, 1920, 1, 384, 19200, 19200, 19200, 1920] + - [4, 12600.0] + - - [19584, 1920, 1, 384, 19584, 19584, 19584, 1920] + - [19, 12629.0] + - - [19968, 1920, 1, 384, 19968, 19968, 19968, 1920] + - [19, 12640.0] + - - [20352, 1920, 1, 384, 20352, 20352, 20352, 1920] + - [19, 12583.0] + - - [20736, 1920, 1, 384, 20736, 20736, 20736, 1920] + - [4, 12618.0] + - - [21120, 1920, 1, 384, 21120, 21120, 21120, 1920] + - [19, 12638.0] + - - [21504, 1920, 1, 384, 21504, 21504, 21504, 1920] + - [4, 12657.0] + - - [21888, 1920, 1, 384, 21888, 21888, 21888, 1920] + - [19, 12604.0] + - - [22272, 1920, 1, 384, 22272, 22272, 22272, 1920] + - [4, 12620.0] + - - [22656, 1920, 1, 384, 22656, 22656, 22656, 1920] + - [4, 12647.0] + - - [23040, 1920, 1, 384, 23040, 23040, 23040, 1920] + - [4, 12662.0] + - - [768, 2304, 1, 384, 768, 768, 768, 2304] + - [8, 10931.0] + - - [1536, 2304, 1, 384, 1536, 1536, 1536, 2304] + - [20, 11507.0] + - - [1920, 2304, 1, 384, 1920, 1920, 1920, 2304] + - [34, 11485.0] + - - [2688, 2304, 1, 384, 2688, 2688, 2688, 2304] + - [40, 12070.0] + - - [3072, 2304, 1, 384, 3072, 3072, 3072, 2304] + - [1, 12019.0] + - - [3456, 2304, 1, 384, 3456, 3456, 3456, 2304] + - [1, 12068.0] + - - [3840, 2304, 1, 384, 3840, 3840, 3840, 2304] + - [4, 12386.0] + - - [4224, 2304, 1, 384, 4224, 4224, 4224, 2304] + - [19, 12326.0] + - - [4992, 2304, 1, 384, 4992, 4992, 4992, 2304] + - [4, 12196.0] + - - [5376, 2304, 1, 384, 5376, 5376, 5376, 2304] + - [19, 12426.0] + - - [5760, 2304, 1, 384, 5760, 5760, 5760, 2304] + - [19, 12394.0] + - - [6144, 2304, 1, 384, 6144, 6144, 6144, 2304] + - [4, 12352.0] + - - [6528, 2304, 1, 384, 6528, 6528, 6528, 2304] + - [42, 12514.0] + - - [6912, 2304, 1, 384, 6912, 6912, 6912, 2304] + - [12, 12454.0] + - - [7296, 2304, 1, 384, 7296, 7296, 7296, 2304] + - [4, 12408.0] + - - [7680, 2304, 1, 384, 7680, 7680, 7680, 2304] + - [19, 12566.0] + - - [8064, 2304, 1, 384, 8064, 8064, 8064, 2304] + - [19, 12519.0] + - - [8448, 2304, 1, 384, 8448, 8448, 8448, 2304] + - [42, 12443.0] + - - [8832, 2304, 1, 384, 8832, 8832, 8832, 2304] + - [16, 12421.0] + - - [9216, 2304, 1, 384, 9216, 9216, 9216, 2304] + - [4, 12530.0] + - - [9600, 2304, 1, 384, 9600, 9600, 9600, 2304] + - [19, 12511.0] + - - [9984, 2304, 1, 384, 9984, 9984, 9984, 2304] + - [35, 12477.0] + - - [10368, 2304, 1, 384, 10368, 10368, 10368, 2304] + - [4, 12578.0] + - - [10752, 2304, 1, 384, 10752, 10752, 10752, 2304] + - [4, 12537.0] + - - [11136, 2304, 1, 384, 11136, 11136, 11136, 2304] + - [35, 12505.0] + - - [11520, 2304, 1, 384, 11520, 11520, 11520, 2304] + - [4, 12607.0] + - - [11904, 2304, 1, 384, 11904, 11904, 11904, 2304] + - [4, 12581.0] + - - [12288, 2304, 1, 384, 12288, 12288, 12288, 2304] + - [4, 12549.0] + - - [12672, 2304, 1, 384, 12672, 12672, 12672, 2304] + - [35, 12545.0] + - - [13056, 2304, 1, 384, 13056, 13056, 13056, 2304] + - [4, 12631.0] + - - [13440, 2304, 1, 384, 13440, 13440, 13440, 2304] + - [19, 12607.0] + - - [13824, 2304, 1, 384, 13824, 13824, 13824, 2304] + - [12, 12564.0] + - - [14208, 2304, 1, 384, 14208, 14208, 14208, 2304] + - [4, 12635.0] + - - [14592, 2304, 1, 384, 14592, 14592, 14592, 2304] + - [4, 12598.0] + - - [14976, 2304, 1, 384, 14976, 14976, 14976, 2304] + - [35, 12566.0] + - - [15360, 2304, 1, 384, 15360, 15360, 15360, 2304] + - [27, 12581.0] + - - [15744, 2304, 1, 384, 15744, 15744, 15744, 2304] + - [12, 12612.0] + - - [16128, 2304, 1, 384, 16128, 16128, 16128, 2304] + - [35, 12581.0] + - - [16512, 2304, 1, 384, 16512, 16512, 16512, 2304] + - [35, 12573.0] + - - [16896, 2304, 1, 384, 16896, 16896, 16896, 2304] + - [4, 12638.0] + - - [17280, 2304, 1, 384, 17280, 17280, 17280, 2304] + - [12, 12613.0] + - - [17664, 2304, 1, 384, 17664, 17664, 17664, 2304] + - [4, 12583.0] + - - [18048, 2304, 1, 384, 18048, 18048, 18048, 2304] + - [4, 12651.0] + - - [18432, 2304, 1, 384, 18432, 18432, 18432, 2304] + - [19, 12625.0] + - - [18816, 2304, 1, 384, 18816, 18816, 18816, 2304] + - [19, 12619.0] + - - [19200, 2304, 1, 384, 19200, 19200, 19200, 2304] + - [19, 12667.0] + - - [19584, 2304, 1, 384, 19584, 19584, 19584, 2304] + - [4, 12641.0] + - - [19968, 2304, 1, 384, 19968, 19968, 19968, 2304] + - [19, 12617.0] + - - [20352, 2304, 1, 384, 20352, 20352, 20352, 2304] + - [19, 12617.0] + - - [20736, 2304, 1, 384, 20736, 20736, 20736, 2304] + - [19, 12653.0] + - - [21120, 2304, 1, 384, 21120, 21120, 21120, 2304] + - [12, 12632.0] + - - [21504, 2304, 1, 384, 21504, 21504, 21504, 2304] + - [12, 12614.0] + - - [21888, 2304, 1, 384, 21888, 21888, 21888, 2304] + - [19, 12673.0] + - - [22272, 2304, 1, 384, 22272, 22272, 22272, 2304] + - [33, 12203.0] + - - [22656, 2304, 1, 384, 22656, 22656, 22656, 2304] + - [29, 12336.0] + - - [23040, 2304, 1, 384, 23040, 23040, 23040, 2304] + - [24, 12608.0] + - - [256, 32768, 1, 1, 256, 256, 256, 32768] + - [36, 197.0] + - - [289, 128, 64, 768, 289, 289, 289, 128] + - [36, 8719.0] + - - [289, 160, 64, 768, 289, 289, 289, 160] + - [0, 7355.0] + - - [289, 192, 64, 768, 289, 289, 289, 192] + - [30, 8802.0] + - - [3136, 256, 32, 64, 3136, 3136, 3136, 256] + - [34, 10563.0] + - - [784, 512, 32, 128, 784, 784, 784, 512] + - [20, 10342.0] + - - [784, 128, 32, 512, 784, 784, 784, 128] + - [0, 10068.0] + - - [196, 1024, 32, 256, 196, 196, 196, 1024] + - [20, 8986.0] + - - [1444, 128, 120, 256, 1444, 1444, 1444, 128] + - [4, 11481.0] + - - [1444, 128, 18, 256, 1444, 1444, 1444, 128] + - [5, 10770.0] + - - [1444, 128, 19, 256, 1444, 1444, 1444, 128] + - [15, 10808.0] + - - [1444, 256, 120, 256, 1444, 1444, 1444, 256] + - [1, 11690.0] + - - [1444, 256, 18, 256, 1444, 1444, 1444, 256] + - [1, 11122.0] + - - [1444, 256, 19, 256, 1444, 1444, 1444, 256] + - [36, 11294.0] + - - [361, 512, 120, 256, 361, 361, 361, 512] + - [5, 11298.0] + - - [361, 512, 18, 256, 361, 361, 361, 512] + - [0, 10507.0] + - - [361, 512, 19, 256, 361, 361, 361, 512] + - [30, 10657.0] + - - [7680, 8192, 1, 8192, 7680, 7680, 7680, 8192] + - [35, 12501.0] + - - [3840, 4096, 1, 4096, 3840, 3840, 3840, 4096] + - [19, 12373.0] + - - [1920, 2048, 1, 2048, 1920, 1920, 1920, 2048] + - [1, 12411.0] + - - [8192, 7680, 1, 8192, 8192, 8192, 8192, 7680] + - [42, 12522.0] + - - [4096, 3840, 1, 4096, 4096, 4096, 4096, 3840] + - [19, 12206.0] + - - [2048, 1920, 1, 2048, 2048, 2048, 2048, 1920] + - [42, 12322.0] + - - [8192, 8192, 1, 8192, 8192, 8192, 8192, 8192] + - [35, 12533.0] + - - [4096, 4096, 1, 4096, 4096, 4096, 4096, 4096] + - [5, 12301.0] + - - [2048, 2048, 1, 2048, 2048, 2048, 2048, 2048] + - [4, 12282.0] + - - [1024, 4096, 1, 512, 1024, 1024, 1024, 4096] + - [4, 11772.0] + - - [1024, 30522, 1, 77, 1024, 1024, 1024, 30522] + - [16, 10581.0] + - - [4096, 1024, 1, 512, 4096, 4096, 4096, 1024] + - [1, 11942.0] + - - [1024, 4096, 1, 1280, 1024, 1024, 1024, 4096] + - [4, 12247.0] + - - [1024, 30522, 1, 200, 1024, 1024, 1024, 30522] + - [24, 12235.0] + - - [4096, 1024, 1, 1280, 4096, 4096, 4096, 1024] + - [1, 12239.0] + - - [1024, 4096, 1, 4992, 1024, 1024, 1024, 4096] + - [15, 11853.0] + - - [1024, 30522, 1, 780, 1024, 1024, 1024, 30522] + - [21, 12317.0] + - - [4096, 1024, 1, 4992, 4096, 4096, 4096, 1024] + - [22, 11889.0] + - - [1024, 30522, 1, 308, 1024, 1024, 1024, 30522] + - [19, 12436.0] + - - [1024, 4096, 1, 5120, 1024, 1024, 1024, 4096] + - [16, 12288.0] + - - [1024, 30522, 1, 800, 1024, 1024, 1024, 30522] + - [21, 11505.0] + - - [4096, 1024, 1, 5120, 4096, 4096, 4096, 1024] + - [21, 11226.0] + - - [1024, 4096, 1, 5248, 1024, 1024, 1024, 4096] + - [28, 12171.0] + - - [1024, 30522, 1, 820, 1024, 1024, 1024, 30522] + - [28, 11471.0] + - - [4096, 1024, 1, 5248, 4096, 4096, 4096, 1024] + - [3, 11891.0] + - - [1024, 4096, 1, 2560, 1024, 1024, 1024, 4096] + - [16, 12299.0] + - - [1024, 30522, 1, 385, 1024, 1024, 1024, 30522] + - [19, 12502.0] + - - [4096, 1024, 1, 2560, 4096, 4096, 4096, 1024] + - [1, 12316.0] + - - [1024, 30522, 1, 462, 1024, 1024, 1024, 30522] + - [27, 12465.0] + - - [1024, 4096, 1, 1024, 1024, 1024, 1024, 4096] + - [1, 12124.0] + - - [1024, 30522, 1, 160, 1024, 1024, 1024, 30522] + - [20, 12052.0] + - - [4096, 1024, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12179.0] + - - [1024, 4096, 1, 1152, 1024, 1024, 1024, 4096] + - [4, 12239.0] + - - [1024, 30522, 1, 180, 1024, 1024, 1024, 30522] + - [19, 12119.0] + - - [4096, 1024, 1, 1152, 4096, 4096, 4096, 1024] + - [1, 12185.0] + - - [1024, 4096, 1, 8192, 1024, 1024, 1024, 4096] + - [38, 12099.0] + - - [1024, 4096, 1, 9600, 1024, 1024, 1024, 4096] + - [9, 12187.0] + - - [1024, 33712, 1, 8192, 1024, 1024, 1024, 33712] + - [4, 12612.0] + - - [1024, 33712, 1, 9600, 1024, 1024, 1024, 33712] + - [12, 12599.0] + - - [4096, 1024, 1, 8192, 4096, 4096, 4096, 1024] + - [19, 12406.0] + - - [4096, 1024, 1, 9600, 4096, 4096, 4096, 1024] + - [35, 12412.0] + - - [1024, 4096, 1, 10064, 1024, 1024, 1024, 4096] + - [19, 12402.0] + - - [1024, 4096, 1, 10080, 1024, 1024, 1024, 4096] + - [4, 12405.0] + - - [1024, 4096, 1, 6528, 1024, 1024, 1024, 4096] + - [35, 12400.0] + - - [1024, 4096, 1, 7104, 1024, 1024, 1024, 4096] + - [19, 12408.0] + - - [1024, 4096, 1, 8064, 1024, 1024, 1024, 4096] + - [35, 12402.0] + - - [1024, 4096, 1, 9216, 1024, 1024, 1024, 4096] + - [4, 12409.0] + - - [1024, 42720, 1, 10080, 1024, 1024, 1024, 42720] + - [35, 12786.0] + - - [1024, 42720, 1, 6528, 1024, 1024, 1024, 42720] + - [4, 12789.0] + - - [1024, 42720, 1, 7104, 1024, 1024, 1024, 42720] + - [19, 12783.0] + - - [4096, 1024, 1, 10064, 4096, 4096, 4096, 1024] + - [35, 12393.0] + - - [4096, 1024, 1, 10080, 4096, 4096, 4096, 1024] + - [19, 12384.0] + - - [4096, 1024, 1, 6528, 4096, 4096, 4096, 1024] + - [19, 12380.0] + - - [4096, 1024, 1, 7104, 4096, 4096, 4096, 1024] + - [19, 12383.0] + - - [4096, 1024, 1, 8064, 4096, 4096, 4096, 1024] + - [19, 12384.0] + - - [4096, 1024, 1, 9216, 4096, 4096, 4096, 1024] + - [42, 12400.0] + - - [1024, 1600, 1, 1, 1024, 1024, 1024, 1600] + - [0, 237.0] + - - [2048, 960, 1, 1, 2048, 2048, 2048, 960] + - [3, 243.0] + - - [2048, 2048, 1, 2, 2048, 2048, 2048, 2048] + - [0, 519.0] + - - [2048, 30592, 1, 1024, 2048, 2048, 2048, 30592] + - [19, 12756.0] + - - [2048, 6144, 1, 1024, 2048, 2048, 2048, 6144] + - [19, 12497.0] + - - [2048, 8192, 1, 1024, 2048, 2048, 2048, 8192] + - [19, 12536.0] + - - [8192, 2048, 1, 1024, 8192, 8192, 8192, 2048] + - [4, 12535.0] + - - [1024, 30592, 1, 8192, 1024, 1024, 1024, 30592] + - [35, 12768.0] + - - [1024, 3072, 1, 8192, 1024, 1024, 1024, 3072] + - [6, 12032.0] + - - [1024, 30592, 1, 2048, 1024, 1024, 1024, 30592] + - [19, 12749.0] + - - [1024, 30592, 1, 4096, 1024, 1024, 1024, 30592] + - [19, 12765.0] + - - [1024, 3072, 1, 4096, 1024, 1024, 1024, 3072] + - [6, 11970.0] + - - [2560, 1920, 1, 2048, 2560, 2560, 2560, 1920] + - [31, 12540.0] + - - [2560, 2560, 1, 2048, 2560, 2560, 2560, 2560] + - [1, 12613.0] + - - [2560, 2560, 1, 4, 2560, 2560, 2560, 2560] + - [1, 1053.0] + - - [2560, 7680, 1, 2048, 2560, 2560, 2560, 7680] + - [4, 12776.0] + - - [640, 2560, 1, 2048, 640, 640, 640, 2560] + - [5, 12102.0] + - - [1536, 1536, 1, 4096, 1536, 1536, 1536, 1536] + - [30, 11588.0] + - - [1536, 4608, 1, 4096, 1536, 1536, 1536, 4608] + - [19, 12457.0] + - - [1536, 50304, 1, 4096, 1536, 1536, 1536, 50304] + - [19, 12803.0] + - - [1536, 6144, 1, 4096, 1536, 1536, 1536, 6144] + - [19, 12638.0] + - - [6144, 1536, 1, 4096, 6144, 6144, 6144, 1536] + - [19, 12642.0] + - - [1536, 1536, 1, 8192, 1536, 1536, 1536, 1536] + - [3, 11586.0] + - - [1536, 4608, 1, 8192, 1536, 1536, 1536, 4608] + - [35, 12468.0] + - - [1536, 50304, 1, 8192, 1536, 1536, 1536, 50304] + - [42, 12820.0] + - - [1536, 6144, 1, 8192, 1536, 1536, 1536, 6144] + - [19, 12652.0] + - - [6144, 1536, 1, 8192, 6144, 6144, 6144, 1536] + - [4, 12644.0] + - - [1024, 3072, 1, 16384, 1024, 1024, 1024, 3072] + - [6, 12039.0] + - - [1024, 4096, 1, 16384, 1024, 1024, 1024, 4096] + - [35, 12395.0] + - - [1024, 50304, 1, 16384, 1024, 1024, 1024, 50304] + - [19, 12721.0] + - - [4096, 1024, 1, 16384, 4096, 4096, 4096, 1024] + - [12, 12368.0] + - - [1024, 50304, 1, 2048, 1024, 1024, 1024, 50304] + - [35, 12761.0] + - - [1024, 50304, 1, 4096, 1024, 1024, 1024, 50304] + - [35, 12750.0] + - - [1024, 50304, 1, 8192, 1024, 1024, 1024, 50304] + - [19, 12773.0] + - - [1024, 30528, 1, 8192, 1024, 1024, 1024, 30528] + - [35, 12712.0] + - - [256, 6912, 1, 1, 256, 256, 256, 6912] + - [11, 243.0] + - - [30528, 1024, 1, 640, 30528, 30528, 30528, 1024] + - [19, 12606.0] + - - [30528, 1024, 1, 1280, 30528, 30528, 30528, 1024] + - [19, 12682.0] + - - [4096, 1024, 1, 10240, 4096, 4096, 4096, 1024] + - [35, 12397.0] + - - [1024, 4096, 1, 10240, 1024, 1024, 1024, 4096] + - [4, 12389.0] + - - [30528, 1024, 1, 1600, 30528, 30528, 30528, 1024] + - [35, 12701.0] + - - [1024, 4096, 1, 10496, 1024, 1024, 1024, 4096] + - [35, 12386.0] + - - [30528, 1024, 1, 1640, 30528, 30528, 30528, 1024] + - [35, 12700.0] + - - [4096, 1024, 1, 10496, 4096, 4096, 4096, 1024] + - [35, 12389.0] + - - [30528, 1024, 1, 160, 30528, 30528, 30528, 1024] + - [1, 12196.0] + - - [1024, 4096, 1, 6144, 1024, 1024, 1024, 4096] + - [19, 12375.0] + - - [30528, 1024, 1, 240, 30528, 30528, 30528, 1024] + - [9, 12403.0] + - - [4096, 1024, 1, 6144, 4096, 4096, 4096, 1024] + - [4, 12370.0] + - - [3136, 128, 64, 256, 3136, 3136, 3136, 128] + - [31, 12024.0] + - - [784, 256, 64, 512, 784, 784, 784, 256] + - [1, 10912.0] + - - [3136, 256, 64, 128, 3136, 3136, 3136, 256] + - [0, 11906.0] + - - [3136, 256, 64, 256, 3136, 3136, 3136, 256] + - [4, 12237.0] + - - [196, 512, 64, 1024, 196, 196, 196, 512] + - [4, 9418.0] + - - [784, 512, 64, 256, 784, 784, 784, 512] + - [16, 10844.0] + - - [784, 512, 64, 512, 784, 784, 784, 512] + - [35, 11013.0] + - - [196, 1024, 64, 512, 196, 196, 196, 1024] + - [31, 9412.0] + - - [196, 1024, 64, 1024, 196, 196, 196, 1024] + - [4, 9546.0] + - - [3136, 128, 32, 256, 3136, 3136, 3136, 128] + - [20, 11939.0] + - - [784, 256, 32, 512, 784, 784, 784, 256] + - [1, 10564.0] + - - [3136, 256, 32, 128, 3136, 3136, 3136, 256] + - [15, 11776.0] + - - [3136, 256, 32, 256, 3136, 3136, 3136, 256] + - [16, 12116.0] + - - [196, 512, 32, 1024, 196, 196, 196, 512] + - [36, 9263.0] + - - [784, 512, 32, 256, 784, 784, 784, 512] + - [1, 10720.0] + - - [784, 512, 32, 512, 784, 784, 784, 512] + - [16, 10920.0] + - - [196, 1024, 32, 512, 196, 196, 196, 1024] + - [1, 9288.0] + - - [196, 1024, 32, 1024, 196, 196, 196, 1024] + - [4, 9428.0] + - - [1024, 4096, 1, 10224, 1024, 1024, 1024, 4096] + - [4, 12384.0] + - - [4096, 1024, 1, 10224, 4096, 4096, 4096, 1024] + - [19, 12386.0] + - - [1024, 3072, 1, 10224, 1024, 1024, 1024, 3072] + - [6, 12041.0] + - - [1024, 3072, 1, 10240, 1024, 1024, 1024, 3072] + - [29, 12015.0] + - - [4096, 1024, 1, 10192, 4096, 4096, 4096, 1024] + - [35, 12393.0] + - - [1024, 3072, 1, 10192, 1024, 1024, 1024, 3072] + - [37, 12030.0] + - - [1024, 4096, 1, 10192, 1024, 1024, 1024, 4096] + - [27, 12362.0] + - - [1024, 3072, 1, 10200, 1024, 1024, 1024, 3072] + - [19, 12008.0] + - - [4096, 1024, 1, 10208, 4096, 4096, 4096, 1024] + - [35, 12384.0] + - - [1024, 3072, 1, 10208, 1024, 1024, 1024, 3072] + - [21, 12030.0] + - - [1024, 4096, 1, 10208, 1024, 1024, 1024, 4096] + - [35, 12379.0] + - - [1024, 2048, 1, 10224, 1024, 1024, 1024, 2048] + - [30, 11794.0] + - - [1024, 2048, 1, 10240, 1024, 1024, 1024, 2048] + - [15, 11785.0] + - - [1024, 2048, 1, 10192, 1024, 1024, 1024, 2048] + - [18, 11772.0] + - - [1024, 3072, 1, 10080, 1024, 1024, 1024, 3072] + - [4, 12009.0] + - - [100352, 256, 1, 512, 100352, 100352, 100352, 256] + - [19, 12564.0] + - - [12544, 1024, 1, 2048, 12544, 12544, 12544, 1024] + - [35, 12494.0] + - - [12544, 147, 1, 64, 12544, 12544, 12544, 147] + - [39, 6869.0] + - - [200704, 256, 1, 512, 200704, 200704, 200704, 256] + - [4, 12705.0] + - - [25088, 512, 1, 1024, 25088, 25088, 25088, 512] + - [19, 12444.0] + - - [3136, 576, 1, 64, 3136, 3136, 3136, 576] + - [0, 8438.0] + - - [50176, 512, 1, 1024, 50176, 50176, 50176, 512] + - [35, 12666.0] + - - [6272, 1024, 1, 2048, 6272, 6272, 6272, 1024] + - [4, 12372.0] + - - [196, 1024, 128, 512, 196, 196, 196, 1024] + - [1, 9544.0] + - - [196, 1024, 256, 512, 196, 196, 196, 1024] + - [31, 9616.0] + - - [3136, 256, 128, 128, 3136, 3136, 3136, 256] + - [19, 12021.0] + - - [3136, 256, 256, 128, 3136, 3136, 3136, 256] + - [19, 12062.0] + - - [784, 512, 128, 256, 784, 784, 784, 512] + - [31, 10923.0] + - - [784, 512, 256, 256, 784, 784, 784, 512] + - [35, 10981.0] + - - [30528, 1024, 1, 2560, 30528, 30528, 30528, 1024] + - [4, 12704.0] + - - [1024, 4096, 1, 12288, 1024, 1024, 1024, 4096] + - [12, 12376.0] + - - [30528, 1024, 1, 1920, 30528, 30528, 30528, 1024] + - [4, 12703.0] + - - [4096, 1024, 1, 12288, 4096, 4096, 4096, 1024] + - [19, 12386.0] + - - [25600, 128, 25, 128, 25600, 25600, 25600, 128] + - [41, 10545.0] + - - [12544, 128, 36, 128, 12544, 12544, 12544, 128] + - [34, 10729.0] + - - [9216, 128, 49, 128, 9216, 9216, 9216, 128] + - [26, 10791.0] + - - [6400, 128, 64, 128, 6400, 6400, 6400, 128] + - [3, 10804.0] + - - [6400, 256, 25, 256, 6400, 6400, 6400, 256] + - [4, 12553.0] + - - [4096, 256, 36, 256, 4096, 4096, 4096, 256] + - [4, 12480.0] + - - [2304, 256, 49, 256, 2304, 2304, 2304, 256] + - [4, 12422.0] + - - [2304, 256, 64, 256, 2304, 2304, 2304, 256] + - [35, 12494.0] + - - [2304, 512, 25, 512, 2304, 2304, 2304, 512] + - [35, 12690.0] + - - [1024, 512, 36, 512, 1024, 1024, 1024, 512] + - [19, 12552.0] + - - [1024, 512, 49, 512, 1024, 1024, 1024, 512] + - [35, 12595.0] + - - [1024, 512, 64, 512, 1024, 1024, 1024, 512] + - [4, 12641.0] + - - [3072, 768, 1, 2048, 3072, 3072, 3072, 768] + - [15, 11492.0] + - - [768, 3072, 1, 2048, 768, 768, 768, 3072] + - [15, 11512.0] + - - [3072, 768, 1, 4608, 3072, 3072, 3072, 768] + - [34, 11544.0] + - - [768, 3072, 1, 4608, 768, 768, 768, 3072] + - [15, 11582.0] + - - [4096, 1024, 1, 4608, 4096, 4096, 4096, 1024] + - [19, 12369.0] + - - [1024, 4096, 1, 4608, 1024, 1024, 1024, 4096] + - [4, 12360.0] + - - [4880, 256, 49, 256, 4880, 4880, 4880, 256] + - [19, 12233.0] + - - [3128, 256, 64, 256, 3128, 3128, 3128, 256] + - [9, 12168.0] + - - [4680, 256, 49, 256, 4680, 4680, 4680, 256] + - [19, 12335.0] + - - [5280, 256, 36, 256, 5280, 5280, 5280, 256] + - [4, 12271.0] + - - [2640, 256, 64, 256, 2640, 2640, 2640, 256] + - [35, 12231.0] + - - [5304, 256, 49, 256, 5304, 5304, 5304, 256] + - [35, 12346.0] + - - [4524, 256, 49, 256, 4524, 4524, 4524, 256] + - [35, 12253.0] + - - [2760, 256, 64, 256, 2760, 2760, 2760, 256] + - [4, 12215.0] + - - [6440, 256, 36, 256, 6440, 6440, 6440, 256] + - [4, 12346.0] + - - [5704, 256, 36, 256, 5704, 5704, 5704, 256] + - [35, 12339.0] + - - [2666, 256, 64, 256, 2666, 2666, 2666, 256] + - [4, 12273.0] + - - [2128, 256, 64, 256, 2128, 2128, 2128, 256] + - [4, 12176.0] + - - [1160, 256, 49, 256, 1160, 1160, 1160, 256] + - [36, 11138.0] + - - [4056, 256, 49, 256, 4056, 4056, 4056, 256] + - [4, 12367.0] + - - [6144, 256, 36, 256, 6144, 6144, 6144, 256] + - [4, 12587.0] + - - [950, 2048, 2, 512, 950, 950, 950, 2048] + - [36, 11067.0] + - - [6336, 256, 36, 256, 6336, 6336, 6336, 256] + - [9, 12374.0] + - - [13600, 512, 2, 128, 13600, 13600, 13600, 512] + - [13, 11884.0] + - - [15200, 512, 2, 128, 15200, 15200, 15200, 512] + - [0, 11842.0] + - - [15200, 128, 2, 512, 15200, 15200, 15200, 128] + - [20, 11862.0] + - - [13600, 128, 2, 512, 13600, 13600, 13600, 128] + - [13, 11586.0] + - - [5632, 256, 36, 256, 5632, 5632, 5632, 256] + - [4, 12562.0] + - - [12288, 128, 2, 512, 12288, 12288, 12288, 128] + - [5, 11579.0] + - - [12880, 128, 2, 512, 12880, 12880, 12880, 128] + - [15, 11264.0] + - - [3220, 1024, 2, 256, 3220, 3220, 3220, 1024] + - [1, 11552.0] + - - [11408, 128, 2, 512, 11408, 11408, 11408, 128] + - [20, 11862.0] + - - [782, 128, 64, 128, 782, 782, 782, 128] + - [15, 9954.0] + - - [13824, 512, 2, 128, 13824, 13824, 13824, 512] + - [23, 11911.0] + - - [13824, 128, 2, 512, 13824, 13824, 13824, 128] + - [5, 11913.0] + - - [10560, 128, 2, 512, 10560, 10560, 10560, 128] + - [15, 11277.0] + - - [10752, 128, 2, 512, 10752, 10752, 10752, 128] + - [30, 11591.0] + - - [13600, 512, 2, 256, 13600, 13600, 13600, 512] + - [5, 12105.0] + - - [15200, 512, 2, 256, 15200, 15200, 15200, 512] + - [24, 12159.0] + - - [850, 2048, 2, 512, 850, 850, 850, 2048] + - [0, 11009.0] + - - [768, 2048, 2, 512, 768, 768, 768, 2048] + - [20, 11654.0] + - - [12880, 512, 2, 128, 12880, 12880, 12880, 512] + - [8, 11778.0] + - - [11616, 128, 2, 512, 11616, 11616, 11616, 128] + - [15, 11193.0] + - - [14208, 512, 2, 128, 14208, 14208, 14208, 512] + - [30, 11910.0] + - - [11408, 512, 2, 128, 11408, 11408, 11408, 512] + - [28, 11768.0] + - - [805, 2048, 2, 512, 805, 805, 805, 2048] + - [0, 10394.0] + - - [6912, 256, 36, 256, 6912, 6912, 6912, 256] + - [4, 12575.0] + - - [713, 2048, 2, 512, 713, 713, 713, 2048] + - [5, 10723.0] + - - [13824, 512, 2, 256, 13824, 13824, 13824, 512] + - [31, 12179.0] + - - [11616, 512, 2, 128, 11616, 11616, 11616, 512] + - [0, 11732.0] + - - [12288, 512, 2, 128, 12288, 12288, 12288, 512] + - [23, 11897.0] + - - [14208, 128, 2, 512, 14208, 14208, 14208, 128] + - [15, 11538.0] + - - [11968, 128, 2, 512, 11968, 11968, 11968, 128] + - [30, 11499.0] + - - [864, 2048, 2, 512, 864, 864, 864, 2048] + - [0, 11126.0] + - - [10560, 512, 2, 128, 10560, 10560, 10560, 512] + - [8, 11657.0] + - - [672, 2048, 2, 512, 672, 672, 672, 2048] + - [5, 10159.0] + - - [660, 2048, 2, 512, 660, 660, 660, 2048] + - [5, 9955.0] + - - [9408, 128, 2, 512, 9408, 9408, 9408, 128] + - [30, 11282.0] + - - [10752, 512, 2, 128, 10752, 10752, 10752, 512] + - [36, 11839.0] + - - [726, 2048, 2, 512, 726, 726, 726, 2048] + - [5, 10914.0] + - - [11968, 512, 2, 128, 11968, 11968, 11968, 512] + - [9, 11701.0] + - - [1240, 256, 49, 256, 1240, 1240, 1240, 256] + - [5, 11841.0] + - - [4032, 256, 2, 1024, 4032, 4032, 4032, 256] + - [0, 11375.0] + - - [888, 2048, 2, 512, 888, 888, 888, 2048] + - [39, 11429.0] + - - [12880, 512, 2, 256, 12880, 12880, 12880, 512] + - [36, 12018.0] + - - [12288, 512, 2, 256, 12288, 12288, 12288, 512] + - [16, 12137.0] + - - [13440, 128, 2, 512, 13440, 13440, 13440, 128] + - [15, 11817.0] + - - [864, 2048, 2, 256, 864, 864, 864, 2048] + - [30, 10984.0] + - - [12672, 128, 2, 512, 12672, 12672, 12672, 128] + - [28, 11956.0] + - - [11264, 128, 2, 512, 11264, 11264, 11264, 128] + - [20, 11738.0] + - - [11776, 128, 2, 512, 11776, 11776, 11776, 128] + - [30, 11490.0] + - - [16128, 128, 2, 512, 16128, 16128, 16128, 128] + - [28, 11798.0] + - - [4032, 1024, 2, 256, 4032, 4032, 4032, 1024] + - [1, 11764.0] + - - [14000, 128, 2, 512, 14000, 14000, 14000, 128] + - [36, 11879.0] + - - [13440, 512, 2, 128, 13440, 13440, 13440, 512] + - [20, 12010.0] + - - [805, 2048, 2, 256, 805, 805, 805, 2048] + - [0, 10249.0] + - - [768, 2048, 2, 256, 768, 768, 768, 2048] + - [36, 11323.0] + - - [3264, 1024, 2, 256, 3264, 3264, 3264, 1024] + - [9, 11708.0] + - - [1251, 256, 49, 256, 1251, 1251, 1251, 256] + - [4, 11809.0] + - - [4200, 256, 2, 1024, 4200, 4200, 4200, 256] + - [5, 11363.0] + - - [2352, 1024, 2, 256, 2352, 2352, 2352, 1024] + - [15, 11235.0] + - - [2400, 1024, 2, 256, 2400, 2400, 2400, 1024] + - [15, 11452.0] + - - [15200, 256, 2, 12, 15200, 15200, 15200, 256] + - [36, 3977.0] + - - [12880, 256, 2, 12, 12880, 12880, 12880, 256] + - [32, 3864.0] + - - [2520, 1024, 2, 256, 2520, 2520, 2520, 1024] + - [36, 11598.0] + - - [13600, 256, 2, 12, 13600, 13600, 13600, 256] + - [32, 3923.0] + - - [15200, 256, 2, 3, 15200, 15200, 15200, 256] + - [15, 1160.0] + - - [12880, 256, 2, 3, 12880, 12880, 12880, 256] + - [15, 1114.0] + - - [4200, 1024, 2, 256, 4200, 4200, 4200, 1024] + - [8, 11837.0] + - - [12288, 256, 2, 12, 12288, 12288, 12288, 256] + - [32, 3013.0] + - - [13824, 256, 2, 12, 13824, 13824, 13824, 256] + - [18, 3560.0] + - - [13600, 256, 2, 3, 13600, 13600, 13600, 256] + - [20, 1148.0] + - - [1900, 1024, 1, 2048, 1900, 1900, 1900, 1024] + - [5, 11972.0] + - - [7600, 512, 1, 256, 7600, 7600, 7600, 512] + - [13, 11662.0] + - - [1610, 1024, 1, 2048, 1610, 1610, 1610, 1024] + - [15, 10785.0] + - - [6144, 512, 1, 256, 6144, 6144, 6144, 512] + - [20, 11323.0] + - - [1900, 1024, 1, 512, 1900, 1900, 1900, 1024] + - [5, 11562.0] + - - [12544, 1024, 1, 1024, 12544, 12544, 12544, 1024] + - [19, 12440.0] + - - [3220, 256, 2, 12, 3220, 3220, 3220, 256] + - [5, 2384.0] + - - [3220, 256, 2, 3, 3220, 3220, 3220, 256] + - [7, 695.0] + - - [3800, 256, 2, 3, 3800, 3800, 3800, 256] + - [15, 741.0] + - - [13824, 256, 2, 3, 13824, 13824, 13824, 256] + - [15, 887.0] + - - [12288, 256, 2, 3, 12288, 12288, 12288, 256] + - [9, 805.0] + - - [2688, 256, 2, 1024, 2688, 2688, 2688, 256] + - [15, 10763.0] + - - [3072, 256, 2, 12, 3072, 3072, 3072, 256] + - [15, 2200.0] + - - [3800, 256, 2, 12, 3800, 3800, 3800, 256] + - [36, 2554.0] + - - [3072, 256, 2, 3, 3072, 3072, 3072, 256] + - [22, 708.0] + - - [2520, 256, 2, 1024, 2520, 2520, 2520, 256] + - [36, 11592.0] + - - [16128, 512, 2, 128, 16128, 16128, 16128, 512] + - [20, 11963.0] + - - [2400, 256, 2, 1024, 2400, 2400, 2400, 256] + - [36, 10963.0] + - - [2352, 256, 2, 1024, 2352, 2352, 2352, 256] + - [36, 10751.0] + - - [3036, 1024, 2, 256, 3036, 3036, 3036, 1024] + - [0, 11534.0] + - - [1024, 256, 1, 33536, 1024, 1024, 1024, 256] + - [45, 10570.0] + - - [1024, 1024, 1, 9520, 1024, 1024, 1024, 1024] + - [46, 11922.0] + - - [1024, 1024, 1, 10200, 1024, 1024, 1024, 1024] + - [50, 11928.0] + - - [1024, 256, 1, 21248, 1024, 1024, 1024, 256] + - [49, 10458.0] + - - [1024, 256, 1, 21504, 1024, 1024, 1024, 256] + - [49, 10465.0] + - - [1024, 256, 1, 22016, 1024, 1024, 1024, 256] + - [54, 10471.0] + - - [1024, 256, 1, 28672, 1024, 1024, 1024, 256] + - [49, 10534.0] + - - [256, 2560, 1, 8976, 256, 256, 256, 2560] + - [51, 11693.0] + - - [256, 2816, 1, 8976, 256, 256, 256, 2816] + - [51, 11564.0] + - - [256, 3328, 1, 8976, 256, 256, 256, 3328] + - [48, 11577.0] + - - [256, 3584, 1, 8976, 256, 256, 256, 3584] + - [53, 11461.0] + - - [256, 3840, 1, 8976, 256, 256, 256, 3840] + - [5, 11995.0] + - - [256, 4096, 1, 8976, 256, 256, 256, 4096] + - [46, 11818.0] + - - [256, 4352, 1, 8976, 256, 256, 256, 4352] + - [46, 11758.0] + - - [1024, 1024, 1, 32768, 1024, 1024, 1024, 1024] + - [46, 12248.0] + - - [1024, 512, 1, 32768, 1024, 1024, 1024, 512] + - [54, 11538.0] + - - [479, 1024, 1, 32768, 479, 479, 479, 1024] + - [53, 10890.0] + - - [512, 256, 1, 55296, 512, 512, 512, 256] + - [52, 9340.0] + - - [1024, 1024, 1, 8192, 1024, 1024, 1024, 1024] + - [44, 11805.0] + - - [1024, 1024, 1, 9600, 1024, 1024, 1024, 1024] + - [46, 11898.0] + - - [1024, 1024, 1, 10064, 1024, 1024, 1024, 1024] + - [46, 11920.0] + - - [1024, 1024, 1, 10080, 1024, 1024, 1024, 1024] + - [46, 11924.0] + - - [1024, 1024, 1, 9216, 1024, 1024, 1024, 1024] + - [44, 11870.0] + - - [480, 1024, 1, 32768, 480, 480, 480, 1024] + - [53, 10909.0] + - - [1024, 1024, 1, 16384, 1024, 1024, 1024, 1024] + - [46, 12097.0] + - - [1024, 1024, 1, 10240, 1024, 1024, 1024, 1024] + - [47, 11904.0] + - - [1024, 1024, 1, 10496, 1024, 1024, 1024, 1024] + - [46, 11934.0] + - - [1024, 1024, 1, 10224, 1024, 1024, 1024, 1024] + - [46, 11930.0] + - - [1024, 1024, 1, 10192, 1024, 1024, 1024, 1024] + - [46, 11914.0] + - - [1024, 1024, 1, 10208, 1024, 1024, 1024, 1024] + - [46, 11909.0] + - - [1024, 1024, 1, 10184, 1024, 1024, 1024, 1024] + - [46, 11918.0] + - - [1024, 1024, 1, 10120, 1024, 1024, 1024, 1024] + - [46, 11924.0] + - - [1024, 1024, 1, 10152, 1024, 1024, 1024, 1024] + - [50, 11953.0] + - - [1024, 1024, 1, 12288, 1024, 1024, 1024, 1024] + - [46, 12018.0] + - - [1024, 1024, 1, 512, 1024, 1024, 1024, 1024] + - [81, 10207.0] + - - [1024, 1024, 1, 200, 1024, 1024, 1024, 1024] + - [59, 9354.0] + - - [1024, 1024, 1, 4096, 1024, 1024, 1024, 1024] + - [103, 10722.0] + - - [1024, 1024, 1, 2048, 1024, 1024, 1024, 1024] + - [81, 10607.0] + - - [768, 768, 1, 16, 768, 768, 768, 768] + - [99, 2537.0] + - - [768, 768, 1, 320, 768, 768, 768, 768] + - [59, 8634.0] + - - [768, 768, 1, 4096, 768, 768, 768, 768] + - [82, 10346.0] + - - [768, 768, 1, 32, 768, 768, 768, 768] + - [99, 3836.0] + - - [768, 768, 1, 640, 768, 768, 768, 768] + - [82, 9475.0] + - - [768, 768, 1, 64, 768, 768, 768, 768] + - [78, 5347.0] + - - [768, 768, 1, 1280, 768, 768, 768, 768] + - [104, 9979.0] + - - [1024, 1024, 1, 3072, 1024, 1024, 1024, 1024] + - [72, 10642.0] + - - [1024, 1024, 1, 120, 1024, 1024, 1024, 1024] + - [59, 8571.0] + - - [1024, 1024, 1, 1, 1024, 1024, 1024, 1024] + - [99, 213.0] + - - [1024, 1024, 1, 20, 1024, 1024, 1024, 1024] + - [99, 3139.0] + - - [1024, 1024, 1, 4, 1024, 1024, 1024, 1024] + - [78, 774.0] + - - [1024, 1024, 1, 6, 1024, 1024, 1024, 1024] + - [57, 1210.0] + - - [1024, 1024, 1, 80, 1024, 1024, 1024, 1024] + - [70, 7269.0] + - - [128, 64, 512, 128, 128, 128, 128, 64] + - [72, 10974.0] + - - [512, 64, 64, 512, 512, 512, 512, 64] + - [103, 11229.0] + - - [64, 64, 768, 64, 64, 64, 64, 64] + - [57, 7643.0] + - - [1856, 448, 1, 3328, 1856, 1856, 1856, 448] + - [108, 9963.0] + - - [128, 6784, 1, 3328, 128, 128, 128, 6784] + - [65, 10505.0] + - - [2368, 448, 1, 128, 2368, 2368, 2368, 448] + - [57, 8761.0] + - - [256, 4288, 1, 3328, 256, 256, 256, 4288] + - [59, 11118.0] + - - [704, 1856, 1, 3328, 704, 704, 704, 1856] + - [101, 11189.0] + - - [448, 1024, 1, 1280, 448, 448, 448, 1024] + - [101, 9465.0] + - - [256, 1408, 1, 3328, 256, 256, 256, 1408] + - [103, 7933.0] + - - [704, 1856, 1, 1280, 704, 704, 704, 1856] + - [101, 11163.0] + - - [128, 5056, 1, 128, 128, 128, 128, 5056] + - [101, 7383.0] + - - [2368, 128, 1, 256, 2368, 2368, 2368, 128] + - [57, 6340.0] + - - [64, 5056, 1, 256, 64, 64, 64, 5056] + - [79, 6638.0] + - - [256, 2944, 1, 256, 256, 256, 256, 2944] + - [81, 9041.0] + - - [256, 1856, 1, 1280, 256, 256, 256, 1856] + - [81, 9977.0] + - - [128, 3584, 1, 1280, 128, 128, 128, 3584] + - [59, 9645.0] + - - [4288, 256, 1, 256, 4288, 4288, 4288, 256] + - [101, 9895.0] + - - [2944, 128, 1, 128, 2944, 2944, 2944, 128] + - [58, 5583.0] + - - [5888, 64, 1, 3328, 5888, 5888, 5888, 64] + - [86, 8254.0] + - - [2944, 256, 1, 3328, 2944, 2944, 2944, 256] + - [59, 10358.0] + - - [704, 1024, 1, 128, 704, 704, 704, 1024] + - [57, 7526.0] + - - [1408, 448, 1, 1280, 1408, 1408, 1408, 448] + - [103, 10325.0] + - - [1408, 704, 1, 3328, 1408, 1408, 1408, 704] + - [103, 10116.0] + - - [6784, 64, 1, 256, 6784, 6784, 6784, 64] + - [103, 7303.0] + - - [2944, 256, 1, 256, 2944, 2944, 2944, 256] + - [103, 9016.0] + - - [704, 1408, 1, 3328, 704, 704, 704, 1408] + - [103, 10069.0] + - - [2944, 256, 1, 128, 2944, 2944, 2944, 256] + - [59, 7805.0] + - - [448, 2944, 1, 128, 448, 448, 448, 2944] + - [57, 9185.0] + - - [2368, 128, 1, 3328, 2368, 2368, 2368, 128] + - [104, 9058.0] + - - [2944, 128, 1, 256, 2944, 2944, 2944, 128] + - [59, 6440.0] + - - [448, 1408, 1, 256, 448, 448, 448, 1408] + - [57, 8700.0] + - - [64, 5056, 1, 3328, 64, 64, 64, 5056] + - [101, 9372.0] + - - [1024, 448, 1, 128, 1024, 1024, 1024, 448] + - [59, 6155.0] + - - [256, 3584, 1, 3328, 256, 256, 256, 3584] + - [88, 11078.0] + - - [5056, 64, 1, 1280, 5056, 5056, 5056, 64] + - [57, 8961.0] + - - [1024, 704, 1, 256, 1024, 1024, 1024, 704] + - [81, 8705.0] + - - [128, 4288, 1, 128, 128, 128, 128, 4288] + - [103, 6457.0] + - - [3584, 256, 1, 128, 3584, 3584, 3584, 256] + - [103, 8389.0] + - - [4288, 128, 1, 1280, 4288, 4288, 4288, 128] + - [82, 8970.0] + - - [5888, 64, 1, 256, 5888, 5888, 5888, 64] + - [59, 6406.0] + - - [1856, 256, 1, 1280, 1856, 1856, 1856, 256] + - [101, 9750.0] + - - [64, 5888, 1, 3328, 64, 64, 64, 5888] + - [61, 8083.0] + - - [704, 1024, 1, 1280, 704, 704, 704, 1024] + - [103, 9622.0] + - - [448, 1856, 1, 128, 448, 448, 448, 1856] + - [57, 7746.0] + - - [1024, 704, 1, 1280, 1024, 1024, 1024, 704] + - [103, 9777.0] + - - [128, 5888, 1, 256, 128, 128, 128, 5888] + - [103, 9041.0] + - - [704, 704, 1, 3328, 704, 704, 704, 704] + - [60, 8585.0] + - - [704, 1408, 1, 1280, 704, 704, 704, 1408] + - [57, 9997.0] + - - [3584, 256, 1, 3328, 3584, 3584, 3584, 256] + - [65, 11052.0] + - - [704, 1856, 1, 128, 704, 704, 704, 1856] + - [79, 9735.0] + - - [128, 3584, 1, 3328, 128, 128, 128, 3584] + - [81, 10010.0] + - - [128, 2944, 1, 1280, 128, 128, 128, 2944] + - [59, 8007.0] + - - [3584, 128, 1, 256, 3584, 3584, 3584, 128] + - [59, 7686.0] + - - [448, 1408, 1, 3328, 448, 448, 448, 1408] + - [79, 10426.0] + - - [256, 3584, 1, 256, 256, 256, 256, 3584] + - [59, 9556.0] + - - [256, 2944, 1, 3328, 256, 256, 256, 2944] + - [81, 10393.0] + - - [448, 2368, 1, 128, 448, 448, 448, 2368] + - [57, 8841.0] + - - [1408, 704, 1, 256, 1408, 1408, 1408, 704] + - [59, 9207.0] + - - [448, 2944, 1, 3328, 448, 448, 448, 2944] + - [79, 10603.0] + - - [64, 5888, 1, 256, 64, 64, 64, 5888] + - [101, 6338.0] + - - [6784, 128, 1, 3328, 6784, 6784, 6784, 128] + - [76, 10506.0] + - - [704, 704, 1, 256, 704, 704, 704, 704] + - [57, 7160.0] + - - [448, 704, 1, 1280, 448, 448, 448, 704] + - [79, 8861.0] + - - [1024, 448, 1, 3328, 1024, 1024, 1024, 448] + - [86, 9973.0] + - - [1856, 704, 1, 1280, 1856, 1856, 1856, 704] + - [79, 11122.0] + - - [448, 1408, 1, 1280, 448, 448, 448, 1408] + - [57, 10136.0] + - - [1024, 1024, 1, 1280, 1024, 1024, 1024, 1024] + - [103, 10522.0] + - - [448, 1024, 1, 128, 448, 448, 448, 1024] + - [57, 6104.0] + - - [448, 2368, 1, 3328, 448, 448, 448, 2368] + - [101, 10486.0] + - - [5056, 64, 1, 128, 5056, 5056, 5056, 64] + - [71, 5436.0] + - - [704, 1024, 1, 256, 704, 704, 704, 1024] + - [101, 8544.0] + - - [128, 6784, 1, 1280, 128, 128, 128, 6784] + - [65, 10212.0] + - - [1856, 256, 1, 256, 1856, 1856, 1856, 256] + - [57, 7992.0] + - - [256, 4288, 1, 1280, 256, 256, 256, 4288] + - [81, 10972.0] + - - [256, 1856, 1, 128, 256, 256, 256, 1856] + - [59, 6375.0] + - - [448, 1408, 1, 128, 448, 448, 448, 1408] + - [101, 7300.0] + - - [6784, 128, 1, 256, 6784, 6784, 6784, 128] + - [103, 9073.0] + - - [704, 448, 1, 256, 704, 704, 704, 448] + - [57, 6640.0] + - - [704, 1408, 1, 128, 704, 704, 704, 1408] + - [101, 8414.0] + - - [2944, 448, 1, 128, 2944, 2944, 2944, 448] + - [57, 9155.0] + - - [128, 2944, 1, 128, 128, 128, 128, 2944] + - [58, 5622.0] + - - [1024, 704, 1, 3328, 1024, 1024, 1024, 704] + - [81, 9970.0] + - - [128, 4288, 1, 256, 128, 128, 128, 4288] + - [59, 7737.0] + - - [704, 448, 1, 3328, 704, 704, 704, 448] + - [101, 9266.0] + - - [1024, 1024, 1, 3328, 1024, 1024, 1024, 1024] + - [103, 10665.0] + - - [448, 2368, 1, 1280, 448, 448, 448, 2368] + - [101, 10333.0] + - - [64, 6784, 1, 3328, 64, 64, 64, 6784] + - [83, 9170.0] + - - [2944, 256, 1, 1280, 2944, 2944, 2944, 256] + - [103, 10174.0] + - - [256, 2368, 1, 128, 256, 256, 256, 2368] + - [70, 7042.0] + - - [1856, 704, 1, 256, 1856, 1856, 1856, 704] + - [57, 10473.0] + - - [1408, 448, 1, 3328, 1408, 1408, 1408, 448] + - [106, 10697.0] + - - [2368, 256, 1, 256, 2368, 2368, 2368, 256] + - [101, 8398.0] + - - [1856, 448, 1, 1280, 1856, 1856, 1856, 448] + - [82, 9571.0] + - - [128, 5888, 1, 128, 128, 128, 128, 5888] + - [103, 7705.0] + - - [1024, 1024, 1, 256, 1024, 1024, 1024, 1024] + - [59, 9740.0] + - - [704, 1856, 1, 256, 704, 704, 704, 1856] + - [57, 10519.0] + - - [128, 4288, 1, 3328, 128, 128, 128, 4288] + - [82, 9463.0] + - - [256, 2368, 1, 1280, 256, 256, 256, 2368] + - [81, 9948.0] + - - [2944, 448, 1, 256, 2944, 2944, 2944, 448] + - [101, 9878.0] + - - [1856, 448, 1, 128, 1856, 1856, 1856, 448] + - [57, 7701.0] + - - [2368, 128, 1, 1280, 2368, 2368, 2368, 128] + - [57, 8416.0] + - - [64, 6784, 1, 256, 64, 64, 64, 6784] + - [57, 7180.0] + - - [64, 5056, 1, 1280, 64, 64, 64, 5056] + - [79, 8900.0] + - - [2368, 256, 1, 1280, 2368, 2368, 2368, 256] + - [60, 10015.0] + - - [2368, 448, 1, 1280, 2368, 2368, 2368, 448] + - [81, 10437.0] + - - [128, 3584, 1, 256, 128, 128, 128, 3584] + - [103, 7809.0] + - - [704, 448, 1, 1280, 704, 704, 704, 448] + - [57, 8826.0] + - - [128, 5056, 1, 256, 128, 128, 128, 5056] + - [103, 9024.0] + - - [4288, 256, 1, 1280, 4288, 4288, 4288, 256] + - [59, 10784.0] + - - [4288, 128, 1, 3328, 4288, 4288, 4288, 128] + - [82, 9486.0] + - - [1408, 256, 1, 128, 1408, 1408, 1408, 256] + - [58, 5402.0] + - - [256, 1408, 1, 1280, 256, 256, 256, 1408] + - [81, 7631.0] + - - [128, 2368, 1, 256, 128, 128, 128, 2368] + - [94, 6434.0] + - - [6784, 64, 1, 3328, 6784, 6784, 6784, 64] + - [86, 9435.0] + - - [128, 2944, 1, 3328, 128, 128, 128, 2944] + - [59, 8282.0] + - - [2944, 448, 1, 3328, 2944, 2944, 2944, 448] + - [79, 10603.0] + - - [256, 4288, 1, 256, 256, 256, 256, 4288] + - [103, 10080.0] + - - [5888, 128, 1, 256, 5888, 5888, 5888, 128] + - [59, 8908.0] + - - [2368, 448, 1, 3328, 2368, 2368, 2368, 448] + - [103, 10736.0] + - - [5056, 64, 1, 256, 5056, 5056, 5056, 64] + - [57, 6746.0] + - - [1024, 704, 1, 128, 1024, 1024, 1024, 704] + - [81, 7454.0] + - - [128, 5056, 1, 3328, 128, 128, 128, 5056] + - [60, 11060.0] + - - [4288, 128, 1, 256, 4288, 4288, 4288, 128] + - [101, 7562.0] + - - [1408, 448, 1, 128, 1408, 1408, 1408, 448] + - [103, 7380.0] + - - [128, 5888, 1, 1280, 128, 128, 128, 5888] + - [59, 10204.0] + - - [704, 448, 1, 128, 704, 704, 704, 448] + - [102, 5368.0] + - - [3584, 256, 1, 256, 3584, 3584, 3584, 256] + - [104, 9479.0] + - - [128, 2944, 1, 256, 128, 128, 128, 2944] + - [103, 6518.0] + - - [128, 6784, 1, 128, 128, 128, 128, 6784] + - [101, 7940.0] + - - [448, 1856, 1, 256, 448, 448, 448, 1856] + - [79, 8604.0] + - - [3584, 128, 1, 3328, 3584, 3584, 3584, 128] + - [63, 9963.0] + - - [1024, 448, 1, 1280, 1024, 1024, 1024, 448] + - [103, 9576.0] + - - [5888, 128, 1, 3328, 5888, 5888, 5888, 128] + - [59, 10339.0] + - - [1408, 704, 1, 1280, 1408, 1408, 1408, 704] + - [101, 10043.0] + - - [448, 2944, 1, 256, 448, 448, 448, 2944] + - [101, 9884.0] + - - [448, 2368, 1, 256, 448, 448, 448, 2368] + - [101, 9583.0] + - - [128, 2368, 1, 3328, 128, 128, 128, 2368] + - [59, 9070.0] + - - [5056, 128, 1, 1280, 5056, 5056, 5056, 128] + - [60, 10465.0] + - - [5056, 64, 1, 3328, 5056, 5056, 5056, 64] + - [101, 9389.0] + - - [64, 5888, 1, 128, 64, 64, 64, 5888] + - [102, 5583.0] + - - [5056, 128, 1, 3328, 5056, 5056, 5056, 128] + - [60, 11061.0] + - - [448, 704, 1, 256, 448, 448, 448, 704] + - [79, 6596.0] + - - [2944, 128, 1, 3328, 2944, 2944, 2944, 128] + - [63, 8267.0] + - - [128, 5056, 1, 1280, 128, 128, 128, 5056] + - [81, 10580.0] + - - [704, 704, 1, 128, 704, 704, 704, 704] + - [100, 6147.0] + - - [64, 6784, 1, 1280, 64, 64, 64, 6784] + - [57, 8847.0] + - - [2368, 128, 1, 128, 2368, 2368, 2368, 128] + - [100, 5146.0] + - - [5056, 128, 1, 128, 5056, 5056, 5056, 128] + - [57, 7449.0] + - - [1024, 1024, 1, 1024, 1024, 1024, 1024, 1024] + - [81, 10455.0] + - - [448, 1024, 1, 3328, 448, 448, 448, 1024] + - [101, 9772.0] + - - [256, 2368, 1, 3328, 256, 256, 256, 2368] + - [60, 10411.0] + - - [256, 3584, 1, 128, 256, 256, 256, 3584] + - [81, 8486.0] + - - [4288, 256, 1, 128, 4288, 4288, 4288, 256] + - [101, 8950.0] + - - [2368, 256, 1, 128, 2368, 2368, 2368, 256] + - [57, 7132.0] + - - [256, 1856, 1, 256, 256, 256, 256, 1856] + - [57, 8034.0] + - - [256, 2944, 1, 128, 256, 256, 256, 2944] + - [103, 7843.0] + - - [1408, 256, 1, 3328, 1408, 1408, 1408, 256] + - [81, 7954.0] + - - [2368, 448, 1, 256, 2368, 2368, 2368, 448] + - [79, 9610.0] + - - [4288, 256, 1, 3328, 4288, 4288, 4288, 256] + - [59, 11056.0] + - - [1856, 704, 1, 128, 1856, 1856, 1856, 704] + - [101, 9701.0] + - - [4288, 128, 1, 128, 4288, 4288, 4288, 128] + - [102, 6422.0] + - - [1408, 448, 1, 256, 1408, 1408, 1408, 448] + - [103, 8814.0] + - - [6784, 64, 1, 1280, 6784, 6784, 6784, 64] + - [103, 9066.0] + - - [3584, 128, 1, 128, 3584, 3584, 3584, 128] + - [57, 6168.0] + - - [256, 2368, 1, 256, 256, 256, 256, 2368] + - [81, 8565.0] + - - [2944, 448, 1, 1280, 2944, 2944, 2944, 448] + - [101, 10537.0] + - - [448, 1856, 1, 1280, 448, 448, 448, 1856] + - [82, 9576.0] + - - [1856, 256, 1, 128, 1856, 1856, 1856, 256] + - [80, 6270.0] + - - [5056, 128, 1, 256, 5056, 5056, 5056, 128] + - [79, 8822.0] + - - [448, 1024, 1, 256, 448, 448, 448, 1024] + - [79, 7696.0] + - - [64, 6784, 1, 128, 64, 64, 64, 6784] + - [70, 5753.0] + - - [5888, 64, 1, 1280, 5888, 5888, 5888, 64] + - [81, 7915.0] + - - [128, 3584, 1, 128, 128, 128, 128, 3584] + - [103, 6130.0] + - - [1408, 256, 1, 256, 1408, 1408, 1408, 256] + - [81, 6269.0] + - - [128, 5888, 1, 3328, 128, 128, 128, 5888] + - [72, 10335.0] + - - [1408, 256, 1, 1280, 1408, 1408, 1408, 256] + - [103, 7654.0] + - - [64, 5056, 1, 128, 64, 64, 64, 5056] + - [80, 5436.0] + - - [5888, 64, 1, 128, 5888, 5888, 5888, 64] + - [58, 5494.0] + - - [448, 704, 1, 128, 448, 448, 448, 704] + - [80, 5412.0] + - - [1408, 704, 1, 128, 1408, 1408, 1408, 704] + - [81, 8504.0] + - - [2368, 256, 1, 3328, 2368, 2368, 2368, 256] + - [60, 10472.0] + - - [5888, 128, 1, 1280, 5888, 5888, 5888, 128] + - [103, 10155.0] + - - [256, 3584, 1, 1280, 256, 256, 256, 3584] + - [88, 10770.0] + - - [256, 1408, 1, 128, 256, 256, 256, 1408] + - [56, 5328.0] + - - [256, 4288, 1, 128, 256, 256, 256, 4288] + - [59, 9042.0] + - - [5888, 128, 1, 128, 5888, 5888, 5888, 128] + - [79, 7560.0] + - - [1856, 256, 1, 3328, 1856, 1856, 1856, 256] + - [59, 10185.0] + - - [64, 5888, 1, 1280, 64, 64, 64, 5888] + - [79, 7767.0] + - - [6784, 64, 1, 128, 6784, 6784, 6784, 64] + - [59, 5789.0] + - - [704, 704, 1, 1280, 704, 704, 704, 704] + - [57, 8319.0] + - - [128, 2368, 1, 1280, 128, 128, 128, 2368] + - [81, 8637.0] + - - [3584, 256, 1, 1280, 3584, 3584, 3584, 256] + - [108, 10788.0] + - - [3584, 128, 1, 1280, 3584, 3584, 3584, 128] + - [94, 9502.0] + - - [448, 1856, 1, 3328, 448, 448, 448, 1856] + - [88, 9953.0] + - - [1024, 448, 1, 256, 1024, 1024, 1024, 448] + - [59, 7788.0] + - - [2944, 128, 1, 1280, 2944, 2944, 2944, 128] + - [81, 7939.0] + - - [128, 2368, 1, 128, 128, 128, 128, 2368] + - [58, 5159.0] + - - [256, 2944, 1, 1280, 256, 256, 256, 2944] + - [103, 10174.0] + - - [704, 1024, 1, 3328, 704, 704, 704, 1024] + - [81, 9901.0] + - - [128, 6784, 1, 256, 128, 128, 128, 6784] + - [103, 9073.0] + - - [256, 1856, 1, 3328, 256, 256, 256, 1856] + - [106, 10304.0] + - - [6784, 128, 1, 128, 6784, 6784, 6784, 128] + - [103, 7985.0] + - - [704, 1408, 1, 256, 704, 704, 704, 1408] + - [79, 9288.0] + - - [256, 1408, 1, 256, 256, 256, 256, 1408] + - [103, 6277.0] + - - [448, 2944, 1, 1280, 448, 448, 448, 2944] + - [79, 10571.0] + - - [6784, 128, 1, 1280, 6784, 6784, 6784, 128] + - [65, 10218.0] + - - [1856, 448, 1, 256, 1856, 1856, 1856, 448] + - [59, 8549.0] + - - [128, 4288, 1, 1280, 128, 128, 128, 4288] + - [94, 9030.0] + - - [448, 704, 1, 3328, 448, 448, 448, 704] + - [101, 9225.0] + - - [1856, 704, 1, 3328, 1856, 1856, 1856, 704] + - [79, 11175.0] + - - [3136, 64, 128, 64, 3136, 3136, 3136, 64] + - [55, 5944.0] + - - [3136, 64, 128, 256, 3136, 3136, 3136, 64] + - [59, 8825.0] + - - [3136, 64, 256, 256, 3136, 3136, 3136, 64] + - [57, 8742.0] + - - [3136, 64, 256, 64, 3136, 3136, 3136, 64] + - [86, 5461.0] + - - [64, 1536, 64, 384, 64, 64, 64, 1536] + - [79, 9684.0] + - - [64, 1536, 64, 256, 64, 64, 64, 1536] + - [70, 9718.0] + - - [64, 92, 688, 92, 64, 64, 64, 92] + - [79, 7195.0] + - - [1024, 1024, 1, 3975, 1024, 1024, 1024, 1024] + - [81, 10693.0] + - - [64, 123, 528, 123, 64, 64, 64, 123] + - [79, 9116.0] + - - [64, 102, 624, 100, 64, 64, 64, 102] + - [79, 7738.0] + - - [64, 112, 576, 111, 64, 64, 64, 112] + - [57, 8487.0] + - - [64, 102, 624, 102, 64, 64, 64, 102] + - [101, 7769.0] + - - [64, 133, 480, 135, 64, 64, 64, 133] + - [57, 7535.0] + - - [1024, 1024, 1, 4026, 1024, 1024, 1024, 1024] + - [103, 10710.0] + - - [64, 160, 400, 159, 64, 64, 64, 160] + - [79, 8963.0] + - - [1024, 1024, 1, 3780, 1024, 1024, 1024, 1024] + - [103, 10646.0] + - - [64, 228, 272, 232, 64, 64, 64, 228] + - [101, 9825.0] + - - [1024, 1024, 1, 3822, 1024, 1024, 1024, 1024] + - [81, 10674.0] + - - [64, 77, 816, 77, 64, 64, 64, 77] + - [79, 6003.0] + - - [64, 159, 400, 159, 64, 64, 64, 159] + - [79, 8873.0] + - - [64, 135, 480, 134, 64, 64, 64, 135] + - [57, 7602.0] + - - [64, 99, 624, 99, 64, 64, 64, 99] + - [79, 7530.0] + - - [1024, 1024, 1, 3942, 1024, 1024, 1024, 1024] + - [81, 10655.0] + - - [1024, 1024, 1, 3861, 1024, 1024, 1024, 1024] + - [81, 10645.0] + - - [1024, 1024, 1, 4000, 1024, 1024, 1024, 1024] + - [81, 10703.0] + - - [1024, 1024, 1, 3870, 1024, 1024, 1024, 1024] + - [81, 10676.0] + - - [64, 65, 992, 65, 64, 64, 64, 65] + - [57, 5182.0] + - - [64, 133, 480, 133, 64, 64, 64, 133] + - [57, 7535.0] + - - [64, 232, 272, 232, 64, 64, 64, 232] + - [79, 9949.0] + - - [64, 148, 432, 148, 64, 64, 64, 148] + - [101, 8372.0] + - - [1024, 1024, 1, 4032, 1024, 1024, 1024, 1024] + - [103, 10685.0] + - - [1024, 1024, 1, 4012, 1024, 1024, 1024, 1024] + - [81, 10664.0] + - - [1024, 1024, 1, 3681, 1024, 1024, 1024, 1024] + - [114, 10658.0] + - - [1024, 1024, 1, 3927, 1024, 1024, 1024, 1024] + - [94, 10655.0] + - - [1024, 1024, 1, 3894, 1024, 1024, 1024, 1024] + - [81, 10655.0] + - - [64, 132, 480, 135, 64, 64, 64, 132] + - [79, 7509.0] + - - [64, 135, 480, 135, 64, 64, 64, 135] + - [79, 7653.0] + - - [1024, 1024, 1, 3876, 1024, 1024, 1024, 1024] + - [103, 10670.0] + - - [64, 84, 752, 85, 64, 64, 64, 84] + - [57, 6624.0] + - - [1024, 1024, 1, 4050, 1024, 1024, 1024, 1024] + - [81, 10681.0] + - - [64, 132, 480, 132, 64, 64, 64, 132] + - [57, 7484.0] + - - [64, 99, 624, 102, 64, 64, 64, 99] + - [57, 7638.0] + - - [64, 143, 432, 148, 64, 64, 64, 143] + - [101, 8018.0] + - - [1024, 1024, 1, 3584, 1024, 1024, 1024, 1024] + - [81, 10678.0] + - - [64, 162, 400, 162, 64, 64, 64, 162] + - [101, 9094.0] + - - [64, 148, 432, 147, 64, 64, 64, 148] + - [79, 8396.0] + - - [1024, 1024, 1, 3960, 1024, 1024, 1024, 1024] + - [81, 10670.0] + - - [64, 123, 528, 122, 64, 64, 64, 123] + - [57, 9062.0] + - - [64, 102, 624, 101, 64, 64, 64, 102] + - [101, 7725.0] + - - [1024, 1024, 1, 3978, 1024, 1024, 1024, 1024] + - [81, 10689.0] + - - [64, 160, 400, 160, 64, 64, 64, 160] + - [79, 9017.0] + - - [1024, 1024, 1, 3995, 1024, 1024, 1024, 1024] + - [103, 10687.0] + - - [64, 132, 480, 134, 64, 64, 64, 132] + - [57, 7581.0] + - - [64, 111, 576, 111, 64, 64, 64, 111] + - [79, 8252.0] + - - [64, 100, 624, 100, 64, 64, 64, 100] + - [79, 7633.0] + - - [1024, 1024, 1, 3977, 1024, 1024, 1024, 1024] + - [81, 10674.0] + - - [64, 112, 576, 112, 64, 64, 64, 112] + - [57, 8472.0] + - - [64, 159, 400, 162, 64, 64, 64, 159] + - [57, 8865.0] + - - [64, 122, 528, 122, 64, 64, 64, 122] + - [101, 9108.0] + - - [64, 228, 272, 228, 64, 64, 64, 228] + - [57, 9689.0] + - - [1024, 1024, 1, 3925, 1024, 1024, 1024, 1024] + - [81, 10665.0] + - - [64, 93, 688, 93, 64, 64, 64, 93] + - [79, 7224.0] + - - [1024, 1024, 1, 3956, 1024, 1024, 1024, 1024] + - [81, 10685.0] + - - [1024, 1024, 1, 3976, 1024, 1024, 1024, 1024] + - [81, 10676.0] + - - [64, 111, 576, 112, 64, 64, 64, 111] + - [79, 8500.0] + - - [64, 100, 624, 102, 64, 64, 64, 100] + - [79, 7706.0] + - - [1024, 1024, 1, 3955, 1024, 1024, 1024, 1024] + - [103, 10679.0] + - - [1024, 1024, 1, 4030, 1024, 1024, 1024, 1024] + - [103, 10661.0] + - - [1024, 1024, 1, 3906, 1024, 1024, 1024, 1024] + - [103, 10659.0] + - - [64, 101, 624, 102, 64, 64, 64, 101] + - [79, 7713.0] + - - [1024, 1024, 1, 3796, 1024, 1024, 1024, 1024] + - [103, 10674.0] + - - [1024, 1024, 1, 3859, 1024, 1024, 1024, 1024] + - [103, 10694.0] + - - [64, 71, 896, 71, 64, 64, 64, 71] + - [57, 5609.0] + - - [1024, 1024, 1, 3860, 1024, 1024, 1024, 1024] + - [114, 10649.0] + - - [1024, 1024, 1, 4005, 1024, 1024, 1024, 1024] + - [81, 10658.0] + - - [64, 84, 752, 84, 64, 64, 64, 84] + - [57, 6597.0] + - - [1024, 1024, 1, 3990, 1024, 1024, 1024, 1024] + - [103, 10662.0] + - - [64, 134, 480, 134, 64, 64, 64, 134] + - [57, 7625.0] + - - [64, 78, 816, 78, 64, 64, 64, 78] + - [101, 6153.0] + - - [1024, 1024, 1, 3999, 1024, 1024, 1024, 1024] + - [103, 10668.0] + - - [1024, 1024, 1, 4020, 1024, 1024, 1024, 1024] + - [103, 10667.0] + - - [1024, 1024, 1, 3939, 1024, 1024, 1024, 1024] + - [103, 10684.0] + - - [64, 77, 816, 78, 64, 64, 64, 77] + - [57, 6121.0] + - - [1024, 1024, 1, 4059, 1024, 1024, 1024, 1024] + - [103, 10655.0] + - - [1024, 1024, 1, 3944, 1024, 1024, 1024, 1024] + - [103, 10674.0] + - - [64, 193, 320, 193, 64, 64, 64, 193] + - [70, 8359.0] + - - [1024, 1024, 1, 3720, 1024, 1024, 1024, 1024] + - [103, 10650.0] + - - [1024, 1024, 1, 3910, 1024, 1024, 1024, 1024] + - [103, 10654.0] + - - [64, 143, 432, 143, 64, 64, 64, 143] + - [57, 8063.0] + - - [64, 92, 688, 93, 64, 64, 64, 92] + - [57, 7242.0] + - - [64, 101, 624, 101, 64, 64, 64, 101] + - [101, 7672.0] + - - [1024, 1024, 1, 3969, 1024, 1024, 1024, 1024] + - [103, 10659.0] + - - [1024, 1024, 1, 3948, 1024, 1024, 1024, 1024] + - [103, 10693.0] + - - [1024, 1024, 1, 3996, 1024, 1024, 1024, 1024] + - [81, 10671.0] + - - [1024, 1024, 1, 3900, 1024, 1024, 1024, 1024] + - [103, 10699.0] + - - [1024, 1024, 1, 3640, 1024, 1024, 1024, 1024] + - [103, 10676.0] + - - [64, 147, 432, 147, 64, 64, 64, 147] + - [79, 8319.0] + - - [1024, 1024, 1, 3751, 1024, 1024, 1024, 1024] + - [103, 10657.0] + - - [64, 177, 352, 177, 64, 64, 64, 177] + - [79, 9800.0] + - - [64, 85, 752, 85, 64, 64, 64, 85] + - [79, 6682.0] + - - [1024, 1024, 1, 3712, 1024, 1024, 1024, 1024] + - [81, 10643.0] + - - [1024, 1024, 1, 128, 1024, 1024, 1024, 1024] + - [59, 8693.0] + - - [64, 256, 192, 256, 64, 64, 64, 256] + - [70, 10885.0] + - - [64, 128, 384, 128, 64, 64, 64, 128] + - [57, 10448.0] + - - [64, 192, 36, 25088, 64, 64, 64, 192] + - [92, 8413.0] + - - [128, 128, 64, 25, 128, 128, 128, 128] + - [68, 3766.0] + - - [64, 192, 64, 3200, 64, 64, 64, 192] + - [92, 8753.0] + - - [64, 128, 64, 23104, 64, 64, 64, 128] + - [78, 7188.0] + - - [128, 128, 64, 1600, 128, 128, 128, 128] + - [74, 10363.0] + - - [80, 192, 64, 4608, 80, 80, 80, 192] + - [63, 7066.0] + - - [64, 128, 36, 30, 64, 64, 64, 128] + - [118, 2280.0] + - - [64, 128, 64, 11552, 64, 64, 64, 128] + - [85, 7149.0] + - - [128, 192, 64, 946, 128, 128, 128, 192] + - [81, 11023.0] + - - [64, 192, 64, 12800, 64, 64, 64, 192] + - [79, 8212.0] + - - [224, 224, 64, 128, 224, 224, 224, 224] + - [57, 8101.0] + - - [128, 128, 64, 3360, 128, 128, 128, 128] + - [106, 10322.0] + - - [128, 128, 64, 420, 128, 128, 128, 128] + - [103, 9923.0] + - - [64, 128, 64, 361, 64, 64, 64, 128] + - [57, 7565.0] + - - [64, 128, 36, 53824, 64, 64, 64, 128] + - [113, 7649.0] + - - [128, 160, 36, 512, 128, 128, 128, 160] + - [81, 7998.0] + - - [147, 64, 36, 18816, 147, 147, 147, 64] + - [105, 7311.0] + - - [96, 128, 64, 946, 96, 96, 96, 128] + - [59, 7689.0] + - - [128, 128, 64, 50, 128, 128, 128, 128] + - [58, 5601.0] + - - [160, 224, 36, 128, 160, 160, 160, 224] + - [57, 6659.0] + - - [192, 224, 64, 1152, 192, 192, 192, 224] + - [79, 10064.0] + - - [128, 128, 36, 784, 128, 128, 128, 128] + - [82, 9606.0] + - - [96, 128, 64, 288, 96, 96, 96, 128] + - [57, 6948.0] + - - [128, 128, 64, 400, 128, 128, 128, 128] + - [103, 9906.0] + - - [128, 128, 64, 800, 128, 128, 128, 128] + - [94, 10223.0] + - - [96, 128, 36, 512, 96, 96, 96, 128] + - [103, 6599.0] + - - [96, 128, 64, 800, 96, 96, 96, 128] + - [81, 7630.0] + - - [192, 224, 64, 128, 192, 192, 192, 224] + - [57, 8969.0] + - - [128, 128, 64, 288, 128, 128, 128, 128] + - [81, 9636.0] + - - [96, 208, 36, 512, 96, 96, 96, 208] + - [81, 6114.0] + - - [64, 128, 36, 1568, 64, 64, 64, 128] + - [79, 8249.0] + - - [192, 192, 36, 512, 192, 192, 192, 192] + - [79, 10230.0] + - - [128, 128, 36, 512, 128, 128, 128, 128] + - [104, 9218.0] + - - [96, 208, 64, 1152, 96, 96, 96, 208] + - [103, 6985.0] + - - [128, 192, 64, 3200, 128, 128, 128, 192] + - [86, 11258.0] + - - [160, 160, 64, 288, 160, 160, 160, 160] + - [79, 7298.0] + - - [128, 128, 36, 440, 128, 128, 128, 128] + - [60, 9030.0] + - - [96, 128, 36, 1568, 96, 96, 96, 128] + - [60, 7352.0] + - - [112, 224, 36, 2048, 112, 112, 112, 224] + - [108, 8232.0] + - - [128, 128, 36, 7040, 128, 128, 128, 128] + - [98, 10414.0] + - - [128, 128, 36, 1568, 128, 128, 128, 128] + - [82, 10020.0] + - - [160, 224, 64, 128, 160, 160, 160, 224] + - [79, 7422.0] + - - [192, 224, 36, 2592, 192, 192, 192, 224] + - [79, 9733.0] + - - [64, 128, 64, 2888, 64, 64, 64, 128] + - [70, 8050.0] + - - [64, 128, 36, 480, 64, 64, 64, 128] + - [79, 7078.0] + - - [147, 64, 64, 9702, 147, 147, 147, 64] + - [113, 7489.0] + - - [64, 192, 64, 3698, 64, 64, 64, 192] + - [70, 8677.0] + - - [73, 192, 64, 10439, 73, 73, 73, 192] + - [106, 6481.0] + - - [128, 128, 36, 880, 128, 128, 128, 128] + - [104, 9705.0] + - - [192, 224, 36, 128, 192, 192, 192, 224] + - [57, 8122.0] + - - [64, 128, 36, 12544, 64, 64, 64, 128] + - [57, 7728.0] + - - [160, 160, 36, 512, 160, 160, 160, 160] + - [57, 7077.0] + - - [128, 128, 36, 3136, 128, 128, 128, 128] + - [88, 10225.0] + - - [112, 224, 36, 512, 112, 112, 112, 224] + - [59, 7648.0] + - - [128, 128, 36, 49, 128, 128, 128, 128] + - [99, 4314.0] + - - [112, 224, 64, 1152, 112, 112, 112, 224] + - [103, 8728.0] + - - [128, 192, 36, 1568, 128, 128, 128, 192] + - [59, 10147.0] + - - [128, 192, 36, 512, 128, 128, 128, 192] + - [81, 9746.0] + - - [192, 192, 64, 288, 192, 192, 192, 192] + - [101, 10650.0] + - - [96, 208, 64, 242, 96, 96, 96, 208] + - [70, 6327.0] + - - [64, 128, 64, 5776, 64, 64, 64, 128] + - [80, 7144.0] + - - [128, 192, 64, 288, 128, 128, 128, 192] + - [81, 10535.0] + - - [96, 128, 36, 6272, 96, 96, 96, 128] + - [76, 7748.0] + - - [96, 128, 64, 3200, 96, 96, 96, 128] + - [74, 7874.0] + - - [128, 192, 64, 800, 128, 128, 128, 192] + - [81, 11003.0] + - - [64, 128, 64, 10, 64, 64, 64, 128] + - [64, 1498.0] + - - [96, 208, 64, 288, 96, 96, 96, 208] + - [103, 6540.0] + - - [64, 128, 64, 160, 64, 64, 64, 128] + - [57, 6433.0] + - - [128, 128, 64, 1568, 128, 128, 128, 128] + - [74, 10355.0] + - - [112, 224, 64, 242, 112, 112, 112, 224] + - [101, 8031.0] + - - [160, 192, 64, 288, 160, 160, 160, 192] + - [113, 8722.0] + - - [128, 160, 64, 288, 128, 128, 128, 160] + - [81, 8714.0] + - - [128, 128, 64, 210, 128, 128, 128, 128] + - [103, 9260.0] + - - [73, 192, 36, 23360, 73, 73, 73, 192] + - [63, 5965.0] + - - [160, 192, 36, 512, 160, 160, 160, 192] + - [79, 8451.0] + - - [64, 128, 64, 722, 64, 64, 64, 128] + - [79, 8130.0] + - - [112, 224, 64, 288, 112, 112, 112, 224] + - [101, 8096.0] + - - [64, 192, 36, 6272, 64, 64, 64, 192] + - [79, 8369.0] + - - [64, 128, 36, 6272, 64, 64, 64, 128] + - [109, 8203.0] + - - [128, 128, 36, 3200, 128, 128, 128, 128] + - [65, 10212.0] + - - [128, 128, 36, 392, 128, 128, 128, 128] + - [60, 8893.0] + - - [80, 192, 36, 10368, 80, 80, 80, 192] + - [106, 6508.0] + - - [224, 224, 36, 128, 224, 224, 224, 224] + - [57, 7531.0] + - - [64, 128, 36, 784, 64, 64, 64, 128] + - [79, 7702.0] + - - [128, 128, 64, 200, 128, 128, 128, 128] + - [81, 9304.0] + - - [5329, 64, 32, 80, 5329, 5329, 5329, 64] + - [81, 10565.0] + - - [64, 2048, 32, 384, 64, 64, 64, 2048] + - [101, 11197.0] + - - [289, 1792, 1, 320, 289, 289, 289, 1792] + - [79, 7256.0] + - - [1001, 1024, 1, 32, 1001, 1001, 1001, 1024] + - [107, 4481.0] + - - [784, 400, 1, 32, 784, 784, 784, 400] + - [99, 2472.0] + - - [64, 1536, 32, 256, 64, 64, 64, 1536] + - [79, 11148.0] + - - [289, 2592, 1, 384, 289, 289, 289, 2592] + - [101, 7758.0] + - - [64, 2048, 32, 448, 64, 64, 64, 2048] + - [92, 10438.0] + - - [289, 2016, 1, 256, 289, 289, 289, 2016] + - [79, 7673.0] + - - [64, 1536, 32, 384, 64, 64, 64, 1536] + - [79, 11237.0] + - - [64, 1280, 32, 320, 64, 64, 64, 1280] + - [57, 10974.0] + - - [289, 3456, 1, 384, 289, 289, 289, 3456] + - [57, 9050.0] + - - [64, 1280, 32, 384, 64, 64, 64, 1280] + - [57, 11126.0] + - - [729, 1600, 1, 192, 729, 729, 729, 1600] + - [81, 9111.0] + - - [289, 1344, 1, 192, 289, 289, 289, 1344] + - [57, 5728.0] + - - [64, 2048, 32, 320, 64, 64, 64, 2048] + - [113, 11267.0] + - - [64, 1280, 32, 448, 64, 64, 64, 1280] + - [57, 11126.0] + - - [64, 1280, 32, 192, 64, 64, 64, 1280] + - [70, 10704.0] + - - [289, 1792, 1, 256, 289, 289, 289, 1792] + - [57, 6956.0] + - - [64, 2048, 32, 192, 64, 64, 64, 2048] + - [92, 11117.0] + - - [5329, 64, 128, 80, 5329, 5329, 5329, 64] + - [84, 5695.0] + - - [64, 1280, 128, 448, 64, 64, 64, 1280] + - [101, 9539.0] + - - [64, 2048, 128, 192, 64, 64, 64, 2048] + - [70, 8299.0] + - - [64, 1280, 128, 384, 64, 64, 64, 1280] + - [101, 9250.0] + - - [64, 1280, 128, 320, 64, 64, 64, 1280] + - [57, 8903.0] + - - [64, 1280, 128, 192, 64, 64, 64, 1280] + - [75, 8039.0] + - - [256, 4096, 1, 6400, 256, 256, 256, 4096] + - [114, 10684.0] + - - [512, 2048, 1, 3427, 512, 512, 512, 2048] + - [81, 10652.0] + - - [512, 2048, 1, 3552, 512, 512, 512, 2048] + - [114, 10676.0] + - - [512, 2048, 1, 3840, 512, 512, 512, 2048] + - [103, 10691.0] + - - [2048, 512, 1, 3427, 2048, 2048, 2048, 512] + - [81, 10657.0] + - - [2048, 512, 1, 3452, 2048, 2048, 2048, 512] + - [81, 10655.0] + - - [2048, 512, 1, 3472, 2048, 2048, 2048, 512] + - [81, 10656.0] + - - [2048, 512, 1, 3475, 2048, 2048, 2048, 512] + - [114, 10643.0] + - - [64, 64, 496, 64, 64, 64, 64, 64] + - [93, 6865.0] + - - [64, 64, 496, 65, 64, 64, 64, 64] + - [70, 7092.0] + - - [64, 65, 496, 65, 64, 64, 64, 65] + - [101, 4680.0] + - - [64, 71, 448, 71, 64, 64, 64, 71] + - [101, 5086.0] + - - [64, 77, 408, 77, 64, 64, 64, 77] + - [101, 5394.0] + - - [64, 77, 408, 78, 64, 64, 64, 77] + - [57, 5386.0] + - - [64, 78, 408, 78, 64, 64, 64, 78] + - [101, 5422.0] + - - [64, 85, 376, 85, 64, 64, 64, 85] + - [79, 5926.0] + - - [64, 93, 344, 93, 64, 64, 64, 93] + - [101, 6481.0] + - - [64, 112, 288, 112, 64, 64, 64, 112] + - [101, 7697.0] + - - [64, 122, 264, 122, 64, 64, 64, 122] + - [57, 7913.0] + - - [64, 123, 264, 122, 64, 64, 64, 123] + - [79, 8044.0] + - - [64, 123, 264, 123, 64, 64, 64, 123] + - [101, 8105.0] + - - [64, 134, 240, 134, 64, 64, 64, 134] + - [79, 6919.0] + - - [64, 135, 240, 134, 64, 64, 64, 135] + - [57, 6996.0] + - - [64, 135, 240, 135, 64, 64, 64, 135] + - [57, 6977.0] + - - [64, 1280, 64, 192, 64, 64, 64, 1280] + - [57, 11222.0] + - - [64, 1280, 64, 320, 64, 64, 64, 1280] + - [57, 10531.0] + - - [64, 1280, 64, 384, 64, 64, 64, 1280] + - [79, 9697.0] + - - [64, 1280, 64, 448, 64, 64, 64, 1280] + - [101, 9656.0] + - - [64, 2048, 64, 192, 64, 64, 64, 2048] + - [70, 8659.0] + - - [64, 2048, 64, 320, 64, 64, 64, 2048] + - [101, 9453.0] + - - [64, 2048, 64, 384, 64, 64, 64, 2048] + - [57, 9732.0] + - - [64, 2048, 64, 448, 64, 64, 64, 2048] + - [101, 9936.0] + - - [3136, 64, 64, 64, 3136, 3136, 3136, 64] + - [79, 10763.0] + - - [3136, 64, 64, 256, 3136, 3136, 3136, 64] + - [103, 8886.0] + - - [5329, 64, 64, 80, 5329, 5329, 5329, 64] + - [110, 6100.0] + - - [257, 4096, 1, 1024, 257, 257, 257, 4096] + - [57, 8922.0] + - - [512, 2048, 1, 2790, 512, 512, 512, 2048] + - [59, 10627.0] + - - [512, 2048, 1, 2864, 512, 512, 512, 2048] + - [103, 10670.0] + - - [512, 2048, 1, 3092, 512, 512, 512, 2048] + - [81, 10634.0] + - - [512, 2048, 1, 3113, 512, 512, 512, 2048] + - [103, 10655.0] + - - [512, 2048, 1, 3137, 512, 512, 512, 2048] + - [103, 10664.0] + - - [512, 2048, 1, 3165, 512, 512, 512, 2048] + - [103, 10664.0] + - - [512, 2048, 1, 3166, 512, 512, 512, 2048] + - [103, 10640.0] + - - [512, 2048, 1, 3194, 512, 512, 512, 2048] + - [114, 10642.0] + - - [512, 2048, 1, 3219, 512, 512, 512, 2048] + - [103, 10640.0] + - - [512, 2048, 1, 3222, 512, 512, 512, 2048] + - [103, 10650.0] + - - [512, 2048, 1, 3234, 512, 512, 512, 2048] + - [103, 10656.0] + - - [512, 2048, 1, 3237, 512, 512, 512, 2048] + - [103, 10687.0] + - - [512, 2048, 1, 3242, 512, 512, 512, 2048] + - [103, 10637.0] + - - [512, 2048, 1, 3246, 512, 512, 512, 2048] + - [103, 10657.0] + - - [512, 2048, 1, 3249, 512, 512, 512, 2048] + - [81, 10647.0] + - - [512, 2048, 1, 3251, 512, 512, 512, 2048] + - [72, 10645.0] + - - [512, 2048, 1, 3257, 512, 512, 512, 2048] + - [103, 10653.0] + - - [512, 2048, 1, 3262, 512, 512, 512, 2048] + - [81, 10634.0] + - - [512, 2048, 1, 3268, 512, 512, 512, 2048] + - [103, 10659.0] + - - [512, 2048, 1, 3282, 512, 512, 512, 2048] + - [103, 10664.0] + - - [512, 2048, 1, 3286, 512, 512, 512, 2048] + - [81, 10665.0] + - - [512, 2048, 1, 3287, 512, 512, 512, 2048] + - [103, 10669.0] + - - [512, 2048, 1, 3293, 512, 512, 512, 2048] + - [81, 10651.0] + - - [512, 2048, 1, 3297, 512, 512, 512, 2048] + - [81, 10641.0] + - - [512, 2048, 1, 3307, 512, 512, 512, 2048] + - [103, 10635.0] + - - [512, 2048, 1, 3314, 512, 512, 512, 2048] + - [81, 10651.0] + - - [512, 2048, 1, 3315, 512, 512, 512, 2048] + - [94, 10637.0] + - - [512, 2048, 1, 3319, 512, 512, 512, 2048] + - [103, 10652.0] + - - [512, 2048, 1, 3322, 512, 512, 512, 2048] + - [103, 10682.0] + - - [512, 2048, 1, 3323, 512, 512, 512, 2048] + - [81, 10660.0] + - - [512, 2048, 1, 3324, 512, 512, 512, 2048] + - [103, 10660.0] + - - [512, 2048, 1, 3325, 512, 512, 512, 2048] + - [81, 10658.0] + - - [512, 2048, 1, 3327, 512, 512, 512, 2048] + - [103, 10666.0] + - - [512, 2048, 1, 3329, 512, 512, 512, 2048] + - [103, 10674.0] + - - [512, 2048, 1, 3332, 512, 512, 512, 2048] + - [94, 10636.0] + - - [512, 2048, 1, 3336, 512, 512, 512, 2048] + - [114, 10649.0] + - - [512, 2048, 1, 3339, 512, 512, 512, 2048] + - [81, 10659.0] + - - [512, 2048, 1, 3342, 512, 512, 512, 2048] + - [103, 10668.0] + - - [512, 2048, 1, 3344, 512, 512, 512, 2048] + - [103, 10667.0] + - - [512, 2048, 1, 3358, 512, 512, 512, 2048] + - [103, 10660.0] + - - [512, 2048, 1, 3360, 512, 512, 512, 2048] + - [103, 10666.0] + - - [512, 2048, 1, 3364, 512, 512, 512, 2048] + - [114, 10651.0] + - - [512, 2048, 1, 3365, 512, 512, 512, 2048] + - [114, 10665.0] + - - [512, 2048, 1, 3369, 512, 512, 512, 2048] + - [103, 10655.0] + - - [512, 2048, 1, 3371, 512, 512, 512, 2048] + - [103, 10682.0] + - - [512, 2048, 1, 3374, 512, 512, 512, 2048] + - [81, 10640.0] + - - [512, 2048, 1, 3376, 512, 512, 512, 2048] + - [103, 10653.0] + - - [512, 2048, 1, 3377, 512, 512, 512, 2048] + - [72, 10635.0] + - - [512, 2048, 1, 3378, 512, 512, 512, 2048] + - [103, 10650.0] + - - [512, 2048, 1, 3381, 512, 512, 512, 2048] + - [81, 10645.0] + - - [512, 2048, 1, 3382, 512, 512, 512, 2048] + - [103, 10678.0] + - - [512, 2048, 1, 3383, 512, 512, 512, 2048] + - [81, 10658.0] + - - [512, 2048, 1, 3384, 512, 512, 512, 2048] + - [103, 10691.0] + - - [512, 2048, 1, 3385, 512, 512, 512, 2048] + - [103, 10662.0] + - - [512, 2048, 1, 3386, 512, 512, 512, 2048] + - [81, 10664.0] + - - [512, 2048, 1, 3388, 512, 512, 512, 2048] + - [103, 10694.0] + - - [512, 2048, 1, 3390, 512, 512, 512, 2048] + - [103, 10680.0] + - - [512, 2048, 1, 3391, 512, 512, 512, 2048] + - [81, 10655.0] + - - [512, 2048, 1, 3396, 512, 512, 512, 2048] + - [103, 10663.0] + - - [512, 2048, 1, 3399, 512, 512, 512, 2048] + - [81, 10646.0] + - - [512, 2048, 1, 3402, 512, 512, 512, 2048] + - [103, 10674.0] + - - [512, 2048, 1, 3410, 512, 512, 512, 2048] + - [114, 10655.0] + - - [512, 2048, 1, 3412, 512, 512, 512, 2048] + - [114, 10644.0] + - - [512, 2048, 1, 3414, 512, 512, 512, 2048] + - [103, 10664.0] + - - [512, 2048, 1, 3415, 512, 512, 512, 2048] + - [81, 10665.0] + - - [512, 2048, 1, 3418, 512, 512, 512, 2048] + - [103, 10666.0] + - - [512, 2048, 1, 3420, 512, 512, 512, 2048] + - [81, 10648.0] + - - [512, 2048, 1, 3422, 512, 512, 512, 2048] + - [103, 10658.0] + - - [512, 2048, 1, 3425, 512, 512, 512, 2048] + - [114, 10663.0] + - - [512, 2048, 1, 3426, 512, 512, 512, 2048] + - [81, 10616.0] + - - [512, 2048, 1, 3428, 512, 512, 512, 2048] + - [81, 10649.0] + - - [512, 2048, 1, 3430, 512, 512, 512, 2048] + - [81, 10663.0] + - - [512, 2048, 1, 3431, 512, 512, 512, 2048] + - [103, 10679.0] + - - [512, 2048, 1, 3432, 512, 512, 512, 2048] + - [103, 10679.0] + - - [512, 2048, 1, 3438, 512, 512, 512, 2048] + - [81, 10666.0] + - - [512, 2048, 1, 3439, 512, 512, 512, 2048] + - [103, 10679.0] + - - [512, 2048, 1, 3440, 512, 512, 512, 2048] + - [114, 10678.0] + - - [512, 2048, 1, 3443, 512, 512, 512, 2048] + - [103, 10639.0] + - - [512, 2048, 1, 3445, 512, 512, 512, 2048] + - [81, 10643.0] + - - [512, 2048, 1, 3447, 512, 512, 512, 2048] + - [103, 10634.0] + - - [512, 2048, 1, 3448, 512, 512, 512, 2048] + - [103, 10648.0] + - - [512, 2048, 1, 3450, 512, 512, 512, 2048] + - [103, 10668.0] + - - [512, 2048, 1, 3451, 512, 512, 512, 2048] + - [94, 10664.0] + - - [512, 2048, 1, 3452, 512, 512, 512, 2048] + - [103, 10671.0] + - - [512, 2048, 1, 3453, 512, 512, 512, 2048] + - [114, 10658.0] + - - [512, 2048, 1, 3455, 512, 512, 512, 2048] + - [103, 10612.0] + - - [512, 2048, 1, 3456, 512, 512, 512, 2048] + - [103, 10645.0] + - - [512, 2048, 1, 3457, 512, 512, 512, 2048] + - [103, 10625.0] + - - [512, 2048, 1, 3458, 512, 512, 512, 2048] + - [103, 10634.0] + - - [512, 2048, 1, 3459, 512, 512, 512, 2048] + - [81, 10650.0] + - - [512, 2048, 1, 3460, 512, 512, 512, 2048] + - [103, 10635.0] + - - [512, 2048, 1, 3461, 512, 512, 512, 2048] + - [103, 10674.0] + - - [512, 2048, 1, 3462, 512, 512, 512, 2048] + - [103, 10634.0] + - - [512, 2048, 1, 3466, 512, 512, 512, 2048] + - [103, 10634.0] + - - [512, 2048, 1, 3467, 512, 512, 512, 2048] + - [103, 10637.0] + - - [512, 2048, 1, 3468, 512, 512, 512, 2048] + - [103, 10640.0] + - - [512, 2048, 1, 3470, 512, 512, 512, 2048] + - [103, 10648.0] + - - [512, 2048, 1, 3471, 512, 512, 512, 2048] + - [81, 10623.0] + - - [512, 2048, 1, 3472, 512, 512, 512, 2048] + - [103, 10651.0] + - - [512, 2048, 1, 3475, 512, 512, 512, 2048] + - [103, 10636.0] + - - [512, 2048, 1, 3476, 512, 512, 512, 2048] + - [103, 10661.0] + - - [512, 2048, 1, 3477, 512, 512, 512, 2048] + - [103, 10625.0] + - - [512, 2048, 1, 3478, 512, 512, 512, 2048] + - [103, 10622.0] + - - [512, 2048, 1, 3479, 512, 512, 512, 2048] + - [59, 10647.0] + - - [512, 2048, 1, 3480, 512, 512, 512, 2048] + - [103, 10670.0] + - - [512, 2048, 1, 3481, 512, 512, 512, 2048] + - [103, 10655.0] + - - [512, 2048, 1, 3483, 512, 512, 512, 2048] + - [81, 10643.0] + - - [512, 2048, 1, 3484, 512, 512, 512, 2048] + - [59, 10630.0] + - - [512, 2048, 1, 3487, 512, 512, 512, 2048] + - [103, 10630.0] + - - [512, 2048, 1, 3489, 512, 512, 512, 2048] + - [103, 10648.0] + - - [512, 2048, 1, 3490, 512, 512, 512, 2048] + - [114, 10647.0] + - - [512, 2048, 1, 3491, 512, 512, 512, 2048] + - [103, 10630.0] + - - [512, 2048, 1, 3493, 512, 512, 512, 2048] + - [103, 10649.0] + - - [512, 2048, 1, 3494, 512, 512, 512, 2048] + - [103, 10644.0] + - - [512, 2048, 1, 3495, 512, 512, 512, 2048] + - [103, 10637.0] + - - [512, 2048, 1, 3497, 512, 512, 512, 2048] + - [81, 10646.0] + - - [512, 2048, 1, 3498, 512, 512, 512, 2048] + - [103, 10632.0] + - - [512, 2048, 1, 3499, 512, 512, 512, 2048] + - [103, 10634.0] + - - [512, 2048, 1, 3501, 512, 512, 512, 2048] + - [103, 10660.0] + - - [512, 2048, 1, 3503, 512, 512, 512, 2048] + - [114, 10644.0] + - - [512, 2048, 1, 3507, 512, 512, 512, 2048] + - [103, 10652.0] + - - [512, 2048, 1, 3508, 512, 512, 512, 2048] + - [59, 10641.0] + - - [512, 2048, 1, 3509, 512, 512, 512, 2048] + - [103, 10636.0] + - - [512, 2048, 1, 3511, 512, 512, 512, 2048] + - [103, 10650.0] + - - [512, 2048, 1, 3514, 512, 512, 512, 2048] + - [81, 10664.0] + - - [512, 2048, 1, 3515, 512, 512, 512, 2048] + - [72, 10629.0] + - - [512, 2048, 1, 3517, 512, 512, 512, 2048] + - [103, 10626.0] + - - [512, 2048, 1, 3518, 512, 512, 512, 2048] + - [114, 10621.0] + - - [512, 2048, 1, 3519, 512, 512, 512, 2048] + - [103, 10648.0] + - - [512, 2048, 1, 3520, 512, 512, 512, 2048] + - [103, 10644.0] + - - [512, 2048, 1, 3523, 512, 512, 512, 2048] + - [103, 10653.0] + - - [512, 2048, 1, 3528, 512, 512, 512, 2048] + - [103, 10658.0] + - - [512, 2048, 1, 3529, 512, 512, 512, 2048] + - [103, 10646.0] + - - [512, 2048, 1, 3530, 512, 512, 512, 2048] + - [72, 10634.0] + - - [512, 2048, 1, 3532, 512, 512, 512, 2048] + - [81, 10639.0] + - - [512, 2048, 1, 3533, 512, 512, 512, 2048] + - [103, 10647.0] + - - [512, 2048, 1, 3534, 512, 512, 512, 2048] + - [103, 10647.0] + - - [512, 2048, 1, 3538, 512, 512, 512, 2048] + - [103, 10675.0] + - - [512, 2048, 1, 3539, 512, 512, 512, 2048] + - [103, 10639.0] + - - [512, 2048, 1, 3541, 512, 512, 512, 2048] + - [114, 10642.0] + - - [512, 2048, 1, 3547, 512, 512, 512, 2048] + - [103, 10658.0] + - - [512, 2048, 1, 3548, 512, 512, 512, 2048] + - [103, 10666.0] + - - [512, 2048, 1, 3564, 512, 512, 512, 2048] + - [103, 10692.0] + - - [512, 2048, 1, 3575, 512, 512, 512, 2048] + - [103, 10642.0] + - - [512, 2048, 1, 3598, 512, 512, 512, 2048] + - [103, 10671.0] + - - [512, 2048, 1, 3599, 512, 512, 512, 2048] + - [72, 10688.0] + - - [512, 2048, 1, 3608, 512, 512, 512, 2048] + - [114, 10676.0] + - - [512, 2048, 1, 3780, 512, 512, 512, 2048] + - [81, 10660.0] + - - [512, 2048, 1, 3796, 512, 512, 512, 2048] + - [81, 10661.0] + - - [512, 2048, 1, 3822, 512, 512, 512, 2048] + - [81, 10668.0] + - - [512, 2048, 1, 3859, 512, 512, 512, 2048] + - [103, 10663.0] + - - [512, 2048, 1, 3870, 512, 512, 512, 2048] + - [81, 10680.0] + - - [512, 2048, 1, 3876, 512, 512, 512, 2048] + - [81, 10635.0] + - - [512, 2048, 1, 3906, 512, 512, 512, 2048] + - [81, 10692.0] + - - [512, 2048, 1, 3910, 512, 512, 512, 2048] + - [114, 10667.0] + - - [512, 2048, 1, 3925, 512, 512, 512, 2048] + - [81, 10682.0] + - - [512, 2048, 1, 3942, 512, 512, 512, 2048] + - [114, 10657.0] + - - [512, 2048, 1, 3944, 512, 512, 512, 2048] + - [103, 10679.0] + - - [512, 2048, 1, 3955, 512, 512, 512, 2048] + - [114, 10634.0] + - - [512, 2048, 1, 3968, 512, 512, 512, 2048] + - [59, 10665.0] + - - [512, 2048, 1, 3969, 512, 512, 512, 2048] + - [103, 10681.0] + - - [512, 2048, 1, 3976, 512, 512, 512, 2048] + - [103, 10684.0] + - - [512, 2048, 1, 3977, 512, 512, 512, 2048] + - [81, 10680.0] + - - [512, 2048, 1, 3978, 512, 512, 512, 2048] + - [114, 10676.0] + - - [512, 2048, 1, 3990, 512, 512, 512, 2048] + - [103, 10655.0] + - - [512, 2048, 1, 3995, 512, 512, 512, 2048] + - [114, 10666.0] + - - [512, 2048, 1, 3996, 512, 512, 512, 2048] + - [81, 10673.0] + - - [512, 2048, 1, 3999, 512, 512, 512, 2048] + - [103, 10694.0] + - - [512, 2048, 1, 4005, 512, 512, 512, 2048] + - [81, 10674.0] + - - [512, 2048, 1, 4012, 512, 512, 512, 2048] + - [103, 10655.0] + - - [512, 2048, 1, 4020, 512, 512, 512, 2048] + - [59, 10658.0] + - - [512, 2048, 1, 4026, 512, 512, 512, 2048] + - [103, 10695.0] + - - [512, 2048, 1, 4030, 512, 512, 512, 2048] + - [114, 10667.0] + - - [512, 2048, 1, 4032, 512, 512, 512, 2048] + - [103, 10676.0] + - - [2048, 512, 1, 2790, 2048, 2048, 2048, 512] + - [114, 10643.0] + - - [2048, 512, 1, 2864, 2048, 2048, 2048, 512] + - [103, 10661.0] + - - [2048, 512, 1, 3092, 2048, 2048, 2048, 512] + - [81, 10636.0] + - - [2048, 512, 1, 3113, 2048, 2048, 2048, 512] + - [81, 10634.0] + - - [2048, 512, 1, 3137, 2048, 2048, 2048, 512] + - [103, 10660.0] + - - [2048, 512, 1, 3165, 2048, 2048, 2048, 512] + - [94, 10634.0] + - - [2048, 512, 1, 3166, 2048, 2048, 2048, 512] + - [81, 10648.0] + - - [2048, 512, 1, 3194, 2048, 2048, 2048, 512] + - [94, 10635.0] + - - [2048, 512, 1, 3219, 2048, 2048, 2048, 512] + - [103, 10652.0] + - - [2048, 512, 1, 3222, 2048, 2048, 2048, 512] + - [81, 10658.0] + - - [2048, 512, 1, 3234, 2048, 2048, 2048, 512] + - [81, 10652.0] + - - [2048, 512, 1, 3237, 2048, 2048, 2048, 512] + - [114, 10624.0] + - - [2048, 512, 1, 3242, 2048, 2048, 2048, 512] + - [103, 10673.0] + - - [2048, 512, 1, 3246, 2048, 2048, 2048, 512] + - [103, 10661.0] + - - [2048, 512, 1, 3249, 2048, 2048, 2048, 512] + - [103, 10654.0] + - - [2048, 512, 1, 3251, 2048, 2048, 2048, 512] + - [103, 10650.0] + - - [2048, 512, 1, 3257, 2048, 2048, 2048, 512] + - [103, 10663.0] + - - [2048, 512, 1, 3262, 2048, 2048, 2048, 512] + - [103, 10679.0] + - - [2048, 512, 1, 3268, 2048, 2048, 2048, 512] + - [81, 10668.0] + - - [2048, 512, 1, 3282, 2048, 2048, 2048, 512] + - [103, 10678.0] + - - [2048, 512, 1, 3286, 2048, 2048, 2048, 512] + - [114, 10648.0] + - - [2048, 512, 1, 3287, 2048, 2048, 2048, 512] + - [94, 10620.0] + - - [2048, 512, 1, 3293, 2048, 2048, 2048, 512] + - [114, 10649.0] + - - [2048, 512, 1, 3297, 2048, 2048, 2048, 512] + - [103, 10642.0] + - - [2048, 512, 1, 3307, 2048, 2048, 2048, 512] + - [114, 10652.0] + - - [2048, 512, 1, 3314, 2048, 2048, 2048, 512] + - [59, 10650.0] + - - [2048, 512, 1, 3315, 2048, 2048, 2048, 512] + - [81, 10656.0] + - - [2048, 512, 1, 3319, 2048, 2048, 2048, 512] + - [103, 10670.0] + - - [2048, 512, 1, 3322, 2048, 2048, 2048, 512] + - [81, 10643.0] + - - [2048, 512, 1, 3323, 2048, 2048, 2048, 512] + - [103, 10639.0] + - - [2048, 512, 1, 3324, 2048, 2048, 2048, 512] + - [103, 10645.0] + - - [2048, 512, 1, 3325, 2048, 2048, 2048, 512] + - [103, 10657.0] + - - [2048, 512, 1, 3327, 2048, 2048, 2048, 512] + - [103, 10656.0] + - - [2048, 512, 1, 3329, 2048, 2048, 2048, 512] + - [114, 10648.0] + - - [2048, 512, 1, 3332, 2048, 2048, 2048, 512] + - [103, 10677.0] + - - [2048, 512, 1, 3336, 2048, 2048, 2048, 512] + - [103, 10652.0] + - - [2048, 512, 1, 3339, 2048, 2048, 2048, 512] + - [81, 10648.0] + - - [2048, 512, 1, 3342, 2048, 2048, 2048, 512] + - [103, 10666.0] + - - [2048, 512, 1, 3344, 2048, 2048, 2048, 512] + - [103, 10653.0] + - - [2048, 512, 1, 3358, 2048, 2048, 2048, 512] + - [81, 10646.0] + - - [2048, 512, 1, 3360, 2048, 2048, 2048, 512] + - [114, 10651.0] + - - [2048, 512, 1, 3364, 2048, 2048, 2048, 512] + - [103, 10649.0] + - - [2048, 512, 1, 3365, 2048, 2048, 2048, 512] + - [81, 10655.0] + - - [2048, 512, 1, 3369, 2048, 2048, 2048, 512] + - [81, 10642.0] + - - [2048, 512, 1, 3371, 2048, 2048, 2048, 512] + - [103, 10665.0] + - - [2048, 512, 1, 3374, 2048, 2048, 2048, 512] + - [81, 10655.0] + - - [2048, 512, 1, 3376, 2048, 2048, 2048, 512] + - [81, 10638.0] + - - [2048, 512, 1, 3377, 2048, 2048, 2048, 512] + - [103, 10666.0] + - - [2048, 512, 1, 3378, 2048, 2048, 2048, 512] + - [81, 10654.0] + - - [2048, 512, 1, 3381, 2048, 2048, 2048, 512] + - [103, 10636.0] + - - [2048, 512, 1, 3382, 2048, 2048, 2048, 512] + - [94, 10637.0] + - - [2048, 512, 1, 3383, 2048, 2048, 2048, 512] + - [81, 10639.0] + - - [2048, 512, 1, 3384, 2048, 2048, 2048, 512] + - [81, 10655.0] + - - [2048, 512, 1, 3385, 2048, 2048, 2048, 512] + - [103, 10683.0] + - - [2048, 512, 1, 3386, 2048, 2048, 2048, 512] + - [81, 10658.0] + - - [2048, 512, 1, 3388, 2048, 2048, 2048, 512] + - [103, 10677.0] + - - [2048, 512, 1, 3390, 2048, 2048, 2048, 512] + - [103, 10653.0] + - - [2048, 512, 1, 3391, 2048, 2048, 2048, 512] + - [81, 10651.0] + - - [2048, 512, 1, 3396, 2048, 2048, 2048, 512] + - [81, 10667.0] + - - [2048, 512, 1, 3399, 2048, 2048, 2048, 512] + - [103, 10672.0] + - - [2048, 512, 1, 3402, 2048, 2048, 2048, 512] + - [94, 10627.0] + - - [2048, 512, 1, 3410, 2048, 2048, 2048, 512] + - [103, 10642.0] + - - [2048, 512, 1, 3412, 2048, 2048, 2048, 512] + - [81, 10664.0] + - - [2048, 512, 1, 3414, 2048, 2048, 2048, 512] + - [81, 10642.0] + - - [2048, 512, 1, 3415, 2048, 2048, 2048, 512] + - [81, 10671.0] + - - [2048, 512, 1, 3418, 2048, 2048, 2048, 512] + - [81, 10669.0] + - - [2048, 512, 1, 3420, 2048, 2048, 2048, 512] + - [81, 10650.0] + - - [2048, 512, 1, 3422, 2048, 2048, 2048, 512] + - [81, 10632.0] + - - [2048, 512, 1, 3425, 2048, 2048, 2048, 512] + - [81, 10654.0] + - - [2048, 512, 1, 3426, 2048, 2048, 2048, 512] + - [81, 10643.0] + - - [2048, 512, 1, 3428, 2048, 2048, 2048, 512] + - [94, 10631.0] + - - [2048, 512, 1, 3430, 2048, 2048, 2048, 512] + - [81, 10685.0] + - - [2048, 512, 1, 3431, 2048, 2048, 2048, 512] + - [103, 10680.0] + - - [2048, 512, 1, 3432, 2048, 2048, 2048, 512] + - [81, 10666.0] + - - [2048, 512, 1, 3438, 2048, 2048, 2048, 512] + - [114, 10637.0] + - - [2048, 512, 1, 3439, 2048, 2048, 2048, 512] + - [114, 10664.0] + - - [2048, 512, 1, 3440, 2048, 2048, 2048, 512] + - [81, 10646.0] + - - [2048, 512, 1, 3443, 2048, 2048, 2048, 512] + - [114, 10647.0] + - - [2048, 512, 1, 3445, 2048, 2048, 2048, 512] + - [103, 10650.0] + - - [2048, 512, 1, 3447, 2048, 2048, 2048, 512] + - [103, 10646.0] + - - [2048, 512, 1, 3448, 2048, 2048, 2048, 512] + - [103, 10649.0] + - - [2048, 512, 1, 3450, 2048, 2048, 2048, 512] + - [114, 10629.0] + - - [2048, 512, 1, 3451, 2048, 2048, 2048, 512] + - [103, 10664.0] + - - [2048, 512, 1, 3453, 2048, 2048, 2048, 512] + - [103, 10649.0] + - - [2048, 512, 1, 3455, 2048, 2048, 2048, 512] + - [81, 10630.0] + - - [2048, 512, 1, 3456, 2048, 2048, 2048, 512] + - [81, 10664.0] + - - [2048, 512, 1, 3457, 2048, 2048, 2048, 512] + - [103, 10686.0] + - - [2048, 512, 1, 3458, 2048, 2048, 2048, 512] + - [103, 10670.0] + - - [2048, 512, 1, 3459, 2048, 2048, 2048, 512] + - [81, 10662.0] + - - [2048, 512, 1, 3460, 2048, 2048, 2048, 512] + - [114, 10667.0] + - - [2048, 512, 1, 3461, 2048, 2048, 2048, 512] + - [103, 10647.0] + - - [2048, 512, 1, 3462, 2048, 2048, 2048, 512] + - [103, 10679.0] + - - [2048, 512, 1, 3466, 2048, 2048, 2048, 512] + - [81, 10674.0] + - - [2048, 512, 1, 3467, 2048, 2048, 2048, 512] + - [103, 10655.0] + - - [2048, 512, 1, 3468, 2048, 2048, 2048, 512] + - [114, 10637.0] + - - [2048, 512, 1, 3470, 2048, 2048, 2048, 512] + - [114, 10650.0] + - - [2048, 512, 1, 3471, 2048, 2048, 2048, 512] + - [114, 10645.0] + - - [2048, 512, 1, 3476, 2048, 2048, 2048, 512] + - [103, 10688.0] + - - [2048, 512, 1, 3477, 2048, 2048, 2048, 512] + - [94, 10631.0] + - - [2048, 512, 1, 3478, 2048, 2048, 2048, 512] + - [72, 10672.0] + - - [2048, 512, 1, 3479, 2048, 2048, 2048, 512] + - [94, 10654.0] + - - [2048, 512, 1, 3480, 2048, 2048, 2048, 512] + - [103, 10683.0] + - - [2048, 512, 1, 3481, 2048, 2048, 2048, 512] + - [81, 10647.0] + - - [2048, 512, 1, 3483, 2048, 2048, 2048, 512] + - [103, 10659.0] + - - [2048, 512, 1, 3484, 2048, 2048, 2048, 512] + - [81, 10680.0] + - - [2048, 512, 1, 3487, 2048, 2048, 2048, 512] + - [81, 10645.0] + - - [2048, 512, 1, 3489, 2048, 2048, 2048, 512] + - [103, 10669.0] + - - [2048, 512, 1, 3490, 2048, 2048, 2048, 512] + - [94, 10647.0] + - - [2048, 512, 1, 3491, 2048, 2048, 2048, 512] + - [81, 10651.0] + - - [2048, 512, 1, 3493, 2048, 2048, 2048, 512] + - [81, 10670.0] + - - [2048, 512, 1, 3494, 2048, 2048, 2048, 512] + - [103, 10662.0] + - - [2048, 512, 1, 3495, 2048, 2048, 2048, 512] + - [94, 10658.0] + - - [2048, 512, 1, 3497, 2048, 2048, 2048, 512] + - [103, 10668.0] + - - [2048, 512, 1, 3498, 2048, 2048, 2048, 512] + - [81, 10647.0] + - - [2048, 512, 1, 3499, 2048, 2048, 2048, 512] + - [81, 10651.0] + - - [2048, 512, 1, 3501, 2048, 2048, 2048, 512] + - [81, 10672.0] + - - [2048, 512, 1, 3503, 2048, 2048, 2048, 512] + - [81, 10655.0] + - - [2048, 512, 1, 3507, 2048, 2048, 2048, 512] + - [81, 10647.0] + - - [2048, 512, 1, 3508, 2048, 2048, 2048, 512] + - [114, 10662.0] + - - [2048, 512, 1, 3509, 2048, 2048, 2048, 512] + - [81, 10658.0] + - - [2048, 512, 1, 3511, 2048, 2048, 2048, 512] + - [94, 10627.0] + - - [2048, 512, 1, 3514, 2048, 2048, 2048, 512] + - [81, 10674.0] + - - [2048, 512, 1, 3515, 2048, 2048, 2048, 512] + - [103, 10652.0] + - - [2048, 512, 1, 3517, 2048, 2048, 2048, 512] + - [114, 10647.0] + - - [2048, 512, 1, 3518, 2048, 2048, 2048, 512] + - [103, 10647.0] + - - [2048, 512, 1, 3519, 2048, 2048, 2048, 512] + - [103, 10683.0] + - - [2048, 512, 1, 3520, 2048, 2048, 2048, 512] + - [59, 10651.0] + - - [2048, 512, 1, 3523, 2048, 2048, 2048, 512] + - [103, 10667.0] + - - [2048, 512, 1, 3528, 2048, 2048, 2048, 512] + - [103, 10664.0] + - - [2048, 512, 1, 3529, 2048, 2048, 2048, 512] + - [103, 10660.0] + - - [2048, 512, 1, 3530, 2048, 2048, 2048, 512] + - [103, 10654.0] + - - [2048, 512, 1, 3532, 2048, 2048, 2048, 512] + - [81, 10665.0] + - - [2048, 512, 1, 3533, 2048, 2048, 2048, 512] + - [94, 10656.0] + - - [2048, 512, 1, 3534, 2048, 2048, 2048, 512] + - [81, 10638.0] + - - [2048, 512, 1, 3538, 2048, 2048, 2048, 512] + - [114, 10657.0] + - - [2048, 512, 1, 3539, 2048, 2048, 2048, 512] + - [103, 10631.0] + - - [2048, 512, 1, 3541, 2048, 2048, 2048, 512] + - [81, 10645.0] + - - [2048, 512, 1, 3547, 2048, 2048, 2048, 512] + - [94, 10658.0] + - - [2048, 512, 1, 3548, 2048, 2048, 2048, 512] + - [94, 10660.0] + - - [2048, 512, 1, 3552, 2048, 2048, 2048, 512] + - [94, 10663.0] + - - [2048, 512, 1, 3564, 2048, 2048, 2048, 512] + - [94, 10659.0] + - - [2048, 512, 1, 3575, 2048, 2048, 2048, 512] + - [103, 10656.0] + - - [2048, 512, 1, 3598, 2048, 2048, 2048, 512] + - [103, 10673.0] + - - [2048, 512, 1, 3599, 2048, 2048, 2048, 512] + - [81, 10649.0] + - - [2048, 512, 1, 3608, 2048, 2048, 2048, 512] + - [103, 10663.0] + - - [2048, 512, 1, 3780, 2048, 2048, 2048, 512] + - [103, 10646.0] + - - [2048, 512, 1, 3796, 2048, 2048, 2048, 512] + - [81, 10679.0] + - - [2048, 512, 1, 3822, 2048, 2048, 2048, 512] + - [81, 10662.0] + - - [2048, 512, 1, 3840, 2048, 2048, 2048, 512] + - [86, 10591.0] + - - [2048, 512, 1, 3859, 2048, 2048, 2048, 512] + - [81, 10641.0] + - - [2048, 512, 1, 3870, 2048, 2048, 2048, 512] + - [81, 10620.0] + - - [2048, 512, 1, 3876, 2048, 2048, 2048, 512] + - [81, 10626.0] + - - [2048, 512, 1, 3906, 2048, 2048, 2048, 512] + - [81, 10668.0] + - - [2048, 512, 1, 3910, 2048, 2048, 2048, 512] + - [81, 10687.0] + - - [2048, 512, 1, 3925, 2048, 2048, 2048, 512] + - [94, 10652.0] + - - [2048, 512, 1, 3942, 2048, 2048, 2048, 512] + - [81, 10651.0] + - - [2048, 512, 1, 3944, 2048, 2048, 2048, 512] + - [103, 10671.0] + - - [2048, 512, 1, 3955, 2048, 2048, 2048, 512] + - [81, 10677.0] + - - [2048, 512, 1, 3968, 2048, 2048, 2048, 512] + - [103, 10704.0] + - - [2048, 512, 1, 3969, 2048, 2048, 2048, 512] + - [94, 10667.0] + - - [2048, 512, 1, 3976, 2048, 2048, 2048, 512] + - [103, 10702.0] + - - [2048, 512, 1, 3977, 2048, 2048, 2048, 512] + - [114, 10674.0] + - - [2048, 512, 1, 3978, 2048, 2048, 2048, 512] + - [81, 10640.0] + - - [2048, 512, 1, 3990, 2048, 2048, 2048, 512] + - [81, 10671.0] + - - [2048, 512, 1, 3995, 2048, 2048, 2048, 512] + - [103, 10675.0] + - - [2048, 512, 1, 3996, 2048, 2048, 2048, 512] + - [81, 10684.0] + - - [2048, 512, 1, 3999, 2048, 2048, 2048, 512] + - [103, 10657.0] + - - [2048, 512, 1, 4005, 2048, 2048, 2048, 512] + - [81, 10683.0] + - - [2048, 512, 1, 4012, 2048, 2048, 2048, 512] + - [81, 10662.0] + - - [2048, 512, 1, 4020, 2048, 2048, 2048, 512] + - [103, 10686.0] + - - [2048, 512, 1, 4026, 2048, 2048, 2048, 512] + - [81, 10682.0] + - - [2048, 512, 1, 4030, 2048, 2048, 2048, 512] + - [103, 10684.0] + - - [2048, 512, 1, 4032, 2048, 2048, 2048, 512] + - [81, 10676.0] + - - [64, 102, 312, 102, 64, 64, 64, 102] + - [57, 6911.0] + - - [64, 512, 16, 512, 64, 64, 64, 512] + - [79, 7914.0] + - - [64, 512, 96, 512, 64, 64, 64, 512] + - [79, 10445.0] + - - [1024, 1024, 1, 3840, 1024, 1024, 1024, 1024] + - [81, 10687.0] + - - [1024, 1024, 1, 3968, 1024, 1024, 1024, 1024] + - [81, 10653.0] + - - [1024, 1024, 1, 7200, 1024, 1024, 1024, 1024] + - [81, 10711.0] + - - [1024, 1024, 1, 8160, 1024, 1024, 1024, 1024] + - [103, 10715.0] + - - [768, 768, 1, 384, 768, 768, 768, 768] + - [82, 8875.0] + - - [768, 384, 1, 384, 768, 768, 768, 384] + - [72, 7034.0] + - - [1152, 576, 1, 384, 1152, 1152, 1152, 576] + - [59, 8415.0] + - - [384, 768, 1, 384, 384, 384, 384, 768] + - [81, 7025.0] + - - [1024, 1024, 1, 32, 1024, 1024, 1024, 1024] + - [87, 4609.0] + - - [64, 128, 512, 128, 64, 64, 64, 128] + - [79, 10690.0] + - - [64, 512, 64, 512, 64, 64, 64, 512] + - [57, 10939.0] + - - [1024, 1024, 1, 1600, 1024, 1024, 1024, 1024] + - [103, 10525.0] + - - [2048, 256, 1, 1024, 2048, 2048, 2048, 256] + - [82, 8775.0] + - - [256, 1280, 1, 8976, 256, 256, 256, 1280] + - [88, 10309.0] + - - [512, 2048, 1, 256, 512, 512, 512, 2048] + - [59, 9719.0] + - - [560, 1024, 1, 1600, 560, 560, 560, 1024] + - [60, 9721.0] + - - [560, 1024, 1, 200, 560, 560, 560, 1024] + - [60, 7506.0] + - - [1024, 1024, 1, 960, 1024, 1024, 1024, 1024] + - [103, 10410.0] + - - [2304, 128, 1, 128, 2304, 2304, 2304, 128] + - [100, 5033.0] + - - [2688, 128, 1, 128, 2688, 2688, 2688, 128] + - [102, 5193.0] + - - [3072, 128, 1, 128, 3072, 3072, 3072, 128] + - [100, 5471.0] + - - [3456, 128, 1, 128, 3456, 3456, 3456, 128] + - [81, 6062.0] + - - [3840, 128, 1, 128, 3840, 3840, 3840, 128] + - [57, 6473.0] + - - [4224, 128, 1, 128, 4224, 4224, 4224, 128] + - [58, 6303.0] + - - [4608, 128, 1, 128, 4608, 4608, 4608, 128] + - [57, 6765.0] + - - [4992, 128, 1, 128, 4992, 4992, 4992, 128] + - [79, 7329.0] + - - [5376, 128, 1, 128, 5376, 5376, 5376, 128] + - [57, 7046.0] + - - [5760, 128, 1, 128, 5760, 5760, 5760, 128] + - [103, 7538.0] + - - [6144, 128, 1, 128, 6144, 6144, 6144, 128] + - [72, 7877.0] + - - [6528, 128, 1, 128, 6528, 6528, 6528, 128] + - [59, 7717.0] + - - [6912, 128, 1, 128, 6912, 6912, 6912, 128] + - [70, 7975.0] + - - [7296, 128, 1, 128, 7296, 7296, 7296, 128] + - [101, 8478.0] + - - [7680, 128, 1, 128, 7680, 7680, 7680, 128] + - [59, 8654.0] + - - [8064, 128, 1, 128, 8064, 8064, 8064, 128] + - [103, 8480.0] + - - [8448, 128, 1, 128, 8448, 8448, 8448, 128] + - [59, 8794.0] + - - [8832, 128, 1, 128, 8832, 8832, 8832, 128] + - [101, 9112.0] + - - [2304, 128, 1, 256, 2304, 2304, 2304, 128] + - [59, 6178.0] + - - [2688, 128, 1, 256, 2688, 2688, 2688, 128] + - [59, 6000.0] + - - [3072, 128, 1, 256, 3072, 3072, 3072, 128] + - [72, 6693.0] + - - [3456, 128, 1, 256, 3456, 3456, 3456, 128] + - [103, 7500.0] + - - [3840, 128, 1, 256, 3840, 3840, 3840, 128] + - [57, 8171.0] + - - [4224, 128, 1, 256, 4224, 4224, 4224, 128] + - [103, 7690.0] + - - [4608, 128, 1, 256, 4608, 4608, 4608, 128] + - [103, 8144.0] + - - [4992, 128, 1, 256, 4992, 4992, 4992, 128] + - [60, 8880.0] + - - [5376, 128, 1, 256, 5376, 5376, 5376, 128] + - [103, 8325.0] + - - [5760, 128, 1, 256, 5760, 5760, 5760, 128] + - [81, 8771.0] + - - [6144, 128, 1, 256, 6144, 6144, 6144, 128] + - [59, 9201.0] + - - [6528, 128, 1, 256, 6528, 6528, 6528, 128] + - [60, 8781.0] + - - [6912, 128, 1, 256, 6912, 6912, 6912, 128] + - [81, 9155.0] + - - [7296, 128, 1, 256, 7296, 7296, 7296, 128] + - [115, 9617.0] + - - [7680, 128, 1, 256, 7680, 7680, 7680, 128] + - [81, 10002.0] + - - [8064, 128, 1, 256, 8064, 8064, 8064, 128] + - [81, 9498.0] + - - [8448, 128, 1, 256, 8448, 8448, 8448, 128] + - [81, 9706.0] + - - [8832, 128, 1, 256, 8832, 8832, 8832, 128] + - [103, 10270.0] + - - [768, 768, 1, 768, 768, 768, 768, 768] + - [82, 9654.0] + - - [384, 1536, 1, 384, 384, 384, 384, 1536] + - [60, 8931.0] + - - [384, 1920, 1, 384, 384, 384, 384, 1920] + - [81, 9344.0] + - - [384, 2304, 1, 384, 384, 384, 384, 2304] + - [60, 9679.0] + - - [64, 192, 64, 1280, 64, 64, 64, 192] + - [79, 10095.0] + - - [64, 320, 64, 1280, 64, 64, 64, 320] + - [79, 10678.0] + - - [64, 384, 64, 1280, 64, 64, 64, 384] + - [70, 9350.0] + - - [64, 448, 64, 1280, 64, 64, 64, 448] + - [79, 9795.0] + - - [64, 192, 64, 2048, 64, 64, 64, 192] + - [92, 9863.0] + - - [64, 320, 64, 2048, 64, 64, 64, 320] + - [79, 9566.0] + - - [64, 384, 64, 2048, 64, 64, 64, 384] + - [66, 8300.0] + - - [64, 448, 64, 2048, 64, 64, 64, 448] + - [57, 9584.0] + - - [1225, 64, 64, 192, 1225, 1225, 1225, 64] + - [79, 10807.0] + - - [1225, 64, 64, 256, 1225, 1225, 1225, 64] + - [79, 10836.0] + - - [1225, 64, 64, 288, 1225, 1225, 1225, 64] + - [81, 10799.0] + - - [5329, 80, 64, 64, 5329, 5329, 5329, 80] + - [101, 5897.0] + - - [64, 192, 32, 1280, 64, 64, 64, 192] + - [101, 8035.0] + - - [64, 320, 32, 1280, 64, 64, 64, 320] + - [105, 10316.0] + - - [64, 384, 32, 1280, 64, 64, 64, 384] + - [57, 10117.0] + - - [64, 448, 32, 1280, 64, 64, 64, 448] + - [57, 9904.0] + - - [64, 192, 32, 2048, 64, 64, 64, 192] + - [61, 8167.0] + - - [64, 320, 32, 2048, 64, 64, 64, 320] + - [61, 10543.0] + - - [64, 384, 32, 2048, 64, 64, 64, 384] + - [57, 10040.0] + - - [64, 448, 32, 2048, 64, 64, 64, 448] + - [79, 9956.0] + - - [1225, 64, 32, 192, 1225, 1225, 1225, 64] + - [57, 10162.0] + - - [1225, 64, 32, 256, 1225, 1225, 1225, 64] + - [81, 10633.0] + - - [1225, 64, 32, 288, 1225, 1225, 1225, 64] + - [72, 10666.0] + - - [5329, 80, 32, 64, 5329, 5329, 5329, 80] + - [101, 6427.0] + - - [289, 128, 32, 768, 289, 289, 289, 128] + - [79, 9847.0] + - - [289, 160, 32, 768, 289, 289, 289, 160] + - [57, 8205.0] + - - [289, 192, 32, 768, 289, 289, 289, 192] + - [57, 9863.0] + - - [3136, 64, 32, 64, 3136, 3136, 3136, 64] + - [101, 10124.0] + - - [3136, 64, 32, 256, 3136, 3136, 3136, 64] + - [57, 10233.0] + - - [196, 256, 32, 1024, 196, 196, 196, 256] + - [94, 8825.0] + - - [1024, 1024, 1, 6912, 1024, 1024, 1024, 1024] + - [81, 10719.0] + - - [1024, 512, 1, 4096, 1024, 1024, 1024, 512] + - [82, 9248.0] + - - [480, 1024, 1, 4096, 480, 480, 480, 1024] + - [60, 8647.0] + - - [1024, 512, 1, 6912, 1024, 1024, 1024, 512] + - [90, 9348.0] + - - [480, 1024, 1, 6912, 480, 480, 480, 1024] + - [65, 8726.0] + - - [100, 512, 120, 128, 100, 100, 100, 512] + - [57, 8301.0] + - - [100, 512, 18, 128, 100, 100, 100, 512] + - [113, 5886.0] + - - [100, 512, 19, 128, 100, 100, 100, 512] + - [101, 6289.0] + - - [1444, 576, 1, 128, 1444, 1444, 1444, 576] + - [72, 7529.0] + - - [173280, 64, 1, 128, 173280, 173280, 173280, 64] + - [69, 7708.0] + - - [25992, 64, 1, 128, 25992, 25992, 25992, 64] + - [79, 8634.0] + - - [27436, 64, 1, 128, 27436, 27436, 27436, 64] + - [92, 8725.0] + - - [361, 2304, 1, 512, 361, 361, 361, 2304] + - [72, 9080.0] + - - [960, 1024, 1, 1024, 960, 960, 960, 1024] + - [101, 10876.0] + - - [1024, 960, 1, 1024, 1024, 1024, 1024, 960] + - [59, 11145.0] + - - [1024, 1024, 1, 77, 1024, 1024, 1024, 1024] + - [57, 6854.0] + - - [64, 128, 160, 128, 64, 64, 64, 128] + - [79, 9383.0] + - - [1024, 1024, 1, 10, 1024, 1024, 1024, 1024] + - [63, 1783.0] + - - [64, 128, 624, 128, 64, 64, 64, 128] + - [57, 11038.0] + - - [1024, 1024, 1, 39, 1024, 1024, 1024, 1024] + - [99, 4690.0] + - - [1024, 1024, 1, 780, 1024, 1024, 1024, 1024] + - [59, 10314.0] + - - [1024, 1024, 1, 4992, 1024, 1024, 1024, 1024] + - [114, 10688.0] + - - [1024, 1024, 1, 308, 1024, 1024, 1024, 1024] + - [59, 9793.0] + - - [64, 128, 640, 128, 64, 64, 64, 128] + - [79, 10998.0] + - - [1024, 1024, 1, 40, 1024, 1024, 1024, 1024] + - [91, 5066.0] + - - [1024, 1024, 1, 800, 1024, 1024, 1024, 1024] + - [103, 10361.0] + - - [1024, 1024, 1, 5120, 1024, 1024, 1024, 1024] + - [103, 10735.0] + - - [64, 128, 656, 128, 64, 64, 64, 128] + - [79, 10853.0] + - - [1024, 1024, 1, 41, 1024, 1024, 1024, 1024] + - [93, 4756.0] + - - [1024, 1024, 1, 820, 1024, 1024, 1024, 1024] + - [72, 10354.0] + - - [1024, 1024, 1, 5248, 1024, 1024, 1024, 1024] + - [103, 10685.0] + - - [64, 512, 80, 512, 64, 64, 64, 512] + - [101, 11089.0] + - - [1024, 1024, 1, 5, 1024, 1024, 1024, 1024] + - [57, 978.0] + - - [1024, 1024, 1, 385, 1024, 1024, 1024, 1024] + - [103, 9988.0] + - - [1024, 1024, 1, 2560, 1024, 1024, 1024, 1024] + - [114, 10594.0] + - - [1024, 1024, 1, 462, 1024, 1024, 1024, 1024] + - [81, 10088.0] + - - [64, 128, 128, 128, 64, 64, 64, 128] + - [79, 8484.0] + - - [1024, 1024, 1, 8, 1024, 1024, 1024, 1024] + - [77, 1762.0] + - - [1024, 1024, 1, 160, 1024, 1024, 1024, 1024] + - [59, 9079.0] + - - [64, 128, 144, 128, 64, 64, 64, 128] + - [101, 8560.0] + - - [1024, 1024, 1, 9, 1024, 1024, 1024, 1024] + - [77, 1661.0] + - - [1024, 1024, 1, 180, 1024, 1024, 1024, 1024] + - [72, 9189.0] + - - [1024, 1024, 1, 1152, 1024, 1024, 1024, 1024] + - [103, 10457.0] + - - [1024, 1024, 1, 6528, 1024, 1024, 1024, 1024] + - [103, 10748.0] + - - [1024, 1024, 1, 7104, 1024, 1024, 1024, 1024] + - [81, 10737.0] + - - [1024, 1024, 1, 8064, 1024, 1024, 1024, 1024] + - [114, 10704.0] + - - [2048, 512, 1, 1, 2048, 2048, 2048, 512] + - [70, 222.0] + - - [1024, 1024, 1, 16, 1024, 1024, 1024, 1024] + - [77, 2923.0] + - - [512, 64, 256, 512, 512, 512, 512, 64] + - [79, 9191.0] + - - [64, 512, 256, 512, 64, 64, 64, 512] + - [79, 9105.0] + - - [512, 64, 128, 512, 512, 512, 512, 64] + - [106, 9043.0] + - - [64, 512, 128, 512, 64, 64, 64, 512] + - [79, 8800.0] + - - [512, 64, 40, 512, 512, 512, 512, 64] + - [103, 10980.0] + - - [64, 512, 40, 512, 64, 64, 64, 512] + - [101, 10713.0] + - - [1024, 96, 64, 1024, 1024, 1024, 1024, 96] + - [88, 9101.0] + - - [96, 1024, 64, 1024, 96, 96, 96, 1024] + - [97, 9122.0] + - - [1024, 96, 128, 1024, 1024, 1024, 1024, 96] + - [76, 9198.0] + - - [96, 1024, 128, 1024, 96, 96, 96, 1024] + - [88, 9222.0] + - - [1024, 64, 256, 1024, 1024, 1024, 1024, 64] + - [95, 10212.0] + - - [64, 1024, 256, 1024, 64, 64, 64, 1024] + - [79, 9999.0] + - - [1024, 64, 32, 1024, 1024, 1024, 1024, 64] + - [72, 10630.0] + - - [64, 1024, 32, 1024, 64, 64, 64, 1024] + - [57, 10361.0] + - - [1024, 64, 64, 1024, 1024, 1024, 1024, 64] + - [101, 10114.0] + - - [64, 1024, 64, 1024, 64, 64, 64, 1024] + - [101, 9928.0] + - - [1024, 64, 128, 1024, 1024, 1024, 1024, 64] + - [81, 10293.0] + - - [64, 1024, 128, 1024, 64, 64, 64, 1024] + - [92, 9234.0] + - - [1024, 1024, 1, 64, 1024, 1024, 1024, 1024] + - [57, 6779.0] + - - [64, 128, 1024, 128, 64, 64, 64, 128] + - [67, 6174.0] + - - [128, 64, 1024, 128, 128, 128, 128, 64] + - [70, 5707.0] + - - [1024, 1024, 1, 3456, 1024, 1024, 1024, 1024] + - [72, 10643.0] + - - [1024, 1024, 1, 864, 1024, 1024, 1024, 1024] + - [81, 10401.0] + - - [1024, 512, 1, 3456, 1024, 1024, 1024, 512] + - [104, 9204.0] + - - [1024, 512, 1, 864, 1024, 1024, 1024, 512] + - [82, 8698.0] + - - [256, 3456, 1, 1, 256, 256, 256, 3456] + - [107, 204.0] + - - [256, 4096, 1, 1, 256, 256, 256, 4096] + - [112, 219.0] + - - [480, 1024, 1, 3456, 480, 480, 480, 1024] + - [82, 8571.0] + - - [480, 1024, 1, 864, 480, 480, 480, 1024] + - [60, 8092.0] + - - [64, 128, 1280, 128, 64, 64, 64, 128] + - [89, 6021.0] + - - [128, 64, 1280, 128, 128, 128, 128, 64] + - [102, 5559.0] + - - [1024, 1024, 1, 82, 1024, 1024, 1024, 1024] + - [72, 7250.0] + - - [128, 64, 1312, 128, 128, 128, 128, 64] + - [61, 5541.0] + - - [64, 128, 1312, 128, 64, 64, 64, 128] + - [111, 5919.0] + - - [1024, 1024, 1, 12, 1024, 1024, 1024, 1024] + - [73, 2133.0] + - - [1024, 1024, 1, 6144, 1024, 1024, 1024, 1024] + - [114, 10728.0] + - - [64, 512, 192, 512, 64, 64, 64, 512] + - [57, 9117.0] + - - [512, 64, 192, 512, 512, 512, 512, 64] + - [101, 9212.0] + - - [3136, 64, 64, 128, 3136, 3136, 3136, 64] + - [62, 7623.0] + - - [3136, 64, 32, 128, 3136, 3136, 3136, 64] + - [70, 11200.0] + - - [196, 2304, 1, 256, 196, 196, 196, 2304] + - [57, 6169.0] + - - [784, 1152, 1, 128, 784, 784, 784, 1152] + - [57, 7940.0] + - - [64, 128, 2048, 128, 64, 64, 64, 128] + - [89, 5810.0] + - - [128, 64, 2048, 128, 128, 128, 128, 64] + - [79, 5484.0] + - - [128, 64, 1536, 128, 128, 128, 128, 64] + - [62, 5528.0] + - - [64, 128, 1536, 128, 64, 64, 64, 128] + - [111, 5867.0] + - - [1024, 1024, 1, 96, 1024, 1024, 1024, 1024] + - [59, 7989.0] + - - [92416, 64, 25, 64, 92416, 92416, 92416, 64] + - [117, 5125.0] + - - [50176, 64, 36, 64, 50176, 50176, 50176, 64] + - [102, 5152.0] + - - [36864, 64, 49, 64, 36864, 36864, 36864, 64] + - [59, 5246.0] + - - [25600, 64, 64, 64, 25600, 25600, 25600, 64] + - [86, 5191.0] + - - [64, 128, 192, 128, 64, 64, 64, 128] + - [57, 8924.0] + - - [128, 64, 192, 128, 128, 128, 128, 64] + - [81, 9143.0] + - - [768, 768, 1, 2048, 768, 768, 768, 768] + - [104, 10153.0] + - - [64, 384, 144, 384, 64, 64, 64, 384] + - [57, 10952.0] + - - [384, 64, 144, 384, 384, 384, 384, 64] + - [114, 11323.0] + - - [768, 768, 1, 4608, 768, 768, 768, 768] + - [108, 10354.0] + - - [64, 512, 48, 512, 64, 64, 64, 512] + - [70, 10437.0] + - - [512, 64, 48, 512, 512, 512, 512, 64] + - [103, 10746.0] + - - [64, 128, 256, 128, 64, 64, 64, 128] + - [57, 9957.0] + - - [128, 64, 256, 128, 128, 128, 128, 64] + - [72, 9876.0] + - - [64, 384, 192, 384, 64, 64, 64, 384] + - [96, 8197.0] + - - [384, 64, 192, 384, 384, 384, 384, 64] + - [114, 8588.0] + - - [1024, 1024, 1, 4608, 1024, 1024, 1024, 1024] + - [81, 10695.0] + - - [768, 512, 2, 2048, 768, 768, 768, 512] + - [59, 10750.0] + - - [713, 512, 2, 2048, 713, 713, 713, 512] + - [103, 9905.0] + - - [672, 512, 2, 2048, 672, 672, 672, 512] + - [59, 9348.0] + - - [660, 512, 2, 2048, 660, 660, 660, 512] + - [94, 9145.0] + - - [726, 512, 2, 2048, 726, 726, 726, 512] + - [59, 10102.0] + - - [1008, 512, 2, 2048, 1008, 1008, 1008, 512] + - [103, 10354.0] + - - [748, 512, 2, 2048, 748, 748, 748, 512] + - [59, 10369.0] + - - [864, 512, 2, 2048, 864, 864, 864, 512] + - [65, 10448.0] + - - [888, 512, 2, 2048, 888, 888, 888, 512] + - [65, 10701.0] + - - [805, 512, 2, 2048, 805, 805, 805, 512] + - [97, 9739.0] + - - [850, 512, 2, 2048, 850, 850, 850, 512] + - [65, 10259.0] + - - [840, 512, 2, 2048, 840, 840, 840, 512] + - [88, 10143.0] + - - [850, 256, 2, 3, 850, 850, 850, 256] + - [87, 427.0] + - - [805, 256, 2, 12, 805, 805, 805, 256] + - [99, 1389.0] + - - [805, 256, 2, 3, 805, 805, 805, 256] + - [107, 409.0] + - - [850, 256, 2, 12, 850, 850, 850, 256] + - [99, 1451.0] + - - [768, 256, 2, 12, 768, 768, 768, 256] + - [99, 1447.0] + - - [864, 256, 2, 3, 864, 864, 864, 256] + - [99, 495.0] + - - [950, 256, 2, 12, 950, 950, 950, 256] + - [64, 1504.0] + - - [864, 256, 2, 12, 864, 864, 864, 256] + - [99, 1649.0] + - - [950, 256, 2, 3, 950, 950, 950, 256] + - [64, 434.0] + - - [768, 256, 2, 3, 768, 768, 768, 256] + - [77, 415.0] + - - [1024, 320, 1, 1024, 1024, 1024, 1024, 320] + - [59, 9079.0] + - - [96, 1024, 160, 1024, 96, 96, 96, 1024] + - [76, 9330.0] + - - [1024, 96, 160, 1024, 1024, 1024, 1024, 96] + - [65, 9321.0] + - - [96, 1024, 40, 1024, 96, 96, 96, 1024] + - [88, 9170.0] + - - [1024, 96, 40, 1024, 1024, 1024, 1024, 96] + - [88, 9084.0] + - - [96, 1024, 80, 1024, 96, 96, 96, 1024] + - [88, 9297.0] + - - [1024, 96, 80, 1024, 1024, 1024, 1024, 96] + - [108, 9248.0] + - - [96, 1024, 96, 1024, 96, 96, 96, 1024] + - [88, 9187.0] + - - [1024, 96, 96, 1024, 1024, 1024, 1024, 96] + - [108, 9153.0] + - - [96, 1024, 24, 1024, 96, 96, 96, 1024] + - [76, 8741.0] + - - [1024, 96, 24, 1024, 1024, 1024, 1024, 96] + - [65, 8651.0] + - - [96, 1024, 48, 1024, 96, 96, 96, 1024] + - [59, 8902.0] + - - [1024, 96, 48, 1024, 1024, 1024, 1024, 96] + - [59, 8914.0] + - - [96, 1024, 16, 1024, 96, 96, 96, 1024] + - [59, 8651.0] + - - [1024, 96, 16, 1024, 1024, 1024, 1024, 96] + - [59, 8636.0] + - - [96, 1024, 32, 1024, 96, 96, 96, 1024] + - [116, 8983.0] + - - [1024, 96, 32, 1024, 1024, 1024, 1024, 96] + - [108, 8903.0] + - - [512, 64, 320, 512, 512, 512, 512, 64] + - [101, 9185.0] + - - [64, 512, 320, 512, 64, 64, 64, 512] + - [101, 9021.0] + - - [512, 64, 80, 512, 512, 512, 512, 64] + - [59, 11452.0] + - - [1024, 64, 512, 1024, 1024, 1024, 1024, 64] + - [103, 10215.0] + - - [64, 1024, 512, 1024, 64, 64, 64, 1024] + - [101, 9840.0] + - - [64, 64, 64, 13216, 64, 64, 64, 64] + - [70, 5947.0] + - - [64, 96, 36, 10368, 64, 64, 64, 96] + - [73, 6517.0] + - - [64, 64, 36, 12544, 64, 64, 64, 64] + - [122, 6032.0] + - - [64, 64, 36, 11552, 64, 64, 64, 64] + - [127, 6155.0] + - - [1024, 256, 1, 10496, 1024, 1024, 1024, 256] + - [123, 10218.0] + - - [1024, 256, 1, 11520, 1024, 1024, 1024, 256] + - [123, 10266.0] + - - [1024, 256, 1, 12032, 1024, 1024, 1024, 256] + - [123, 10293.0] + - - [1024, 256, 1, 13568, 1024, 1024, 1024, 256] + - [125, 10362.0] + - - [1024, 256, 1, 14336, 1024, 1024, 1024, 256] + - [125, 10347.0] + - - [1024, 256, 1, 14848, 1024, 1024, 1024, 256] + - [119, 10304.0] + - - [1024, 256, 1, 15104, 1024, 1024, 1024, 256] + - [123, 10373.0] + - - [1024, 256, 1, 15872, 1024, 1024, 1024, 256] + - [119, 10387.0] + - - [1024, 256, 1, 16128, 1024, 1024, 1024, 256] + - [121, 10375.0] + - - [1024, 256, 1, 17152, 1024, 1024, 1024, 256] + - [126, 10363.0] + - - [1024, 256, 1, 17408, 1024, 1024, 1024, 256] + - [126, 10376.0] + - - [1024, 256, 1, 18944, 1024, 1024, 1024, 256] + - [124, 10388.0] + - - [1024, 256, 1, 19712, 1024, 1024, 1024, 256] + - [126, 10409.0] + - - [1024, 256, 1, 19968, 1024, 1024, 1024, 256] + - [124, 10417.0] + - - [1024, 256, 1, 8192, 1024, 1024, 1024, 256] + - [125, 10041.0] + - - [1024, 256, 1, 8448, 1024, 1024, 1024, 256] + - [125, 10076.0] + - - [1024, 256, 1, 9728, 1024, 1024, 1024, 256] + - [123, 10184.0] + - - [1024, 256, 1, 9984, 1024, 1024, 1024, 256] + - [125, 10176.0] + - - [512, 256, 1, 32768, 512, 512, 512, 256] + - [128, 9134.0] + - - [256, 128, 1, 55296, 256, 256, 256, 128] + - [120, 6241.0] + - - [512, 512, 1, 200, 512, 512, 512, 512] + - [181, 4504.0] + - - [1024, 128, 1, 128, 1024, 1024, 1024, 128] + - [165, 2954.0] + - - [2368, 64, 1, 3328, 2368, 2368, 2368, 64] + - [217, 5735.0] + - - [1408, 64, 1, 128, 1408, 1408, 1408, 64] + - [181, 2262.0] + - - [1408, 64, 1, 1280, 1408, 1408, 1408, 64] + - [214, 4114.0] + - - [2944, 64, 1, 256, 2944, 2944, 2944, 64] + - [132, 4269.0] + - - [1856, 64, 1, 1280, 1856, 1856, 1856, 64] + - [132, 5168.0] + - - [704, 128, 1, 1280, 704, 704, 704, 128] + - [162, 4096.0] + - - [4288, 64, 1, 3328, 4288, 4288, 4288, 64] + - [143, 6086.0] + - - [4288, 64, 1, 256, 4288, 4288, 4288, 64] + - [138, 4906.0] + - - [64, 3584, 1, 3328, 64, 64, 64, 3584] + - [145, 5893.0] + - - [704, 256, 1, 128, 704, 704, 704, 256] + - [132, 3443.0] + - - [128, 1408, 1, 128, 128, 128, 128, 1408] + - [164, 3474.0] + - - [4288, 64, 1, 1280, 4288, 4288, 4288, 64] + - [206, 5755.0] + - - [1024, 256, 1, 256, 1024, 1024, 1024, 256] + - [202, 4733.0] + - - [448, 448, 1, 256, 448, 448, 448, 448] + - [164, 4588.0] + - - [128, 1024, 1, 3328, 128, 128, 128, 1024] + - [143, 5041.0] + - - [64, 1856, 1, 1280, 64, 64, 64, 1856] + - [132, 5144.0] + - - [256, 1024, 1, 256, 256, 256, 256, 1024] + - [202, 4726.0] + - - [1024, 128, 1, 1280, 1024, 1024, 1024, 128] + - [181, 4745.0] + - - [448, 256, 1, 3328, 448, 448, 448, 256] + - [143, 5465.0] + - - [128, 1024, 1, 128, 128, 128, 128, 1024] + - [164, 2943.0] + - - [128, 704, 1, 1280, 128, 128, 128, 704] + - [194, 4216.0] + - - [1856, 128, 1, 3328, 1856, 1856, 1856, 128] + - [145, 6104.0] + - - [64, 2944, 1, 128, 64, 64, 64, 2944] + - [164, 3578.0] + - - [448, 448, 1, 3328, 448, 448, 448, 448] + - [138, 5622.0] + - - [1408, 128, 1, 1280, 1408, 1408, 1408, 128] + - [169, 4875.0] + - - [128, 1856, 1, 1280, 128, 128, 128, 1856] + - [208, 5775.0] + - - [256, 448, 1, 256, 256, 256, 256, 448] + - [181, 3774.0] + - - [128, 1856, 1, 128, 128, 128, 128, 1856] + - [213, 4143.0] + - - [64, 1408, 1, 3328, 64, 64, 64, 1408] + - [194, 4373.0] + - - [128, 1408, 1, 256, 128, 128, 128, 1408] + - [164, 4112.0] + - - [4288, 64, 1, 128, 4288, 4288, 4288, 64] + - [132, 4222.0] + - - [256, 448, 1, 3328, 256, 256, 256, 448] + - [206, 5451.0] + - - [64, 2368, 1, 1280, 64, 64, 64, 2368] + - [175, 5263.0] + - - [2368, 64, 1, 256, 2368, 2368, 2368, 64] + - [165, 4136.0] + - - [1408, 128, 1, 128, 1408, 1408, 1408, 128] + - [132, 3495.0] + - - [1024, 256, 1, 128, 1024, 1024, 1024, 256] + - [132, 4132.0] + - - [2944, 64, 1, 128, 2944, 2944, 2944, 64] + - [196, 3557.0] + - - [1856, 64, 1, 256, 1856, 1856, 1856, 64] + - [196, 3879.0] + - - [704, 128, 1, 256, 704, 704, 704, 128] + - [196, 3004.0] + - - [448, 256, 1, 1280, 448, 448, 448, 256] + - [132, 4993.0] + - - [1856, 128, 1, 1280, 1856, 1856, 1856, 128] + - [208, 5792.0] + - - [64, 3584, 1, 256, 64, 64, 64, 3584] + - [198, 4616.0] + - - [3584, 64, 1, 128, 3584, 3584, 3584, 64] + - [132, 3915.0] + - - [256, 1024, 1, 1280, 256, 256, 256, 1024] + - [143, 5570.0] + - - [3584, 64, 1, 1280, 3584, 3584, 3584, 64] + - [145, 5605.0] + - - [128, 1856, 1, 3328, 128, 128, 128, 1856] + - [160, 6088.0] + - - [64, 2944, 1, 3328, 64, 64, 64, 2944] + - [214, 5225.0] + - - [64, 4288, 1, 3328, 64, 64, 64, 4288] + - [206, 6098.0] + - - [64, 1856, 1, 256, 64, 64, 64, 1856] + - [196, 3879.0] + - - [256, 704, 1, 256, 256, 256, 256, 704] + - [132, 4127.0] + - - [2368, 64, 1, 128, 2368, 2368, 2368, 64] + - [165, 3380.0] + - - [64, 1408, 1, 128, 64, 64, 64, 1408] + - [132, 2244.0] + - - [704, 256, 1, 3328, 704, 704, 704, 256] + - [186, 5075.0] + - - [64, 2944, 1, 256, 64, 64, 64, 2944] + - [132, 4130.0] + - - [448, 256, 1, 128, 448, 448, 448, 256] + - [181, 2856.0] + - - [704, 128, 1, 3328, 704, 704, 704, 128] + - [194, 4383.0] + - - [128, 704, 1, 128, 128, 128, 128, 704] + - [149, 2280.0] + - - [256, 448, 1, 1280, 256, 256, 256, 448] + - [164, 4963.0] + - - [704, 256, 1, 1280, 704, 704, 704, 256] + - [202, 4902.0] + - - [64, 2368, 1, 3328, 64, 64, 64, 2368] + - [143, 5738.0] + - - [1856, 64, 1, 128, 1856, 1856, 1856, 64] + - [181, 2947.0] + - - [704, 128, 1, 128, 704, 704, 704, 128] + - [181, 2227.0] + - - [256, 704, 1, 3328, 256, 256, 256, 704] + - [169, 5072.0] + - - [256, 448, 1, 128, 256, 256, 256, 448] + - [132, 2749.0] + - - [64, 3584, 1, 128, 64, 64, 64, 3584] + - [134, 3989.0] + - - [1024, 128, 1, 256, 1024, 1024, 1024, 128] + - [132, 3695.0] + - - [2944, 64, 1, 1280, 2944, 2944, 2944, 64] + - [169, 5103.0] + - - [128, 1408, 1, 3328, 128, 128, 128, 1408] + - [138, 5091.0] + - - [1408, 64, 1, 256, 1408, 1408, 1408, 64] + - [132, 2958.0] + - - [64, 1856, 1, 128, 64, 64, 64, 1856] + - [132, 2947.0] + - - [64, 2368, 1, 256, 64, 64, 64, 2368] + - [198, 4163.0] + - - [1024, 128, 1, 3328, 1024, 1024, 1024, 128] + - [143, 5024.0] + - - [1856, 128, 1, 128, 1856, 1856, 1856, 128] + - [134, 4259.0] + - - [2368, 64, 1, 1280, 2368, 2368, 2368, 64] + - [143, 5277.0] + - - [128, 1024, 1, 1280, 128, 128, 128, 1024] + - [156, 4707.0] + - - [64, 4288, 1, 1280, 64, 64, 64, 4288] + - [158, 5778.0] + - - [1408, 64, 1, 3328, 1408, 1408, 1408, 64] + - [214, 4411.0] + - - [64, 2944, 1, 1280, 64, 64, 64, 2944] + - [202, 5035.0] + - - [256, 704, 1, 128, 256, 256, 256, 704] + - [164, 3443.0] + - - [256, 1024, 1, 128, 256, 256, 256, 1024] + - [164, 4143.0] + - - [64, 1408, 1, 1280, 64, 64, 64, 1408] + - [194, 4140.0] + - - [448, 448, 1, 1280, 448, 448, 448, 448] + - [138, 5406.0] + - - [128, 1024, 1, 256, 128, 128, 128, 1024] + - [181, 3704.0] + - - [3584, 64, 1, 3328, 3584, 3584, 3584, 64] + - [208, 5887.0] + - - [1408, 128, 1, 256, 1408, 1408, 1408, 128] + - [164, 4134.0] + - - [256, 1024, 1, 3328, 256, 256, 256, 1024] + - [206, 5828.0] + - - [1856, 64, 1, 3328, 1856, 1856, 1856, 64] + - [143, 5612.0] + - - [448, 256, 1, 256, 448, 448, 448, 256] + - [132, 3813.0] + - - [128, 704, 1, 256, 128, 128, 128, 704] + - [149, 2988.0] + - - [64, 3584, 1, 1280, 64, 64, 64, 3584] + - [208, 5605.0] + - - [3584, 64, 1, 256, 3584, 3584, 3584, 64] + - [165, 4573.0] + - - [64, 1856, 1, 3328, 64, 64, 64, 1856] + - [143, 5630.0] + - - [1408, 128, 1, 3328, 1408, 1408, 1408, 128] + - [169, 5083.0] + - - [128, 704, 1, 3328, 128, 128, 128, 704] + - [216, 4489.0] + - - [128, 1856, 1, 256, 128, 128, 128, 1856] + - [198, 4766.0] + - - [64, 4288, 1, 256, 64, 64, 64, 4288] + - [202, 4872.0] + - - [256, 704, 1, 1280, 256, 256, 256, 704] + - [202, 4942.0] + - - [64, 2368, 1, 128, 64, 64, 64, 2368] + - [134, 3356.0] + - - [64, 4288, 1, 128, 64, 64, 64, 4288] + - [164, 4202.0] + - - [1856, 128, 1, 256, 1856, 1856, 1856, 128] + - [134, 4928.0] + - - [64, 1408, 1, 256, 64, 64, 64, 1408] + - [162, 3004.0] + - - [2944, 64, 1, 3328, 2944, 2944, 2944, 64] + - [169, 5295.0] + - - [128, 1408, 1, 1280, 128, 128, 128, 1408] + - [154, 4902.0] + - - [448, 448, 1, 128, 448, 448, 448, 448] + - [132, 3823.0] + - - [704, 256, 1, 256, 704, 704, 704, 256] + - [196, 4149.0] + - - [49, 512, 128, 2048, 49, 49, 49, 512] + - [218, 5175.0] + - - [49, 2048, 128, 512, 49, 49, 49, 2048] + - [160, 5251.0] + - - [49, 2048, 256, 512, 49, 49, 49, 2048] + - [218, 5258.0] + - - [49, 512, 256, 2048, 49, 49, 49, 512] + - [218, 5242.0] + - - [64, 38, 1680, 38, 64, 64, 64, 38] + - [132, 3622.0] + - - [64, 59, 1088, 59, 64, 64, 64, 59] + - [208, 5595.0] + - - [64, 32, 1984, 32, 64, 64, 64, 32] + - [214, 4989.0] + - - [64, 54, 1184, 54, 64, 64, 64, 54] + - [132, 5134.0] + - - [64, 49, 1296, 49, 64, 64, 64, 49] + - [208, 4589.0] + - - [64, 45, 1424, 45, 64, 64, 64, 45] + - [138, 4292.0] + - - [64, 35, 1808, 35, 64, 64, 64, 35] + - [181, 3263.0] + - - [64, 41, 1552, 41, 64, 64, 64, 41] + - [202, 3858.0] + - - [64, 64, 36, 3136, 64, 64, 64, 64] + - [158, 5538.0] + - - [64, 64, 64, 826, 64, 64, 64, 64] + - [202, 5271.0] + - - [64, 64, 64, 1600, 64, 64, 64, 64] + - [191, 5627.0] + - - [64, 96, 64, 288, 64, 64, 64, 96] + - [169, 5337.0] + - - [96, 96, 36, 1568, 96, 96, 96, 96] + - [136, 5732.0] + - - [96, 96, 36, 2592, 96, 96, 96, 96] + - [184, 5810.0] + - - [64, 96, 64, 800, 64, 64, 64, 96] + - [138, 5884.0] + - - [35, 96, 36, 8960, 35, 35, 35, 96] + - [191, 3031.0] + - - [32, 64, 36, 43808, 32, 32, 32, 64] + - [156, 3773.0] + - - [64, 64, 64, 81, 64, 64, 64, 64] + - [132, 3481.0] + - - [64, 96, 36, 512, 64, 64, 64, 96] + - [154, 4778.0] + - - [64, 64, 64, 3200, 64, 64, 64, 64] + - [158, 5796.0] + - - [64, 64, 36, 3520, 64, 64, 64, 64] + - [158, 5554.0] + - - [64, 64, 64, 5408, 64, 64, 64, 64] + - [154, 5457.0] + - - [35, 96, 36, 13440, 35, 35, 35, 96] + - [143, 3039.0] + - - [96, 96, 64, 1152, 96, 96, 96, 96] + - [189, 5811.0] + - - [32, 64, 36, 90, 32, 32, 32, 64] + - [163, 1634.0] + - - [64, 64, 64, 800, 64, 64, 64, 64] + - [154, 5248.0] + - - [64, 64, 36, 1568, 64, 64, 64, 64] + - [143, 5226.0] + - - [64, 64, 36, 196, 64, 64, 64, 64] + - [150, 3705.0] + - - [35, 96, 64, 4235, 35, 35, 35, 96] + - [191, 3311.0] + - - [149, 32, 36, 19072, 149, 149, 149, 32] + - [200, 4626.0] + - - [64, 96, 36, 1568, 64, 64, 64, 96] + - [143, 5285.0] + - - [96, 96, 64, 800, 96, 96, 96, 96] + - [173, 5549.0] + - - [32, 64, 64, 640, 32, 32, 32, 64] + - [200, 4025.0] + - - [64, 64, 36, 392, 64, 64, 64, 64] + - [182, 4333.0] + - - [64, 64, 64, 1652, 64, 64, 64, 64] + - [191, 5640.0] + - - [64, 96, 36, 2592, 64, 64, 64, 96] + - [158, 5456.0] + - - [64, 64, 36, 6272, 64, 64, 64, 64] + - [191, 5752.0] + - - [32, 64, 64, 20000, 32, 32, 32, 64] + - [210, 3785.0] + - - [64, 64, 64, 648, 64, 64, 64, 64] + - [169, 5157.0] + - - [32, 64, 36, 1440, 32, 32, 32, 64] + - [200, 4037.0] + - - [64, 64, 64, 100, 64, 64, 64, 64] + - [132, 3703.0] + - - [64, 96, 64, 4608, 64, 64, 64, 96] + - [158, 5963.0] + - - [64, 64, 64, 200, 64, 64, 64, 64] + - [132, 4413.0] + - - [32, 64, 64, 40, 32, 32, 32, 64] + - [146, 1670.0] + - - [64, 96, 64, 1152, 64, 64, 64, 96] + - [214, 6000.0] + - - [149, 32, 64, 8195, 149, 149, 149, 32] + - [130, 4756.0] + - - [35, 96, 64, 6160, 35, 35, 35, 96] + - [206, 3355.0] + - - [64, 64, 36, 1760, 64, 64, 64, 64] + - [175, 5286.0] + - - [64, 2880, 1, 320, 64, 64, 64, 2880] + - [132, 4225.0] + - - [49, 832, 32, 256, 49, 49, 49, 832] + - [193, 4874.0] + - - [289, 1120, 1, 160, 289, 289, 289, 1120] + - [138, 4591.0] + - - [64, 1728, 1, 320, 64, 64, 64, 1728] + - [149, 3838.0] + - - [49, 832, 32, 160, 49, 49, 49, 832] + - [138, 4742.0] + - - [49, 832, 32, 384, 49, 49, 49, 832] + - [145, 4974.0] + - - [289, 896, 1, 192, 289, 289, 289, 896] + - [132, 4431.0] + - - [289, 896, 1, 128, 289, 289, 289, 896] + - [132, 4062.0] + - - [196, 800, 1, 64, 196, 196, 196, 800] + - [132, 2091.0] + - - [64, 1344, 1, 512, 64, 64, 64, 1344] + - [164, 3518.0] + - - [64, 1152, 1, 384, 64, 64, 64, 1152] + - [132, 2943.0] + - - [64, 1152, 1, 448, 64, 64, 64, 1152] + - [152, 3075.0] + - - [49, 832, 32, 128, 49, 49, 49, 832] + - [140, 4677.0] + - - [49, 832, 32, 48, 49, 49, 49, 832] + - [194, 4071.0] + - - [64, 1152, 1, 256, 64, 64, 64, 1152] + - [147, 2564.0] + - - [49, 832, 32, 32, 49, 49, 49, 832] + - [181, 3491.0] + - - [289, 1120, 1, 192, 289, 289, 289, 1120] + - [202, 4701.0] + - - [196, 600, 1, 64, 196, 196, 196, 600] + - [135, 1836.0] + - - [49, 832, 32, 192, 49, 49, 49, 832] + - [202, 4806.0] + - - [64, 1728, 1, 192, 64, 64, 64, 1728] + - [132, 3349.0] + - - [64, 38, 840, 38, 64, 64, 64, 38] + - [132, 3463.0] + - - [64, 49, 648, 49, 64, 64, 64, 49] + - [196, 4371.0] + - - [64, 32, 992, 32, 64, 64, 64, 32] + - [186, 4294.0] + - - [64, 35, 904, 35, 64, 64, 64, 35] + - [132, 3139.0] + - - [64, 41, 776, 41, 64, 64, 64, 41] + - [132, 3668.0] + - - [64, 45, 712, 45, 64, 64, 64, 45] + - [132, 4076.0] + - - [64, 54, 592, 54, 64, 64, 64, 54] + - [132, 4863.0] + - - [64, 59, 544, 59, 64, 64, 64, 59] + - [132, 5302.0] + - - [49, 512, 64, 2048, 49, 49, 49, 512] + - [208, 5121.0] + - - [49, 2048, 64, 512, 49, 49, 49, 2048] + - [218, 5194.0] + - - [33, 32, 1600, 33, 33, 33, 33, 32] + - [132, 2755.0] + - - [33, 32, 200, 33, 33, 33, 33, 32] + - [132, 1361.0] + - - [67, 2048, 1, 512, 67, 67, 67, 2048] + - [136, 3342.0] + - - [512, 512, 1, 3780, 512, 512, 512, 512] + - [143, 5881.0] + - - [512, 512, 1, 3796, 512, 512, 512, 512] + - [206, 5869.0] + - - [512, 512, 1, 3822, 512, 512, 512, 512] + - [143, 5861.0] + - - [512, 512, 1, 3840, 512, 512, 512, 512] + - [143, 5867.0] + - - [512, 512, 1, 3859, 512, 512, 512, 512] + - [143, 5858.0] + - - [512, 512, 1, 3870, 512, 512, 512, 512] + - [206, 5861.0] + - - [512, 512, 1, 3876, 512, 512, 512, 512] + - [143, 5870.0] + - - [512, 512, 1, 3906, 512, 512, 512, 512] + - [175, 5873.0] + - - [512, 512, 1, 3910, 512, 512, 512, 512] + - [143, 5876.0] + - - [512, 512, 1, 3925, 512, 512, 512, 512] + - [206, 5876.0] + - - [512, 512, 1, 3927, 512, 512, 512, 512] + - [206, 5870.0] + - - [512, 512, 1, 3942, 512, 512, 512, 512] + - [143, 5866.0] + - - [512, 512, 1, 3944, 512, 512, 512, 512] + - [143, 5879.0] + - - [512, 512, 1, 3955, 512, 512, 512, 512] + - [143, 5869.0] + - - [512, 512, 1, 3968, 512, 512, 512, 512] + - [175, 5874.0] + - - [512, 512, 1, 3969, 512, 512, 512, 512] + - [175, 5862.0] + - - [512, 512, 1, 3976, 512, 512, 512, 512] + - [143, 5876.0] + - - [512, 512, 1, 3977, 512, 512, 512, 512] + - [143, 5870.0] + - - [512, 512, 1, 3978, 512, 512, 512, 512] + - [143, 5866.0] + - - [512, 512, 1, 3990, 512, 512, 512, 512] + - [175, 5879.0] + - - [512, 512, 1, 3995, 512, 512, 512, 512] + - [143, 5880.0] + - - [512, 512, 1, 3996, 512, 512, 512, 512] + - [175, 5877.0] + - - [512, 512, 1, 3999, 512, 512, 512, 512] + - [143, 5864.0] + - - [512, 512, 1, 4005, 512, 512, 512, 512] + - [175, 5873.0] + - - [512, 512, 1, 4012, 512, 512, 512, 512] + - [206, 5872.0] + - - [512, 512, 1, 4020, 512, 512, 512, 512] + - [143, 5875.0] + - - [512, 512, 1, 4026, 512, 512, 512, 512] + - [143, 5858.0] + - - [512, 512, 1, 4030, 512, 512, 512, 512] + - [175, 5867.0] + - - [512, 512, 1, 4032, 512, 512, 512, 512] + - [143, 5881.0] + - - [512, 512, 1, 4050, 512, 512, 512, 512] + - [206, 5860.0] + - - [512, 512, 1, 4059, 512, 512, 512, 512] + - [206, 5869.0] + - - [384, 384, 1, 384, 384, 384, 384, 384] + - [165, 4480.0] + - - [384, 192, 1, 384, 384, 384, 384, 192] + - [212, 3160.0] + - - [1024, 256, 1, 1024, 1024, 1024, 1024, 256] + - [143, 5397.0] + - - [1024, 256, 1, 1280, 1024, 1024, 1024, 256] + - [143, 5543.0] + - - [1024, 256, 1, 2304, 1024, 1024, 1024, 256] + - [143, 5754.0] + - - [1024, 256, 1, 2816, 1024, 1024, 1024, 256] + - [143, 5802.0] + - - [1024, 256, 1, 3072, 1024, 1024, 1024, 256] + - [143, 5820.0] + - - [1024, 256, 1, 3328, 1024, 1024, 1024, 256] + - [206, 5839.0] + - - [1024, 256, 1, 3584, 1024, 1024, 1024, 256] + - [175, 5854.0] + - - [1024, 256, 1, 4096, 1024, 1024, 1024, 256] + - [143, 5880.0] + - - [1024, 256, 1, 4352, 1024, 1024, 1024, 256] + - [206, 5879.0] + - - [1024, 256, 1, 4608, 1024, 1024, 1024, 256] + - [143, 5898.0] + - - [1024, 256, 1, 5120, 1024, 1024, 1024, 256] + - [143, 5910.0] + - - [1024, 256, 1, 5376, 1024, 1024, 1024, 256] + - [143, 5921.0] + - - [1024, 256, 1, 5632, 1024, 1024, 1024, 256] + - [158, 5925.0] + - - [1024, 256, 1, 6144, 1024, 1024, 1024, 256] + - [143, 5940.0] + - - [1024, 256, 1, 6400, 1024, 1024, 1024, 256] + - [175, 5954.0] + - - [1024, 256, 1, 7680, 1024, 1024, 1024, 256] + - [143, 5973.0] + - - [1024, 256, 1, 7936, 1024, 1024, 1024, 256] + - [175, 5961.0] + - - [512, 512, 1, 1600, 512, 512, 512, 512] + - [206, 5647.0] + - - [100, 2048, 1, 512, 100, 100, 100, 2048] + - [138, 4033.0] + - - [74, 2048, 1, 512, 74, 74, 74, 2048] + - [130, 3664.0] + - - [74, 2048, 1, 960, 74, 74, 74, 2048] + - [167, 3839.0] + - - [768, 128, 1, 128, 768, 768, 768, 128] + - [196, 2411.0] + - - [1152, 128, 1, 128, 1152, 1152, 1152, 128] + - [198, 3265.0] + - - [1536, 128, 1, 128, 1536, 1536, 1536, 128] + - [181, 3723.0] + - - [1920, 128, 1, 128, 1920, 1920, 1920, 128] + - [134, 4369.0] + - - [768, 128, 1, 256, 768, 768, 768, 128] + - [164, 3243.0] + - - [1152, 128, 1, 256, 1152, 1152, 1152, 128] + - [134, 4139.0] + - - [1536, 128, 1, 256, 1536, 1536, 1536, 128] + - [132, 4494.0] + - - [1920, 128, 1, 256, 1920, 1920, 1920, 128] + - [134, 5025.0] + - - [448, 448, 1, 448, 448, 448, 448, 448] + - [164, 4973.0] + - - [1225, 32, 64, 192, 1225, 1225, 1225, 32] + - [169, 6138.0] + - - [1225, 48, 64, 192, 1225, 1225, 1225, 48] + - [154, 4753.0] + - - [1225, 48, 64, 256, 1225, 1225, 1225, 48] + - [208, 4809.0] + - - [1225, 48, 64, 288, 1225, 1225, 1225, 48] + - [145, 4805.0] + - - [1225, 32, 32, 192, 1225, 1225, 1225, 32] + - [138, 5871.0] + - - [1225, 48, 32, 192, 1225, 1225, 1225, 48] + - [202, 4635.0] + - - [1225, 48, 32, 256, 1225, 1225, 1225, 48] + - [145, 4698.0] + - - [1225, 48, 32, 288, 1225, 1225, 1225, 48] + - [208, 4695.0] + - - [49, 2048, 32, 512, 49, 49, 49, 2048] + - [169, 5109.0] + - - [49, 512, 32, 2048, 49, 49, 49, 512] + - [177, 5066.0] + - - [512, 256, 1, 4096, 512, 512, 512, 256] + - [217, 5067.0] + - - [512, 256, 1, 6912, 512, 512, 512, 256] + - [173, 5373.0] + - - [100, 2304, 1, 512, 100, 100, 100, 2304] + - [208, 4099.0] + - - [480, 512, 1, 512, 480, 480, 480, 512] + - [130, 5132.0] + - - [512, 480, 1, 512, 512, 512, 512, 480] + - [138, 5368.0] + - - [512, 512, 1, 512, 512, 512, 512, 512] + - [138, 5135.0] + - - [32, 64, 4608, 32, 32, 32, 32, 64] + - [210, 4765.0] + - - [32, 64, 4608, 35, 32, 32, 32, 64] + - [162, 4915.0] + - - [34, 64, 4736, 24, 34, 34, 34, 64] + - [187, 3228.0] + - - [34, 64, 4736, 34, 34, 34, 34, 64] + - [194, 3285.0] + - - [35, 64, 4608, 35, 35, 35, 35, 64] + - [179, 3349.0] + - - [64, 32, 4608, 32, 64, 64, 64, 32] + - [138, 4931.0] + - - [64, 32, 4608, 35, 64, 64, 64, 32] + - [164, 5385.0] + - - [64, 34, 4736, 24, 64, 64, 64, 34] + - [132, 3356.0] + - - [64, 34, 4736, 34, 64, 64, 64, 34] + - [164, 3326.0] + - - [64, 35, 4608, 35, 64, 64, 64, 35] + - [132, 3431.0] + - - [256, 864, 1, 1, 256, 256, 256, 864] + - [170, 104.0] + - - [512, 256, 1, 3456, 512, 512, 512, 256] + - [175, 5028.0] + - - [512, 256, 1, 864, 512, 512, 512, 256] + - [134, 4407.0] + - - [49, 1024, 64, 2048, 49, 49, 49, 1024] + - [145, 5169.0] + - - [49, 2048, 64, 1024, 49, 49, 49, 2048] + - [145, 5231.0] + - - [49, 1024, 32, 2048, 49, 49, 49, 1024] + - [208, 5120.0] + - - [49, 2048, 32, 1024, 49, 49, 49, 2048] + - [186, 5171.0] + - - [49, 4608, 1, 512, 49, 49, 49, 4608] + - [145, 3978.0] + - - [56, 512, 64, 512, 56, 56, 56, 512] + - [145, 5741.0] + - - [228, 256, 2, 12, 228, 228, 228, 256] + - [203, 594.0] + - - [228, 256, 2, 3, 228, 228, 228, 256] + - [135, 179.0] + - - [187, 256, 2, 12, 187, 187, 187, 256] + - [131, 532.0] + - - [247, 256, 2, 12, 247, 247, 247, 256] + - [131, 632.0] + - - [176, 256, 2, 3, 176, 176, 176, 256] + - [170, 161.0] + - - [187, 256, 2, 3, 187, 187, 187, 256] + - [170, 156.0] + - - [221, 256, 2, 3, 221, 221, 221, 256] + - [139, 177.0] + - - [221, 256, 2, 12, 221, 221, 221, 256] + - [137, 575.0] + - - [176, 256, 2, 12, 176, 176, 176, 256] + - [139, 535.0] + - - [247, 256, 2, 3, 247, 247, 247, 256] + - [144, 186.0] + - - [216, 256, 2, 3, 216, 216, 216, 256] + - [170, 180.0] + - - [192, 256, 2, 12, 192, 192, 192, 256] + - [140, 590.0] + - - [192, 256, 2, 3, 192, 192, 192, 256] + - [203, 173.0] + - - [216, 256, 2, 12, 216, 216, 216, 256] + - [131, 598.0] + - - [32, 32, 36, 43808, 32, 32, 32, 32] + - [223, 2997.0] + - - [32, 32, 64, 20000, 32, 32, 32, 32] + - [156, 2984.0] + - - [256, 128, 1, 32768, 256, 256, 256, 128] + - [230, 5119.0] + - - [3584, 4, 1, 1280, 3584, 3584, 3584, 4] + - [252, 1078.0] + - - [2944, 4, 1, 256, 2944, 2944, 2944, 4] + - [238, 527.0] + - - [2368, 4, 1, 1280, 2368, 2368, 2368, 4] + - [241, 799.0] + - - [6784, 4, 1, 1280, 6784, 6784, 6784, 4] + - [246, 1570.0] + - - [1856, 4, 1, 1280, 1856, 1856, 1856, 4] + - [248, 629.0] + - - [2944, 4, 1, 128, 2944, 2944, 2944, 4] + - [249, 357.0] + - - [3584, 4, 1, 128, 3584, 3584, 3584, 4] + - [245, 433.0] + - - [4288, 4, 1, 256, 4288, 4288, 4288, 4] + - [243, 725.0] + - - [3584, 4, 1, 3328, 3584, 3584, 3584, 4] + - [247, 1248.0] + - - [5888, 4, 1, 128, 5888, 5888, 5888, 4] + - [236, 704.0] + - - [2368, 4, 1, 256, 2368, 2368, 2368, 4] + - [137, 439.0] + - - [1408, 4, 1, 256, 1408, 1408, 1408, 4] + - [153, 266.0] + - - [5056, 4, 1, 1280, 5056, 5056, 5056, 4] + - [240, 1322.0] + - - [1408, 4, 1, 3328, 1408, 1408, 1408, 4] + - [241, 559.0] + - - [6784, 4, 1, 128, 6784, 6784, 6784, 4] + - [238, 762.0] + - - [5888, 4, 1, 3328, 5888, 5888, 5888, 4] + - [250, 1611.0] + - - [5056, 4, 1, 128, 5056, 5056, 5056, 4] + - [236, 608.0] + - - [5888, 4, 1, 1280, 5888, 5888, 5888, 4] + - [240, 1384.0] + - - [2944, 4, 1, 3328, 2944, 2944, 2944, 4] + - [240, 1044.0] + - - [2368, 4, 1, 128, 2368, 2368, 2368, 4] + - [249, 290.0] + - - [1856, 4, 1, 128, 1856, 1856, 1856, 4] + - [238, 232.0] + - - [1408, 4, 1, 1280, 1408, 1408, 1408, 4] + - [241, 483.0] + - - [6784, 4, 1, 256, 6784, 6784, 6784, 4] + - [238, 1085.0] + - - [4288, 4, 1, 128, 4288, 4288, 4288, 4] + - [236, 501.0] + - - [1856, 4, 1, 3328, 1856, 1856, 1856, 4] + - [248, 728.0] + - - [3584, 4, 1, 256, 3584, 3584, 3584, 4] + - [239, 618.0] + - - [2368, 4, 1, 3328, 2368, 2368, 2368, 4] + - [248, 911.0] + - - [6784, 4, 1, 3328, 6784, 6784, 6784, 4] + - [243, 1496.0] + - - [4288, 4, 1, 1280, 4288, 4288, 4288, 4] + - [240, 1185.0] + - - [1856, 4, 1, 256, 1856, 1856, 1856, 4] + - [137, 347.0] + - - [1408, 4, 1, 128, 1408, 1408, 1408, 4] + - [137, 178.0] + - - [5056, 4, 1, 256, 5056, 5056, 5056, 4] + - [243, 835.0] + - - [4288, 4, 1, 3328, 4288, 4288, 4288, 4] + - [250, 1466.0] + - - [2944, 4, 1, 1280, 2944, 2944, 2944, 4] + - [242, 906.0] + - - [5888, 4, 1, 256, 5888, 5888, 5888, 4] + - [238, 954.0] + - - [5056, 4, 1, 3328, 5056, 5056, 5056, 4] + - [253, 1702.0] + - - [2048, 1, 1, 512, 2048, 2048, 2048, 1] + - [244, 126.0] + - - [2048, 1, 1, 960, 2048, 2048, 2048, 1] + - [251, 159.0] + - - [2048, 2, 1, 2, 2048, 2048, 2048, 2] + - [237, 5.0] + - - [2560, 2, 1, 4, 2560, 2560, 2560, 2] + - [131, 12.0] + - - [2048, 2, 1, 8, 2048, 2048, 2048, 2] + - [168, 19.0] + - - [2560, 2, 1, 2, 2560, 2560, 2560, 2] + - [236, 6.0] + - - [4, 1856, 1, 3328, 4, 4, 4, 1856] + - [257, 735.0] + - - [4, 2944, 1, 1280, 4, 4, 4, 2944] + - [266, 867.0] + - - [4, 1408, 1, 128, 4, 4, 4, 1408] + - [146, 230.0] + - - [4, 2368, 1, 1280, 4, 4, 4, 2368] + - [268, 855.0] + - - [4, 3584, 1, 128, 4, 4, 4, 3584] + - [255, 527.0] + - - [4, 5888, 1, 3328, 4, 4, 4, 5888] + - [264, 1012.0] + - - [4, 1408, 1, 3328, 4, 4, 4, 1408] + - [263, 582.0] + - - [4, 6784, 1, 3328, 4, 4, 4, 6784] + - [269, 997.0] + - - [4, 4288, 1, 128, 4, 4, 4, 4288] + - [265, 600.0] + - - [4, 5056, 1, 3328, 4, 4, 4, 5056] + - [263, 1159.0] + - - [4, 6784, 1, 1280, 4, 4, 4, 6784] + - [266, 953.0] + - - [4, 2944, 1, 3328, 4, 4, 4, 2944] + - [263, 926.0] + - - [4, 5056, 1, 256, 4, 4, 4, 5056] + - [270, 827.0] + - - [4, 5056, 1, 1280, 4, 4, 4, 5056] + - [259, 1109.0] + - - [4, 2368, 1, 3328, 4, 4, 4, 2368] + - [263, 941.0] + - - [4, 1856, 1, 256, 4, 4, 4, 1856] + - [183, 413.0] + - - [4, 2368, 1, 256, 4, 4, 4, 2368] + - [255, 495.0] + - - [4, 2944, 1, 256, 4, 4, 4, 2944] + - [255, 601.0] + - - [4, 4288, 1, 1280, 4, 4, 4, 4288] + - [268, 1008.0] + - - [4, 6784, 1, 128, 4, 4, 4, 6784] + - [267, 742.0] + - - [4, 3584, 1, 1280, 4, 4, 4, 3584] + - [257, 976.0] + - - [4, 5888, 1, 256, 4, 4, 4, 5888] + - [267, 871.0] + - - [4, 6784, 1, 256, 4, 4, 4, 6784] + - [261, 835.0] + - - [4, 1408, 1, 1280, 4, 4, 4, 1408] + - [268, 526.0] + - - [4, 3584, 1, 256, 4, 4, 4, 3584] + - [255, 725.0] + - - [4, 1408, 1, 256, 4, 4, 4, 1408] + - [258, 315.0] + - - [4, 4288, 1, 3328, 4, 4, 4, 4288] + - [263, 1047.0] + - - [4, 5888, 1, 1280, 4, 4, 4, 5888] + - [267, 997.0] + - - [4, 1856, 1, 1280, 4, 4, 4, 1856] + - [268, 624.0] + - - [4, 1856, 1, 128, 4, 4, 4, 1856] + - [178, 297.0] + - - [4, 2944, 1, 128, 4, 4, 4, 2944] + - [161, 433.0] + - - [4, 5056, 1, 128, 4, 4, 4, 5056] + - [255, 703.0] + - - [4, 4288, 1, 256, 4, 4, 4, 4288] + - [255, 754.0] + - - [4, 3584, 1, 3328, 4, 4, 4, 3584] + - [268, 1043.0] + - - [4, 5888, 1, 128, 4, 4, 4, 5888] + - [260, 743.0] + - - [4, 2368, 1, 128, 4, 4, 4, 2368] + - [135, 370.0] + - - [49, 1200, 1, 128, 49, 49, 49, 1200] + - [254, 1863.0] + - - [1, 1152, 1, 256, 1, 1, 1, 1152] + - [256, 66.0] + - - [25, 1152, 1, 256, 25, 25, 25, 1152] + - [155, 1536.0] + - - [9, 1152, 1, 256, 9, 9, 9, 1152] + - [262, 609.0] + - - [16, 32, 36, 5760, 16, 16, 16, 32] + - [221, 2329.0] + - - [3, 64, 36, 6272, 3, 3, 3, 64] + - [226, 527.0] + - - [3, 64, 64, 46208, 3, 3, 3, 64] + - [141, 490.0] + - - [3, 64, 64, 92416, 3, 3, 3, 64] + - [205, 488.0] + - - [1, 16, 36, 23040, 1, 1, 1, 16] + - [219, 155.0] + - - [1, 16, 64, 10240, 1, 1, 1, 16] + - [229, 162.0] + - - [3, 64, 36, 25088, 3, 3, 3, 64] + - [215, 468.0] + - - [3, 64, 64, 11552, 3, 3, 3, 64] + - [234, 498.0] + - - [3, 64, 36, 200704, 3, 3, 3, 64] + - [172, 473.0] + - - [3, 64, 64, 23104, 3, 3, 3, 64] + - [228, 482.0] + - - [3, 64, 36, 100352, 3, 3, 3, 64] + - [205, 479.0] + - - [3, 64, 36, 50176, 3, 3, 3, 64] + - [188, 477.0] + - - [8, 384, 64, 6600, 8, 8, 8, 384] + - [225, 1341.0] + - - [65, 1024, 1, 6400, 65, 65, 65, 1024] + - [220, 3777.0] + - - [13, 512, 1, 32768, 13, 13, 13, 512] + - [235, 1917.0] + - - [256, 1, 1, 32768, 256, 256, 256, 1] + - [222, 111.0] + - - [256, 4, 1, 6912, 256, 256, 256, 4] + - [232, 353.0] + - - [13, 512, 1, 55296, 13, 13, 13, 512] + - [231, 1891.0] + - - [1024, 2, 1, 4992, 1024, 1024, 1024, 2] + - [224, 306.0] + - - [1024, 2, 1, 5120, 1024, 1024, 1024, 2] + - [227, 315.0] + - - [1024, 2, 1, 5248, 1024, 1024, 1024, 2] + - [224, 317.0] + - - [13, 512, 1, 6912, 13, 13, 13, 512] + - [219, 1636.0] + - - [256, 1, 1, 6912, 256, 256, 256, 1] + - [222, 88.0] + - - [256, 128, 1, 6912, 256, 256, 256, 128] + - [233, 4459.0] + - - [768, 2, 1, 4608, 768, 768, 768, 2] + - [227, 301.0] + - - [1024, 2, 1, 4608, 1024, 1024, 1024, 2] + - [222, 311.0] + - - [1024, 64, 1, 512, 1024, 1024, 1024, 64] + - [154, 2938.0] + - - [512, 32, 1, 200, 512, 512, 512, 32] + - [139, 858.0] + - - [4, 704, 1, 1280, 4, 4, 4, 704] + - [172, 264.0] + - - [128, 64, 1, 256, 128, 128, 128, 64] + - [139, 546.0] + - - [64, 4, 1, 256, 64, 64, 64, 4] + - [139, 16.0] + - - [64, 704, 1, 128, 64, 64, 64, 704] + - [199, 1716.0] + - - [448, 64, 1, 1280, 448, 448, 448, 64] + - [201, 2440.0] + - - [128, 4, 1, 1280, 128, 128, 128, 4] + - [207, 52.0] + - - [64, 1024, 1, 1280, 64, 64, 64, 1024] + - [140, 3905.0] + - - [64, 704, 1, 1280, 64, 64, 64, 704] + - [140, 2713.0] + - - [1024, 64, 1, 128, 1024, 1024, 1024, 64] + - [174, 2097.0] + - - [1024, 64, 1, 1280, 1024, 1024, 1024, 64] + - [138, 3898.0] + - - [4, 704, 1, 256, 4, 4, 4, 704] + - [185, 171.0] + - - [704, 4, 1, 1280, 704, 704, 704, 4] + - [142, 280.0] + - - [64, 448, 1, 256, 64, 64, 64, 448] + - [201, 1748.0] + - - [64, 1024, 1, 128, 64, 64, 64, 1024] + - [142, 2118.0] + - - [4, 64, 1, 1280, 4, 4, 4, 64] + - [142, 26.0] + - - [128, 256, 1, 3328, 128, 128, 128, 256] + - [199, 2407.0] + - - [64, 448, 1, 1280, 64, 64, 64, 448] + - [142, 2434.0] + - - [448, 4, 1, 256, 448, 448, 448, 4] + - [203, 102.0] + - - [448, 4, 1, 1280, 448, 448, 448, 4] + - [144, 183.0] + - - [128, 4, 1, 128, 128, 128, 128, 4] + - [139, 21.0] + - - [256, 4, 1, 128, 256, 256, 256, 4] + - [139, 43.0] + - - [704, 64, 1, 3328, 704, 704, 704, 64] + - [204, 2937.0] + - - [64, 128, 1, 256, 64, 64, 64, 128] + - [170, 543.0] + - - [704, 64, 1, 128, 704, 704, 704, 64] + - [163, 1716.0] + - - [1024, 4, 1, 256, 1024, 1024, 1024, 4] + - [185, 245.0] + - - [256, 256, 1, 128, 256, 256, 256, 256] + - [142, 2007.0] + - - [64, 256, 1, 128, 64, 64, 64, 256] + - [148, 599.0] + - - [704, 64, 1, 1280, 704, 704, 704, 64] + - [140, 2568.0] + - - [128, 448, 1, 256, 128, 128, 128, 448] + - [197, 2172.0] + - - [512, 32, 1, 512, 512, 512, 512, 32] + - [155, 1202.0] + - - [128, 256, 1, 1280, 128, 128, 128, 256] + - [137, 2322.0] + - - [448, 64, 1, 3328, 448, 448, 448, 64] + - [201, 2545.0] + - - [256, 128, 1, 128, 256, 256, 256, 128] + - [133, 1146.0] + - - [64, 128, 1, 3328, 64, 64, 64, 128] + - [144, 919.0] + - - [128, 128, 1, 3328, 128, 128, 128, 128] + - [144, 1822.0] + - - [256, 128, 1, 256, 256, 256, 256, 128] + - [170, 1583.0] + - - [64, 448, 1, 3328, 64, 64, 64, 448] + - [201, 2554.0] + - - [256, 256, 1, 3328, 256, 256, 256, 256] + - [187, 4162.0] + - - [1024, 4, 1, 3328, 1024, 1024, 1024, 4] + - [142, 433.0] + - - [4, 4, 1, 256, 4, 4, 4, 4] + - [129, 1.0] + - - [256, 64, 1, 256, 256, 256, 256, 64] + - [139, 958.0] + - - [256, 128, 1, 1280, 256, 256, 256, 128] + - [168, 2335.0] + - - [128, 64, 1, 1280, 128, 128, 128, 64] + - [139, 785.0] + - - [4, 448, 1, 3328, 4, 4, 4, 448] + - [144, 197.0] + - - [64, 1024, 1, 256, 64, 64, 64, 1024] + - [179, 2330.0] + - - [256, 4, 1, 1280, 256, 256, 256, 4] + - [139, 96.0] + - - [64, 704, 1, 256, 64, 64, 64, 704] + - [195, 1929.0] + - - [4, 704, 1, 128, 4, 4, 4, 704] + - [211, 99.0] + - - [512, 16, 1, 512, 512, 512, 512, 16] + - [185, 589.0] + - - [448, 128, 1, 256, 448, 448, 448, 128] + - [179, 2198.0] + - - [448, 64, 1, 128, 448, 448, 448, 64] + - [148, 1025.0] + - - [4, 448, 1, 1280, 4, 4, 4, 448] + - [139, 165.0] + - - [256, 256, 1, 256, 256, 256, 256, 256] + - [181, 2453.0] + - - [256, 64, 1, 128, 256, 256, 256, 64] + - [209, 579.0] + - - [4, 1024, 1, 3328, 4, 4, 4, 1024] + - [205, 396.0] + - - [64, 1024, 1, 3328, 64, 64, 64, 1024] + - [140, 4172.0] + - - [704, 4, 1, 128, 704, 704, 704, 4] + - [146, 97.0] + - - [256, 4, 1, 256, 256, 256, 256, 4] + - [139, 52.0] + - - [256, 4, 1, 3328, 256, 256, 256, 4] + - [144, 113.0] + - - [4, 256, 1, 256, 4, 4, 4, 256] + - [139, 51.0] + - - [4, 4, 1, 128, 4, 4, 4, 4] + - [131, 0.48] + - - [4, 128, 1, 256, 4, 4, 4, 128] + - [139, 25.0] + - - [64, 64, 1, 1280, 64, 64, 64, 64] + - [139, 394.0] + - - [448, 128, 1, 3328, 448, 448, 448, 128] + - [140, 3664.0] + - - [4, 448, 1, 128, 4, 4, 4, 448] + - [146, 63.0] + - - [64, 256, 1, 1280, 64, 64, 64, 256] + - [139, 1572.0] + - - [1024, 32, 1, 512, 1024, 1024, 1024, 32] + - [203, 1920.0] + - - [4, 128, 1, 3328, 4, 4, 4, 128] + - [144, 56.0] + - - [64, 4, 1, 128, 64, 64, 64, 4] + - [129, 8.0] + - - [64, 64, 1, 256, 64, 64, 64, 64] + - [209, 223.0] + - - [4, 704, 1, 3328, 4, 4, 4, 704] + - [172, 285.0] + - - [4, 4, 1, 1280, 4, 4, 4, 4] + - [129, 1.0] + - - [128, 128, 1, 128, 128, 128, 128, 128] + - [203, 592.0] + - - [1024, 4, 1, 128, 1024, 1024, 1024, 4] + - [146, 140.0] + - - [64, 64, 1, 3328, 64, 64, 64, 64] + - [207, 460.0] + - - [4, 64, 1, 128, 4, 4, 4, 64] + - [146, 9.0] + - - [64, 128, 1, 1280, 64, 64, 64, 128] + - [139, 786.0] + - - [128, 128, 1, 1280, 128, 128, 128, 128] + - [155, 1574.0] + - - [128, 256, 1, 256, 128, 128, 128, 256] + - [170, 1607.0] + - - [256, 64, 1, 1280, 256, 256, 256, 64] + - [155, 1572.0] + - - [1024, 4, 1, 1280, 1024, 1024, 1024, 4] + - [142, 362.0] + - - [704, 64, 1, 256, 704, 704, 704, 64] + - [163, 1922.0] + - - [128, 448, 1, 1280, 128, 128, 128, 448] + - [187, 3231.0] + - - [128, 64, 1, 3328, 128, 128, 128, 64] + - [144, 923.0] + - - [448, 64, 1, 256, 448, 448, 448, 64] + - [148, 1486.0] + - - [1024, 16, 1, 512, 1024, 1024, 1024, 16] + - [151, 1155.0] + - - [4, 256, 1, 128, 4, 4, 4, 256] + - [211, 37.0] + - - [1024, 64, 1, 256, 1024, 1024, 1024, 64] + - [164, 2267.0] + - - [64, 128, 1, 128, 64, 64, 64, 128] + - [211, 308.0] + - - [4, 4, 1, 3328, 4, 4, 4, 4] + - [137, 2.0] + - - [4, 1024, 1, 1280, 4, 4, 4, 1024] + - [172, 345.0] + - - [704, 4, 1, 256, 704, 704, 704, 4] + - [203, 142.0] + - - [128, 4, 1, 3328, 128, 128, 128, 4] + - [159, 56.0] + - - [448, 4, 1, 3328, 448, 448, 448, 4] + - [176, 197.0] + - - [704, 4, 1, 3328, 704, 704, 704, 4] + - [142, 299.0] + - - [448, 128, 1, 1280, 448, 448, 448, 128] + - [171, 3239.0] + - - [1024, 64, 1, 3328, 1024, 1024, 1024, 64] + - [140, 4164.0] + - - [4, 1024, 1, 128, 4, 4, 4, 1024] + - [180, 145.0] + - - [64, 256, 1, 3328, 64, 64, 64, 256] + - [144, 1819.0] + - - [448, 128, 1, 128, 448, 448, 448, 128] + - [133, 1764.0] + - - [128, 256, 1, 128, 128, 128, 128, 256] + - [139, 1128.0] + - - [128, 4, 1, 256, 128, 128, 128, 4] + - [139, 26.0] + - - [256, 256, 1, 1280, 256, 256, 256, 256] + - [204, 3673.0] + - - [256, 128, 1, 3328, 256, 256, 256, 128] + - [170, 2371.0] + - - [448, 4, 1, 128, 448, 448, 448, 4] + - [146, 61.0] + - - [4, 256, 1, 3328, 4, 4, 4, 256] + - [144, 113.0] + - - [4, 128, 1, 128, 4, 4, 4, 128] + - [146, 18.0] + - - [4, 256, 1, 1280, 4, 4, 4, 256] + - [139, 96.0] + - - [64, 4, 1, 3328, 64, 64, 64, 4] + - [142, 28.0] + - - [4, 64, 1, 3328, 4, 4, 4, 64] + - [142, 28.0] + - - [4, 1024, 1, 256, 4, 4, 4, 1024] + - [139, 199.0] + - - [64, 256, 1, 256, 64, 64, 64, 256] + - [170, 912.0] + - - [4, 64, 1, 256, 4, 4, 4, 64] + - [137, 12.0] + - - [128, 448, 1, 128, 128, 128, 128, 448] + - [133, 1782.0] + - - [64, 448, 1, 128, 64, 64, 64, 448] + - [146, 1025.0] + - - [64, 704, 1, 3328, 64, 64, 64, 704] + - [140, 2903.0] + - - [128, 448, 1, 3328, 128, 128, 128, 448] + - [171, 3659.0] + - - [4, 448, 1, 256, 4, 4, 4, 448] + - [185, 90.0] + - - [4, 128, 1, 1280, 4, 4, 4, 128] + - [139, 47.0] + - - [128, 64, 1, 128, 128, 128, 128, 64] + - [170, 298.0] + - - [64, 64, 1, 128, 64, 64, 64, 64] + - [211, 154.0] + - - [64, 4, 1, 1280, 64, 64, 64, 4] + - [155, 24.0] + - - [256, 64, 1, 3328, 256, 256, 256, 64] + - [144, 1821.0] + - - [128, 128, 1, 256, 128, 128, 128, 128] + - [170, 892.0] + - - [64, 23, 2720, 23, 64, 64, 64, 23] + - [132, 3731.0] + - - [64, 19, 3264, 19, 64, 64, 64, 19] + - [164, 3024.0] + - - [64, 25, 2512, 25, 64, 64, 64, 25] + - [132, 3965.0] + - - [64, 9, 6544, 9, 64, 64, 64, 9] + - [132, 1307.0] + - - [64, 7, 8192, 7, 64, 64, 64, 7] + - [132, 899.0] + - - [64, 8, 7280, 8, 64, 64, 64, 8] + - [196, 1124.0] + - - [64, 27, 2336, 27, 64, 64, 64, 27] + - [214, 4294.0] + - - [64, 16, 3840, 16, 64, 64, 64, 16] + - [201, 2824.0] + - - [64, 11, 5456, 11, 64, 64, 64, 11] + - [132, 1725.0] + - - [64, 21, 2976, 21, 64, 64, 64, 21] + - [132, 3392.0] + - - [64, 15, 4096, 15, 64, 64, 64, 15] + - [196, 2516.0] + - - [64, 10, 5952, 10, 64, 64, 64, 10] + - [132, 1503.0] + - - [64, 14, 4368, 14, 64, 64, 64, 14] + - [132, 2344.0] + - - [64, 13, 4672, 13, 64, 64, 64, 13] + - [196, 2134.0] + - - [64, 12, 5040, 12, 64, 64, 64, 12] + - [132, 1945.0] + - - [64, 29, 2176, 29, 64, 64, 64, 29] + - [138, 4550.0] + - - [64, 17, 3632, 17, 64, 64, 64, 17] + - [132, 2722.0] + - - [64, 18, 3440, 18, 64, 64, 64, 18] + - [181, 2862.0] + - - [768, 2, 1, 16, 768, 768, 768, 2] + - [144, 13.0] + - - [768, 2, 1, 32, 768, 768, 768, 2] + - [131, 19.0] + - - [3, 64, 64, 2888, 3, 3, 3, 64] + - [172, 498.0] + - - [1, 16, 64, 640, 1, 1, 1, 16] + - [166, 69.0] + - - [512, 24, 36, 800, 512, 512, 512, 24] + - [202, 4560.0] + - - [16, 32, 36, 360, 16, 16, 16, 32] + - [135, 1018.0] + - - [1, 16, 36, 1440, 1, 1, 1, 16] + - [142, 52.0] + - - [512, 24, 64, 512, 512, 512, 512, 24] + - [202, 4746.0] + - - [3, 64, 36, 3136, 3, 3, 3, 64] + - [205, 488.0] + - - [256, 24, 64, 32, 256, 256, 256, 24] + - [132, 2208.0] + - - [256, 16, 36, 3200, 256, 256, 256, 16] + - [185, 2905.0] + - - [256, 16, 36, 32, 256, 256, 256, 16] + - [131, 1542.0] + - - [512, 24, 36, 288, 512, 512, 512, 24] + - [202, 4304.0] + - - [512, 24, 64, 128, 512, 512, 512, 24] + - [132, 4317.0] + - - [3, 64, 64, 1444, 3, 3, 3, 64] + - [141, 480.0] + - - [16, 32, 64, 160, 16, 16, 16, 32] + - [199, 1219.0] + - - [256, 16, 64, 32, 256, 256, 256, 16] + - [137, 1978.0] + - - [256, 16, 64, 1568, 256, 256, 256, 16] + - [190, 3272.0] + - - [256, 24, 36, 128, 256, 256, 256, 24] + - [132, 3031.0] + - - [16, 32, 64, 2560, 16, 16, 16, 32] + - [166, 2319.0] + - - [49, 800, 1, 128, 49, 49, 49, 800] + - [146, 1248.0] + - - [64, 12, 2520, 12, 64, 64, 64, 12] + - [164, 1713.0] + - - [64, 13, 2336, 13, 64, 64, 64, 13] + - [132, 1923.0] + - - [64, 14, 2184, 14, 64, 64, 64, 14] + - [132, 2101.0] + - - [64, 15, 2048, 15, 64, 64, 64, 15] + - [132, 2248.0] + - - [64, 16, 1920, 16, 64, 64, 64, 16] + - [137, 2604.0] + - - [64, 17, 1816, 17, 64, 64, 64, 17] + - [132, 2389.0] + - - [64, 18, 1720, 18, 64, 64, 64, 18] + - [132, 2484.0] + - - [64, 19, 1632, 19, 64, 64, 64, 19] + - [132, 2648.0] + - - [64, 21, 1488, 21, 64, 64, 64, 21] + - [132, 2925.0] + - - [64, 23, 1360, 23, 64, 64, 64, 23] + - [132, 3162.0] + - - [64, 25, 1256, 25, 64, 64, 64, 25] + - [138, 3470.0] + - - [64, 27, 1168, 27, 64, 64, 64, 27] + - [138, 3692.0] + - - [64, 29, 1088, 29, 64, 64, 64, 29] + - [138, 3920.0] + - - [1024, 2, 1, 512, 1024, 1024, 1024, 2] + - [137, 139.0] + - - [1024, 2, 1, 3072, 1024, 1024, 1024, 2] + - [142, 214.0] + - - [1024, 2, 1, 6, 1024, 1024, 1024, 2] + - [129, 7.0] + - - [3, 64, 512, 3, 3, 3, 3, 64] + - [135, 91.0] + - - [9, 64, 512, 9, 9, 9, 9, 64] + - [135, 664.0] + - - [1024, 1, 1, 200, 1024, 1024, 1024, 1] + - [139, 44.0] + - - [5, 64, 512, 5, 5, 5, 5, 64] + - [135, 237.0] + - - [1024, 2, 1, 1, 1024, 1024, 1024, 2] + - [129, 1.0] + - - [1024, 2, 1, 2048, 1024, 1024, 1024, 2] + - [142, 200.0] + - - [17, 64, 1, 15, 17, 17, 17, 64] + - [135, 8.0] + - - [17, 64, 1, 17, 17, 17, 17, 64] + - [144, 10.0] + - - [30, 64, 1, 30, 30, 30, 30, 64] + - [172, 26.0] + - - [30, 64, 1, 31, 30, 30, 30, 64] + - [144, 28.0] + - - [31, 64, 1, 31, 31, 31, 31, 64] + - [144, 29.0] + - - [64, 17, 1, 15, 64, 64, 64, 17] + - [201, 9.0] + - - [64, 17, 1, 17, 64, 64, 64, 17] + - [142, 9.0] + - - [64, 30, 1, 30, 64, 64, 64, 30] + - [144, 27.0] + - - [64, 30, 1, 31, 64, 64, 64, 30] + - [157, 28.0] + - - [64, 31, 1, 31, 64, 64, 64, 31] + - [144, 28.0] + - - [14, 64, 1, 14, 14, 14, 14, 64] + - [144, 7.0] + - - [15, 64, 1, 14, 15, 15, 15, 64] + - [137, 7.0] + - - [15, 64, 1, 15, 15, 15, 15, 64] + - [139, 8.0] + - - [64, 14, 1, 14, 64, 64, 64, 14] + - [137, 7.0] + - - [64, 15, 1, 14, 64, 64, 64, 15] + - [188, 8.0] + - - [64, 15, 1, 15, 64, 64, 64, 15] + - [172, 8.0] + - - [1024, 2, 1, 32, 1024, 1024, 1024, 2] + - [129, 26.0] + - - [1024, 2, 1, 4, 1024, 1024, 1024, 2] + - [129, 5.0] + - - [512, 32, 1, 1600, 512, 512, 512, 32] + - [144, 1618.0] + - - [1024, 64, 1, 960, 1024, 1024, 1024, 64] + - [154, 3472.0] + - - [512, 64, 1, 512, 512, 512, 512, 64] + - [199, 1928.0] + - - [384, 128, 1, 128, 384, 384, 384, 128] + - [163, 1622.0] + - - [384, 128, 1, 256, 384, 384, 384, 128] + - [163, 2070.0] + - - [64, 64, 1, 64, 64, 64, 64, 64] + - [129, 99.0] + - - [256, 4, 1, 4096, 256, 256, 256, 4] + - [144, 116.0] + - - [25, 256, 120, 128, 25, 25, 25, 256] + - [210, 4241.0] + - - [25, 256, 18, 128, 25, 25, 25, 256] + - [179, 2194.0] + - - [25, 256, 19, 128, 25, 25, 25, 256] + - [179, 2289.0] + - - [9, 256, 120, 128, 9, 9, 9, 256] + - [135, 1644.0] + - - [9, 256, 18, 128, 9, 9, 9, 256] + - [199, 1002.0] + - - [9, 256, 19, 128, 9, 9, 9, 256] + - [135, 1065.0] + - - [1024, 2, 1, 10, 1024, 1024, 1024, 2] + - [135, 11.0] + - - [1024, 2, 1, 1280, 1024, 1024, 1024, 2] + - [142, 179.0] + - - [1024, 2, 1, 39, 1024, 1024, 1024, 2] + - [131, 30.0] + - - [1024, 2, 1, 40, 1024, 1024, 1024, 2] + - [131, 30.0] + - - [1024, 2, 1, 41, 1024, 1024, 1024, 2] + - [197, 31.0] + - - [1024, 2, 1, 5, 1024, 1024, 1024, 2] + - [129, 6.0] + - - [1024, 2, 1, 2560, 1024, 1024, 1024, 2] + - [142, 210.0] + - - [1024, 2, 1, 8, 1024, 1024, 1024, 2] + - [131, 9.0] + - - [1024, 2, 1, 1024, 1024, 1024, 1024, 2] + - [137, 169.0] + - - [1024, 2, 1, 9, 1024, 1024, 1024, 2] + - [135, 10.0] + - - [1024, 2, 1, 1152, 1024, 1024, 1024, 2] + - [185, 174.0] + - - [4, 64, 32768, 4, 4, 4, 4, 64] + - [147, 350.0] + - - [4, 64, 38400, 4, 4, 4, 4, 64] + - [130, 358.0] + - - [64, 4, 32768, 4, 64, 64, 64, 4] + - [194, 368.0] + - - [64, 4, 38400, 4, 64, 64, 64, 4] + - [132, 359.0] + - - [14, 64, 10880, 14, 14, 14, 14, 64] + - [194, 2245.0] + - - [15, 64, 10880, 14, 15, 15, 15, 64] + - [162, 2266.0] + - - [15, 64, 7680, 15, 15, 15, 15, 64] + - [194, 2548.0] + - - [15, 64, 10880, 15, 15, 15, 15, 64] + - [194, 2309.0] + - - [17, 64, 7680, 15, 17, 17, 17, 64] + - [162, 2022.0] + - - [17, 64, 6144, 17, 17, 17, 17, 64] + - [194, 2284.0] + - - [17, 64, 7680, 17, 17, 17, 17, 64] + - [194, 2326.0] + - - [21, 64, 6144, 17, 21, 21, 21, 64] + - [130, 2493.0] + - - [21, 64, 6144, 21, 21, 21, 21, 64] + - [179, 2948.0] + - - [24, 64, 4736, 24, 24, 24, 24, 64] + - [162, 3899.0] + - - [30, 64, 2048, 30, 30, 30, 30, 64] + - [200, 3759.0] + - - [30, 64, 2048, 31, 30, 30, 30, 64] + - [200, 3812.0] + - - [31, 64, 2048, 31, 31, 31, 31, 64] + - [130, 3902.0] + - - [64, 14, 10880, 14, 64, 64, 64, 14] + - [181, 2281.0] + - - [64, 15, 10880, 14, 64, 64, 64, 15] + - [132, 2409.0] + - - [64, 15, 7680, 15, 64, 64, 64, 15] + - [149, 2650.0] + - - [64, 15, 10880, 15, 64, 64, 64, 15] + - [196, 2507.0] + - - [64, 17, 7680, 15, 64, 64, 64, 17] + - [132, 2794.0] + - - [64, 17, 6144, 17, 64, 64, 64, 17] + - [132, 2908.0] + - - [64, 17, 7680, 17, 64, 64, 64, 17] + - [132, 2936.0] + - - [64, 21, 6144, 17, 64, 64, 64, 21] + - [149, 3136.0] + - - [64, 21, 6144, 21, 64, 64, 64, 21] + - [132, 3726.0] + - - [64, 24, 4736, 24, 64, 64, 64, 24] + - [132, 4191.0] + - - [64, 30, 2048, 30, 64, 64, 64, 30] + - [186, 4704.0] + - - [64, 30, 2048, 31, 64, 64, 64, 30] + - [138, 4777.0] + - - [64, 31, 2048, 31, 64, 64, 64, 31] + - [138, 4815.0] + - - [64, 512, 1, 512, 64, 64, 64, 512] + - [203, 1907.0] + - - [5, 64, 1, 5, 5, 5, 5, 64] + - [129, 1.0] + - - [33, 32, 1, 33, 33, 33, 33, 32] + - [131, 13.0] + - - [1024, 1, 1, 1600, 1024, 1024, 1024, 1] + - [142, 95.0] + - - [5, 64, 960, 5, 5, 5, 5, 64] + - [162, 324.0] + - - [27, 128, 32768, 27, 27, 27, 27, 128] + - [192, 2008.0] + - - [1024, 2, 1, 16, 1024, 1024, 1024, 2] + - [144, 17.0] + - - [1024, 2, 1, 64, 1024, 1024, 1024, 2] + - [146, 43.0] + - - [13, 512, 1, 3456, 13, 13, 13, 512] + - [176, 734.0] + - - [13, 512, 1, 4096, 13, 13, 13, 512] + - [144, 750.0] + - - [13, 512, 1, 864, 13, 13, 13, 512] + - [139, 567.0] + - - [256, 1, 1, 3456, 256, 256, 256, 1] + - [139, 28.0] + - - [256, 1, 1, 4096, 256, 256, 256, 1] + - [144, 29.0] + - - [256, 1, 1, 864, 256, 256, 256, 1] + - [139, 22.0] + - - [256, 128, 1, 3456, 256, 256, 256, 128] + - [139, 2372.0] + - - [256, 128, 1, 4096, 256, 256, 256, 128] + - [185, 2524.0] + - - [256, 128, 1, 864, 256, 256, 256, 128] + - [139, 2125.0] + - - [1024, 2, 1, 80, 1024, 1024, 1024, 2] + - [131, 50.0] + - - [1024, 2, 1, 82, 1024, 1024, 1024, 2] + - [131, 52.0] + - - [1024, 2, 1, 12, 1024, 1024, 1024, 2] + - [137, 14.0] + - - [64, 24, 6816, 24, 64, 64, 64, 24] + - [181, 3867.0] + - - [64, 26, 6272, 26, 64, 64, 64, 26] + - [132, 3948.0] + - - [1024, 2, 1, 128, 1024, 1024, 1024, 2] + - [146, 70.0] + - - [1024, 2, 1, 96, 1024, 1024, 1024, 2] + - [146, 61.0] + - - [768, 2, 1, 2048, 768, 768, 768, 2] + - [142, 151.0] + - - [1024, 81, 1, 1024, 1024, 1024, 1024, 81] + - [130, 3670.0] + - - [2, 1024, 1, 6, 2, 2, 2, 1024] + - [129, 7.0] + - - [1024, 2, 1, 20, 1024, 1024, 1024, 2] + - [144, 20.0] +- null +- null +- DeviceEfficiency +... diff --git a/library/src/blas3/Tensile/Logic/asm_full/navi22_Cijk_Ailk_Bljk_HB.yaml b/library/src/blas3/Tensile/Logic/asm_full/navi22_Cijk_Ailk_Bljk_HB.yaml new file mode 100644 index 000000000..35f9e7c8c --- /dev/null +++ b/library/src/blas3/Tensile/Logic/asm_full/navi22_Cijk_Ailk_Bljk_HB.yaml @@ -0,0 +1,29651 @@ +--- +- {MinimumRequiredVersion: 4.28.0} +- navi22 +- gfx1031 +- [Device 73df] +- AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] +- - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 0 + SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x64x8_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 128 + LVCA: 16 + LVCB: 1 + LVPA: 1 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 1 + SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x8_SN_SU0_SUM0_TT8_16_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 256 + LSCB: 8 + LSPA: 4 + LSPB: 128 + LVCA: 32 + LVCB: 1 + LVPA: 1 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 256 + MacroTile1: 128 + MacroTileA: 256 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 256 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 2 + SolutionNameMin: Cijk_Ailk_Bljk_HB_MT256x128x8_SN_SU0_SUM0_TT16_16_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [16, 16] + ThreadTile0: 16 + ThreadTile1: 16 + ThreadTileA: 16 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 8 + LSPB: 64 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 3 + SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x64x16_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 8 + LSPB: 64 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 4 + SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x16_SN_SU0_SUM0_TT8_16_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 256 + LSCB: 16 + LSPA: 4 + LSPB: 64 + LVCA: 32 + LVCB: 2 + LVPA: 1 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 256 + MacroTile1: 128 + MacroTileA: 256 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 256 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 5 + SolutionNameMin: Cijk_Ailk_Bljk_HB_MT256x128x16_SN_SU0_SUM0_TT16_16_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [16, 16] + ThreadTile0: 16 + ThreadTile1: 16 + ThreadTileA: 16 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 16 + LSPB: 128 + LVCA: 16 + LVCB: 2 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 6 + SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x16_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 16 + LSPB: 128 + LVCA: 16 + LVCB: 2 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 256 + MacroTileA: 128 + MacroTileB: 256 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 7 + SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x256x16_SN_SU0_SUM0_TT8_16_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 8 + SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x64x8_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 8 + LSPB: 64 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 9 + SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x64x16_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 8 + LSPB: 64 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 10 + SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x16_SN_SU32_SUM3_TT8_16_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 256 + LSCB: 16 + LSPA: 4 + LSPB: 64 + LVCA: 32 + LVCB: 2 + LVPA: 1 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 256 + MacroTile1: 128 + MacroTileA: 256 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 256 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 11 + SolutionNameMin: Cijk_Ailk_Bljk_HB_MT256x128x16_SN_SU32_SUM3_TT16_16_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [16, 16] + ThreadTile0: 16 + ThreadTile1: 16 + ThreadTileA: 16 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 16 + LSPB: 128 + LVCA: 16 + LVCB: 2 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 12 + SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x16_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 16 + LSPB: 128 + LVCA: 16 + LVCB: 2 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 256 + MacroTileA: 128 + MacroTileB: 256 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 13 + SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x256x16_SN_SU32_SUM3_TT8_16_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 14 + SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x64x8_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 128 + LVCA: 16 + LVCB: 1 + LVPA: 1 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 15 + SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x8_SN_SU0_SUM0_TT8_16_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 256 + LSCB: 8 + LSPA: 4 + LSPB: 128 + LVCA: 32 + LVCB: 1 + LVPA: 1 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 256 + MacroTile1: 128 + MacroTileA: 256 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 256 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 16 + SolutionNameMin: Cijk_Ailk_Bljk_HB_MT256x128x8_SN_SU0_SUM0_TT16_16_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [16, 16] + ThreadTile0: 16 + ThreadTile1: 16 + ThreadTileA: 16 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 8 + LSPB: 64 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 17 + SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x64x16_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 8 + LSPB: 64 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 18 + SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x16_SN_SU0_SUM0_TT8_16_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 256 + LSCB: 16 + LSPA: 4 + LSPB: 64 + LVCA: 32 + LVCB: 2 + LVPA: 1 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 256 + MacroTile1: 128 + MacroTileA: 256 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 256 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 19 + SolutionNameMin: Cijk_Ailk_Bljk_HB_MT256x128x16_SN_SU0_SUM0_TT16_16_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [16, 16] + ThreadTile0: 16 + ThreadTile1: 16 + ThreadTileA: 16 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 16 + LSPB: 128 + LVCA: 16 + LVCB: 2 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 20 + SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x16_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 16 + LSPB: 128 + LVCA: 16 + LVCB: 2 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 256 + MacroTileA: 128 + MacroTileB: 256 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 21 + SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x256x16_SN_SU0_SUM0_TT8_16_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 8 + LSPB: 32 + LVCA: 16 + LVCB: 4 + LVPA: 1 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 22 + SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x32_SN_SU0_SUM0_TT8_16_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 8 + LSPB: 64 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 23 + SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x64x16_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 8 + LSPB: 64 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 24 + SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x16_SN_SU32_SUM3_TT8_16_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 256 + LSCB: 16 + LSPA: 4 + LSPB: 64 + LVCA: 32 + LVCB: 2 + LVPA: 1 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 256 + MacroTile1: 128 + MacroTileA: 256 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 256 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 25 + SolutionNameMin: Cijk_Ailk_Bljk_HB_MT256x128x16_SN_SU32_SUM3_TT16_16_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [16, 16] + ThreadTile0: 16 + ThreadTile1: 16 + ThreadTileA: 16 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 16 + LSPB: 128 + LVCA: 16 + LVCB: 2 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 256 + MacroTileA: 128 + MacroTileB: 256 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 26 + SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x256x16_SN_SU32_SUM3_TT8_16_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 27 + SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x64x8_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 256 + LSCB: 8 + LSPA: 4 + LSPB: 128 + LVCA: 32 + LVCB: 1 + LVPA: 1 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 256 + MacroTile1: 128 + MacroTileA: 256 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 256 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 28 + SolutionNameMin: Cijk_Ailk_Bljk_HB_MT256x128x8_SN_SU0_SUM0_TT16_16_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [16, 16] + ThreadTile0: 16 + ThreadTile1: 16 + ThreadTileA: 16 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 8 + LSPB: 64 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 29 + SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x64x16_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 8 + LSPB: 64 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 30 + SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x16_SN_SU0_SUM0_TT8_16_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 256 + LSCB: 16 + LSPA: 4 + LSPB: 64 + LVCA: 32 + LVCB: 2 + LVPA: 1 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 256 + MacroTile1: 128 + MacroTileA: 256 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 256 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 31 + SolutionNameMin: Cijk_Ailk_Bljk_HB_MT256x128x16_SN_SU0_SUM0_TT16_16_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [16, 16] + ThreadTile0: 16 + ThreadTile1: 16 + ThreadTileA: 16 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 16 + LSPB: 128 + LVCA: 16 + LVCB: 2 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 32 + SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x16_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 16 + LSPB: 128 + LVCA: 16 + LVCB: 2 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 256 + MacroTileA: 128 + MacroTileB: 256 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 33 + SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x256x16_SN_SU0_SUM0_TT8_16_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 8 + LSPB: 64 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 34 + SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x16_SN_SU32_SUM3_TT8_16_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 256 + LSCB: 16 + LSPA: 4 + LSPB: 64 + LVCA: 32 + LVCB: 2 + LVPA: 1 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 256 + MacroTile1: 128 + MacroTileA: 256 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 256 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 35 + SolutionNameMin: Cijk_Ailk_Bljk_HB_MT256x128x16_SN_SU32_SUM3_TT16_16_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [16, 16] + ThreadTile0: 16 + ThreadTile1: 16 + ThreadTileA: 16 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 16 + LSPB: 128 + LVCA: 16 + LVCB: 2 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 36 + SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x16_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 16 + LSPB: 128 + LVCA: 16 + LVCB: 2 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 256 + MacroTileA: 128 + MacroTileB: 256 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 37 + SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x256x16_SN_SU32_SUM3_TT8_16_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 16 + LSPB: 64 + LVCA: 16 + LVCB: 4 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 38 + SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x32_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 8 + LVCB: 1 + LVPA: 1 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 39 + SolutionNameMin: Cijk_Ailk_Bljk_HB_MT64x64x8_SN_SU0_SUM0_TT8_8_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 40 + SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x64x8_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 8 + LSPB: 32 + LVCA: 8 + LVCB: 2 + LVPA: 1 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 41 + SolutionNameMin: Cijk_Ailk_Bljk_HB_MT64x64x16_SN_SU0_SUM0_TT8_8_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 8 + LSPB: 64 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 42 + SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x64x16_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 16 + LSPB: 128 + LVCA: 16 + LVCB: 2 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 43 + SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x16_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 44 + SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x64x8_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 8 + LSPB: 32 + LVCA: 8 + LVCB: 2 + LVPA: 1 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 45 + SolutionNameMin: Cijk_Ailk_Bljk_HB_MT64x64x16_SN_SU32_SUM3_TT8_8_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 8 + LSPB: 64 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 46 + SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x64x16_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 16 + LSPB: 128 + LVCA: 16 + LVCB: 2 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 47 + SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x16_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 48 + SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x64x8_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 8 + LSPB: 32 + LVCA: 8 + LVCB: 2 + LVPA: 1 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 49 + SolutionNameMin: Cijk_Ailk_Bljk_HB_MT64x64x16_SN_SU0_SUM0_TT8_8_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 8 + LSPB: 64 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 50 + SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x64x16_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 16 + LSPB: 128 + LVCA: 16 + LVCB: 2 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 51 + SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x16_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 8 + LSPB: 32 + LVCA: 16 + LVCB: 4 + LVPA: 1 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 52 + SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x64x32_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 53 + SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x64x8_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 8 + LSPB: 32 + LVCA: 8 + LVCB: 2 + LVPA: 1 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 54 + SolutionNameMin: Cijk_Ailk_Bljk_HB_MT64x64x16_SN_SU32_SUM3_TT8_8_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 8 + LSPB: 64 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 55 + SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x64x16_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 16 + LSPB: 128 + LVCA: 16 + LVCB: 2 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 56 + SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x16_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 57 + SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x64x8_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 8 + LSPB: 32 + LVCA: 8 + LVCB: 2 + LVPA: 1 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 58 + SolutionNameMin: Cijk_Ailk_Bljk_HB_MT64x64x16_SN_SU0_SUM0_TT8_8_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 8 + LSPB: 64 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 59 + SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x64x16_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 16 + LSPB: 128 + LVCA: 16 + LVCB: 2 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 60 + SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x16_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 8 + LSPB: 32 + LVCA: 16 + LVCB: 4 + LVPA: 1 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 61 + SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x64x32_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 62 + SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x64x8_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 8 + LSPB: 32 + LVCA: 8 + LVCB: 2 + LVPA: 1 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 63 + SolutionNameMin: Cijk_Ailk_Bljk_HB_MT64x64x16_SN_SU32_SUM3_TT8_8_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 8 + LSPB: 64 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 64 + SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x64x16_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 16 + LSPB: 128 + LVCA: 16 + LVCB: 2 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 65 + SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x16_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 8 + LSPB: 16 + LVCA: 8 + LVCB: 4 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 66 + SolutionNameMin: Cijk_Ailk_Bljk_HB_MT64x64x32_SN_SU32_SUM3_TT8_8_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 4 + LSPB: 32 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 67 + SolutionNameMin: Cijk_Ailk_Bljk_HB_MT64x32x8_SN_SU0_SUM0_TT4_4_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 4 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 68 + SolutionNameMin: Cijk_Ailk_Bljk_HB_MT64x64x8_SN_SU0_SUM0_TT4_4_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 8 + LSPB: 16 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 69 + SolutionNameMin: Cijk_Ailk_Bljk_HB_MT32x16x16_SN_SU0_SUM0_TT2_2_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 4 + LSPB: 16 + LVCA: 32 + LVCB: 8 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 70 + SolutionNameMin: Cijk_Ailk_Bljk_HB_MT64x32x16_SN_SU0_SUM0_TT4_4_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 8 + LSPB: 32 + LVCA: 32 + LVCB: 8 + LVPA: 4 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 71 + SolutionNameMin: Cijk_Ailk_Bljk_HB_MT64x64x16_SN_SU0_SUM0_TT4_4_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 32 + LSPA: 8 + LSPB: 4 + LVCA: 8 + LVCB: 16 + LVPA: 4 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 72 + SolutionNameMin: Cijk_Ailk_Bljk_HB_MT16x16x32_SN_SU0_SUM0_TT2_2_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 73 + SolutionNameMin: Cijk_Ailk_Bljk_HB_MT32x16x32_SN_SU0_SUM0_TT2_2_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 8 + LSPA: 8 + LSPB: 16 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 896 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 74 + SolutionNameMin: Cijk_Ailk_Bljk_HB_MT32x16x8_SN_SU32_SUM3_TT2_2_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 4 + LSPB: 32 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 75 + SolutionNameMin: Cijk_Ailk_Bljk_HB_MT64x32x8_SN_SU32_SUM3_TT4_4_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 4 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 76 + SolutionNameMin: Cijk_Ailk_Bljk_HB_MT64x64x8_SN_SU32_SUM3_TT4_4_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 77 + SolutionNameMin: Cijk_Ailk_Bljk_HB_MT16x16x16_SN_SU32_SUM3_TT2_2_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 8 + LSPB: 16 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 78 + SolutionNameMin: Cijk_Ailk_Bljk_HB_MT32x16x16_SN_SU32_SUM3_TT2_2_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 4 + LSPB: 16 + LVCA: 32 + LVCB: 8 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 79 + SolutionNameMin: Cijk_Ailk_Bljk_HB_MT64x32x16_SN_SU32_SUM3_TT4_4_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 8 + LSPB: 32 + LVCA: 32 + LVCB: 8 + LVPA: 4 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 80 + SolutionNameMin: Cijk_Ailk_Bljk_HB_MT64x64x16_SN_SU32_SUM3_TT4_4_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 81 + SolutionNameMin: Cijk_Ailk_Bljk_HB_MT32x32x32_SN_SU32_SUM3_TT2_2_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 4 + LSPB: 32 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 82 + SolutionNameMin: Cijk_Ailk_Bljk_HB_MT64x32x8_SN_SU0_SUM0_TT4_4_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 4 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 83 + SolutionNameMin: Cijk_Ailk_Bljk_HB_MT64x64x8_SN_SU0_SUM0_TT4_4_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 8 + LSPB: 16 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 84 + SolutionNameMin: Cijk_Ailk_Bljk_HB_MT32x16x16_SN_SU0_SUM0_TT2_2_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 4 + LSPB: 16 + LVCA: 32 + LVCB: 8 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 85 + SolutionNameMin: Cijk_Ailk_Bljk_HB_MT64x32x16_SN_SU0_SUM0_TT4_4_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 32 + LSPA: 8 + LSPB: 4 + LVCA: 8 + LVCB: 16 + LVPA: 4 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 86 + SolutionNameMin: Cijk_Ailk_Bljk_HB_MT16x16x32_SN_SU0_SUM0_TT2_2_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 87 + SolutionNameMin: Cijk_Ailk_Bljk_HB_MT32x16x32_SN_SU0_SUM0_TT2_2_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 4 + LSPB: 32 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 88 + SolutionNameMin: Cijk_Ailk_Bljk_HB_MT64x32x8_SN_SU32_SUM3_TT4_4_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 89 + SolutionNameMin: Cijk_Ailk_Bljk_HB_MT16x16x16_SN_SU32_SUM3_TT2_2_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 8 + LSPB: 16 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 90 + SolutionNameMin: Cijk_Ailk_Bljk_HB_MT32x16x16_SN_SU32_SUM3_TT2_2_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 91 + SolutionNameMin: Cijk_Ailk_Bljk_HB_MT32x16x32_SN_SU32_SUM3_TT2_2_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 8 + LSPA: 8 + LSPB: 16 + LVCA: 8 + LVCB: 4 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 819 + LdsNumElementsAlignedA: 128 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 256 + LdsOffsetB: 128 + LdsOffsetB_Blk: 384 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 92 + SolutionNameMin: Cijk_Ailk_Bljk_HB_MT16x16x8_SN_SU0_SUM0_TT2_2_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 8 + LSPA: 8 + LSPB: 16 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 896 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 93 + SolutionNameMin: Cijk_Ailk_Bljk_HB_MT32x16x8_SN_SU0_SUM0_TT2_2_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 4 + LSPB: 32 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 94 + SolutionNameMin: Cijk_Ailk_Bljk_HB_MT64x32x8_SN_SU0_SUM0_TT4_4_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 8 + LSPB: 16 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 95 + SolutionNameMin: Cijk_Ailk_Bljk_HB_MT32x16x16_SN_SU0_SUM0_TT2_2_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 4 + LSPB: 16 + LVCA: 32 + LVCB: 8 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 96 + SolutionNameMin: Cijk_Ailk_Bljk_HB_MT64x32x16_SN_SU0_SUM0_TT4_4_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 8 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 97 + SolutionNameMin: Cijk_Ailk_Bljk_HB_MT32x32x16_SN_SU0_SUM0_TT2_2_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 8 + LSPB: 32 + LVCA: 32 + LVCB: 8 + LVPA: 4 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 98 + SolutionNameMin: Cijk_Ailk_Bljk_HB_MT64x64x16_SN_SU0_SUM0_TT4_4_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 99 + SolutionNameMin: Cijk_Ailk_Bljk_HB_MT32x16x32_SN_SU0_SUM0_TT2_2_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 8 + LSPA: 8 + LSPB: 16 + LVCA: 8 + LVCB: 4 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 819 + LdsNumElementsAlignedA: 128 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 256 + LdsOffsetB: 128 + LdsOffsetB_Blk: 384 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 100 + SolutionNameMin: Cijk_Ailk_Bljk_HB_MT16x16x8_SN_SU32_SUM3_TT2_2_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 4 + LSPB: 32 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 101 + SolutionNameMin: Cijk_Ailk_Bljk_HB_MT64x32x8_SN_SU32_SUM3_TT4_4_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 4 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 102 + SolutionNameMin: Cijk_Ailk_Bljk_HB_MT64x64x8_SN_SU32_SUM3_TT4_4_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 103 + SolutionNameMin: Cijk_Ailk_Bljk_HB_MT16x16x16_SN_SU32_SUM3_TT2_2_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 8 + LSPB: 16 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 104 + SolutionNameMin: Cijk_Ailk_Bljk_HB_MT32x16x16_SN_SU32_SUM3_TT2_2_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 4 + LSPB: 16 + LVCA: 32 + LVCB: 8 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 105 + SolutionNameMin: Cijk_Ailk_Bljk_HB_MT64x32x16_SN_SU32_SUM3_TT4_4_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 8 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 106 + SolutionNameMin: Cijk_Ailk_Bljk_HB_MT32x32x16_SN_SU32_SUM3_TT2_2_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 32 + LSPA: 8 + LSPB: 4 + LVCA: 8 + LVCB: 16 + LVPA: 4 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 107 + SolutionNameMin: Cijk_Ailk_Bljk_HB_MT16x16x32_SN_SU32_SUM3_TT2_2_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 108 + SolutionNameMin: Cijk_Ailk_Bljk_HB_MT64x32x32_SN_SU32_SUM3_TT4_4_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 4 + LSPB: 16 + LVCA: 32 + LVCB: 8 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3328 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 109 + SolutionNameMin: Cijk_Ailk_Bljk_HB_MT64x16x16_SN_SU0_SUM0_TT4_2_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 2 + LSPB: 8 + LVCA: 64 + LVCB: 16 + LVPA: 1 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 12544 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 8 + MacroTileA: 128 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 16 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 110 + SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x8x32_SN_SU0_SUM0_TT4_2_WG32_4_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 32 + SubGroup1: 4 + SubGroupA: 32 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 4 + LSPB: 16 + LVCA: 32 + LVCB: 8 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3328 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 111 + SolutionNameMin: Cijk_Ailk_Bljk_HB_MT64x16x16_SN_SU32_SUM3_TT4_2_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3200 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 8 + MacroTileA: 64 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 112 + SolutionNameMin: Cijk_Ailk_Bljk_HB_MT64x8x16_SN_SU32_SUM3_TT2_2_WG32_4_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 32 + SubGroup1: 4 + SubGroupA: 32 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 6400 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 8 + MacroTileA: 64 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 8 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 113 + SolutionNameMin: Cijk_Ailk_Bljk_HB_MT64x8x32_SN_SU32_SUM3_TT2_2_WG32_4_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 32 + SubGroup1: 4 + SubGroupA: 32 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 2 + LSPB: 8 + LVCA: 64 + LVCB: 16 + LVPA: 1 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 12544 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 8 + MacroTileA: 128 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 16 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 114 + SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x8x32_SN_SU32_SUM3_TT4_2_WG32_4_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 32 + SubGroup1: 4 + SubGroupA: 32 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 4 + LSPB: 16 + LVCA: 32 + LVCB: 8 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3328 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 115 + SolutionNameMin: Cijk_Ailk_Bljk_HB_MT64x16x16_SN_SU0_SUM0_TT4_2_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 6400 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 8 + MacroTileA: 64 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 8 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 116 + SolutionNameMin: Cijk_Ailk_Bljk_HB_MT64x8x32_SN_SU0_SUM0_TT2_2_WG32_4_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 32 + SubGroup1: 4 + SubGroupA: 32 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 2 + LSPB: 8 + LVCA: 64 + LVCB: 16 + LVPA: 1 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 12544 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 8 + MacroTileA: 128 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 16 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 117 + SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x8x32_SN_SU0_SUM0_TT4_2_WG32_4_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 32 + SubGroup1: 4 + SubGroupA: 32 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 4 + LSPB: 16 + LVCA: 32 + LVCB: 8 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1664 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 118 + SolutionNameMin: Cijk_Ailk_Bljk_HB_MT64x16x8_SN_SU32_SUM3_TT4_2_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 4 + LSPB: 16 + LVCA: 32 + LVCB: 8 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3328 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 119 + SolutionNameMin: Cijk_Ailk_Bljk_HB_MT64x16x16_SN_SU32_SUM3_TT4_2_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 6400 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 8 + MacroTileA: 64 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 8 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 120 + SolutionNameMin: Cijk_Ailk_Bljk_HB_MT64x8x32_SN_SU32_SUM3_TT2_2_WG32_4_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 32 + SubGroup1: 4 + SubGroupA: 32 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 2 + LSPB: 8 + LVCA: 64 + LVCB: 16 + LVPA: 1 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 6272 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 8 + MacroTileA: 128 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 121 + SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x8x16_SN_SU0_SUM0_TT4_2_WG32_4_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 32 + SubGroup1: 4 + SubGroupA: 32 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 6400 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 8 + MacroTileA: 64 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 8 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 122 + SolutionNameMin: Cijk_Ailk_Bljk_HB_MT64x8x32_SN_SU0_SUM0_TT2_2_WG32_4_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 32 + SubGroup1: 4 + SubGroupA: 32 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 6400 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 8 + MacroTileA: 64 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 8 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 123 + SolutionNameMin: Cijk_Ailk_Bljk_HB_MT64x8x32_SN_SU32_SUM3_TT2_2_WG32_4_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 32 + SubGroup1: 4 + SubGroupA: 32 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 2 + LSPB: 8 + LVCA: 64 + LVCB: 16 + LVPA: 1 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 12544 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 8 + MacroTileA: 128 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 16 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 124 + SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x8x32_SN_SU32_SUM3_TT4_2_WG32_4_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 32 + SubGroup1: 4 + SubGroupA: 32 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 +- [2, 3, 0, 1] +- - - [2944, 4288, 1, 1280, 2944, 2944, 2944, 1280] + - [4, 24216.0] + - - [2368, 5888, 1, 256, 2368, 2368, 2368, 256] + - [4, 23424.0] + - - [512, 24000, 1, 1536, 512, 512, 512, 1536] + - [34, 24681.0] + - - [5888, 1856, 1, 3328, 5888, 5888, 5888, 3328] + - [18, 24040.0] + - - [5888, 2944, 1, 3328, 5888, 5888, 5888, 3328] + - [4, 25295.0] + - - [1856, 4288, 1, 256, 1856, 1856, 1856, 256] + - [24, 21564.0] + - - [5056, 5056, 1, 3328, 5056, 5056, 5056, 3328] + - [16, 24905.0] + - - [1408, 5888, 1, 1280, 1408, 1408, 1408, 1280] + - [21, 24308.0] + - - [6144, 6000, 1, 2560, 6144, 6144, 6144, 2560] + - [18, 25318.0] + - - [1024, 3584, 1, 3328, 1024, 1024, 1024, 3328] + - [21, 23150.0] + - - [512, 48000, 1, 2048, 512, 512, 512, 2048] + - [34, 25308.0] + - - [448, 3584, 1, 3328, 448, 448, 448, 3328] + - [7, 19464.0] + - - [5888, 1408, 1, 1280, 5888, 5888, 5888, 1280] + - [24, 24314.0] + - - [1024, 2368, 1, 256, 1024, 1024, 1024, 256] + - [3, 20447.0] + - - [5056, 6784, 1, 1280, 5056, 5056, 5056, 1280] + - [2, 25061.0] + - - [5056, 5056, 1, 1280, 5056, 5056, 5056, 1280] + - [16, 24754.0] + - - [4288, 6784, 1, 256, 4288, 4288, 4288, 256] + - [24, 24077.0] + - - [6784, 448, 1, 256, 6784, 6784, 6784, 256] + - [17, 20443.0] + - - [5056, 256, 1, 1280, 5056, 5056, 5056, 1280] + - [6, 21394.0] + - - [5888, 704, 1, 1280, 5888, 5888, 5888, 1280] + - [34, 21925.0] + - - [3584, 1024, 1, 256, 3584, 3584, 3584, 256] + - [4, 21509.0] + - - [6784, 4288, 1, 3328, 6784, 6784, 6784, 3328] + - [4, 24884.0] + - - [1856, 2368, 1, 3328, 1856, 1856, 1856, 3328] + - [30, 22136.0] + - - [5888, 2944, 1, 1280, 5888, 5888, 5888, 1280] + - [4, 25222.0] + - - [5888, 1024, 1, 256, 5888, 5888, 5888, 256] + - [24, 22941.0] + - - [1408, 2944, 1, 256, 1408, 1408, 1408, 256] + - [4, 22568.0] + - - [6784, 5056, 1, 3328, 6784, 6784, 6784, 3328] + - [18, 25150.0] + - - [5056, 5056, 1, 256, 5056, 5056, 5056, 256] + - [4, 23840.0] + - - [1024, 3584, 1, 1280, 1024, 1024, 1024, 1280] + - [26, 22809.0] + - - [2368, 2944, 1, 1280, 2368, 2368, 2368, 1280] + - [4, 23921.0] + - - [6784, 6784, 1, 1280, 6784, 6784, 6784, 1280] + - [4, 25365.0] + - - [1408, 4288, 1, 1280, 1408, 1408, 1408, 1280] + - [30, 23710.0] + - - [3584, 4288, 1, 1280, 3584, 3584, 3584, 1280] + - [4, 24636.0] + - - [512, 6000, 1, 2560, 512, 512, 512, 2560] + - [26, 22779.0] + - - [2368, 704, 1, 1280, 2368, 2368, 2368, 1280] + - [27, 19784.0] + - - [5056, 4288, 1, 3328, 5056, 5056, 5056, 3328] + - [16, 24805.0] + - - [3584, 2368, 1, 3328, 3584, 3584, 3584, 3328] + - [4, 24096.0] + - - [5888, 6784, 1, 1280, 5888, 5888, 5888, 1280] + - [16, 25567.0] + - - [6784, 448, 1, 1280, 6784, 6784, 6784, 1280] + - [17, 21298.0] + - - [2944, 5888, 1, 256, 2944, 2944, 2944, 256] + - [4, 24642.0] + - - [4288, 2944, 1, 256, 4288, 4288, 4288, 256] + - [4, 23330.0] + - - [6144, 24000, 1, 2560, 6144, 6144, 6144, 2560] + - [19, 25647.0] + - - [5056, 2368, 1, 1280, 5056, 5056, 5056, 1280] + - [4, 24075.0] + - - [448, 3584, 1, 1280, 448, 448, 448, 1280] + - [7, 18869.0] + - - [6784, 5888, 1, 256, 6784, 6784, 6784, 256] + - [4, 24971.0] + - - [1024, 1408, 1, 256, 1024, 1024, 1024, 256] + - [17, 18928.0] + - - [2368, 2368, 1, 3328, 2368, 2368, 2368, 3328] + - [10, 22555.0] + - - [5056, 704, 1, 3328, 5056, 5056, 5056, 3328] + - [33, 22271.0] + - - [1408, 1856, 1, 256, 1408, 1408, 1408, 256] + - [3, 20841.0] + - - [5888, 1856, 1, 256, 5888, 5888, 5888, 256] + - [18, 22845.0] + - - [704, 5888, 1, 256, 704, 704, 704, 256] + - [18, 19977.0] + - - [3584, 704, 1, 3328, 3584, 3584, 3584, 3328] + - [3, 21290.0] + - - [1408, 1408, 1, 256, 1408, 1408, 1408, 256] + - [3, 19356.0] + - - [448, 4288, 1, 256, 448, 448, 448, 256] + - [17, 17088.0] + - - [704, 2368, 1, 1280, 704, 704, 704, 1280] + - [7, 19364.0] + - - [1856, 2368, 1, 1280, 1856, 1856, 1856, 1280] + - [24, 21890.0] + - - [1408, 1408, 1, 3328, 1408, 1408, 1408, 3328] + - [30, 20916.0] + - - [256, 193600, 1, 64, 256, 256, 256, 64] + - [0, 21782.0] + - - [1408, 1024, 1, 1280, 1408, 1408, 1408, 1280] + - [3, 20662.0] + - - [704, 6784, 1, 256, 704, 704, 704, 256] + - [4, 19958.0] + - - [6784, 704, 1, 256, 6784, 6784, 6784, 256] + - [3, 21109.0] + - - [2048, 7000, 1, 2048, 2048, 2048, 2048, 2048] + - [24, 25083.0] + - - [5056, 704, 1, 256, 5056, 5056, 5056, 256] + - [0, 20757.0] + - - [1408, 3584, 1, 256, 1408, 1408, 1408, 256] + - [4, 22680.0] + - - [3584, 4288, 1, 3328, 3584, 3584, 3584, 3328] + - [16, 24808.0] + - - [5888, 1856, 1, 1280, 5888, 5888, 5888, 1280] + - [18, 23895.0] + - - [2368, 3584, 1, 1280, 2368, 2368, 2368, 1280] + - [30, 23850.0] + - - [2944, 3584, 1, 3328, 2944, 2944, 2944, 3328] + - [18, 24660.0] + - - [6784, 2944, 1, 256, 6784, 6784, 6784, 256] + - [24, 24598.0] + - - [1024, 1500, 1, 2560, 1024, 1024, 1024, 2560] + - [30, 21359.0] + - - [1856, 2368, 1, 256, 1856, 1856, 1856, 256] + - [17, 20531.0] + - - [3584, 6784, 1, 3328, 3584, 3584, 3584, 3328] + - [4, 25186.0] + - - [5056, 4288, 1, 1280, 5056, 5056, 5056, 1280] + - [16, 24629.0] + - - [6784, 1856, 1, 3328, 6784, 6784, 6784, 3328] + - [4, 24299.0] + - - [1408, 5056, 1, 1280, 1408, 1408, 1408, 1280] + - [30, 24284.0] + - - [196, 1024, 64, 256, 196, 196, 196, 256] + - [4, 17098.0] + - - [6784, 5888, 1, 3328, 6784, 6784, 6784, 3328] + - [18, 25490.0] + - - [2368, 5056, 1, 1280, 2368, 2368, 2368, 1280] + - [4, 24074.0] + - - [1024, 5056, 1, 1280, 1024, 1024, 1024, 1280] + - [18, 24081.0] + - - [4288, 1024, 1, 256, 4288, 4288, 4288, 256] + - [4, 21338.0] + - - [2368, 1408, 1, 256, 2368, 2368, 2368, 256] + - [3, 20381.0] + - - [5888, 448, 1, 1280, 5888, 5888, 5888, 1280] + - [17, 20834.0] + - - [704, 5888, 1, 3328, 704, 704, 704, 3328] + - [7, 22439.0] + - - [1024, 6784, 1, 1280, 1024, 1024, 1024, 1280] + - [18, 24025.0] + - - [3584, 2944, 1, 1280, 3584, 3584, 3584, 1280] + - [18, 24566.0] + - - [512, 6000, 1, 2816, 512, 512, 512, 2816] + - [7, 22901.0] + - - [512, 24000, 1, 2048, 512, 512, 512, 2048] + - [24, 24788.0] + - - [1408, 5056, 1, 3328, 1408, 1408, 1408, 3328] + - [21, 24696.0] + - - [1856, 1856, 1, 3328, 1856, 1856, 1856, 3328] + - [21, 21658.0] + - - [2368, 2368, 1, 256, 2368, 2368, 2368, 256] + - [4, 21117.0] + - - [4288, 4288, 1, 1280, 4288, 4288, 4288, 1280] + - [4, 24460.0] + - - [5888, 1024, 1, 1280, 5888, 5888, 5888, 1280] + - [4, 23944.0] + - - [1024, 12544, 1, 256, 1024, 1024, 1024, 256] + - [4, 23950.0] + - - [512, 48000, 1, 2560, 512, 512, 512, 2560] + - [10, 25351.0] + - - [704, 6784, 1, 3328, 704, 704, 704, 3328] + - [4, 22543.0] + - - [5888, 5888, 1, 3328, 5888, 5888, 5888, 3328] + - [2, 25585.0] + - - [5056, 1024, 1, 1280, 5056, 5056, 5056, 1280] + - [7, 24020.0] + - - [448, 5888, 1, 3328, 448, 448, 448, 3328] + - [21, 19784.0] + - - [1024, 2944, 1, 1280, 1024, 1024, 1024, 1280] + - [10, 22238.0] + - - [5056, 5888, 1, 1280, 5056, 5056, 5056, 1280] + - [16, 25101.0] + - - [448, 6784, 1, 256, 448, 448, 448, 256] + - [20, 17812.0] + - - [3584, 5888, 1, 256, 3584, 3584, 3584, 256] + - [4, 24587.0] + - - [2944, 3584, 1, 256, 2944, 2944, 2944, 256] + - [4, 23824.0] + - - [3072, 1500, 1, 1024, 3072, 3072, 3072, 1024] + - [18, 22791.0] + - - [6784, 1024, 1, 3328, 6784, 6784, 6784, 3328] + - [7, 24256.0] + - - [6784, 2944, 1, 3328, 6784, 6784, 6784, 3328] + - [4, 25370.0] + - - [6784, 2368, 1, 1280, 6784, 6784, 6784, 1280] + - [4, 24250.0] + - - [4288, 3584, 1, 256, 4288, 4288, 4288, 256] + - [7, 23856.0] + - - [4288, 5888, 1, 1280, 4288, 4288, 4288, 1280] + - [18, 24737.0] + - - [1024, 6000, 1, 1536, 1024, 1024, 1024, 1536] + - [24, 24182.0] + - - [4288, 1856, 1, 1280, 4288, 4288, 4288, 1280] + - [4, 23212.0] + - - [1856, 2944, 1, 3328, 1856, 1856, 1856, 3328] + - [7, 23108.0] + - - [256, 6784, 1, 3328, 256, 256, 256, 3328] + - [4, 20925.0] + - - [512, 3000, 1, 1536, 512, 512, 512, 1536] + - [34, 19720.0] + - - [5056, 1024, 1, 256, 5056, 5056, 5056, 256] + - [4, 20341.0] + - - [5056, 1856, 1, 3328, 5056, 5056, 5056, 3328] + - [4, 23993.0] + - - [4096, 7000, 1, 4096, 4096, 4096, 4096, 4096] + - [25, 25253.0] + - - [5056, 256, 1, 3328, 5056, 5056, 5056, 3328] + - [6, 21670.0] + - - [1024, 5888, 1, 1280, 1024, 1024, 1024, 1280] + - [4, 23902.0] + - - [5056, 3584, 1, 256, 5056, 5056, 5056, 256] + - [10, 23748.0] + - - [1856, 1024, 1, 1280, 1856, 1856, 1856, 1280] + - [4, 21992.0] + - - [1856, 1856, 1, 1280, 1856, 1856, 1856, 1280] + - [3, 20854.0] + - - [3072, 24000, 1, 1024, 3072, 3072, 3072, 1024] + - [35, 25317.0] + - - [1856, 1024, 1, 3328, 1856, 1856, 1856, 3328] + - [7, 22543.0] + - - [6784, 1024, 1, 256, 6784, 6784, 6784, 256] + - [7, 22280.0] + - - [5056, 5888, 1, 3328, 5056, 5056, 5056, 3328] + - [16, 25172.0] + - - [1856, 1024, 1, 256, 1856, 1856, 1856, 256] + - [6, 18457.0] + - - [512, 48000, 1, 1536, 512, 512, 512, 1536] + - [34, 25267.0] + - - [5056, 1408, 1, 3328, 5056, 5056, 5056, 3328] + - [4, 24623.0] + - - [448, 5888, 1, 256, 448, 448, 448, 256] + - [17, 17513.0] + - - [1408, 6784, 1, 3328, 1408, 1408, 1408, 3328] + - [7, 24461.0] + - - [1024, 24000, 1, 2560, 1024, 1024, 1024, 2560] + - [19, 25020.0] + - - [2944, 1408, 1, 3328, 2944, 2944, 2944, 3328] + - [4, 23836.0] + - - [2944, 4288, 1, 3328, 2944, 2944, 2944, 3328] + - [18, 24367.0] + - - [5056, 2944, 1, 256, 5056, 5056, 5056, 256] + - [34, 23478.0] + - - [2368, 1856, 1, 256, 2368, 2368, 2368, 256] + - [3, 20442.0] + - - [1408, 3584, 1, 3328, 1408, 1408, 1408, 3328] + - [7, 24051.0] + - - [2368, 6784, 1, 256, 2368, 2368, 2368, 256] + - [24, 23393.0] + - - [4288, 2368, 1, 3328, 4288, 4288, 4288, 3328] + - [4, 23655.0] + - - [704, 3584, 1, 1280, 704, 704, 704, 1280] + - [10, 20015.0] + - - [1408, 5888, 1, 3328, 1408, 1408, 1408, 3328] + - [7, 24472.0] + - - [1856, 5056, 1, 256, 1856, 1856, 1856, 256] + - [24, 21177.0] + - - [6784, 6784, 1, 256, 6784, 6784, 6784, 256] + - [34, 24887.0] + - - [2368, 4288, 1, 1280, 2368, 2368, 2368, 1280] + - [4, 23448.0] + - - [3584, 1856, 1, 1280, 3584, 3584, 3584, 1280] + - [18, 23757.0] + - - [8448, 48000, 1, 2816, 8448, 8448, 8448, 2816] + - [5, 25576.0] + - - [512, 6000, 1, 2048, 512, 512, 512, 2048] + - [37, 20787.0] + - - [3584, 448, 1, 256, 3584, 3584, 3584, 256] + - [8, 16833.0] + - - [3584, 3584, 1, 1280, 3584, 3584, 3584, 1280] + - [2, 24673.0] + - - [256, 6784, 1, 256, 256, 256, 256, 256] + - [3, 19280.0] + - - [1856, 3584, 1, 3328, 1856, 1856, 1856, 3328] + - [4, 24101.0] + - - [3584, 3584, 1, 256, 3584, 3584, 3584, 256] + - [34, 23804.0] + - - [6784, 4288, 1, 1280, 6784, 6784, 6784, 1280] + - [4, 24734.0] + - - [3584, 5056, 1, 256, 3584, 3584, 3584, 256] + - [18, 23586.0] + - - [2944, 2368, 1, 1280, 2944, 2944, 2944, 1280] + - [30, 23831.0] + - - [6784, 3584, 1, 256, 6784, 6784, 6784, 256] + - [4, 24509.0] + - - [1856, 1408, 1, 256, 1856, 1856, 1856, 256] + - [0, 19125.0] + - - [2944, 2944, 1, 3328, 2944, 2944, 2944, 3328] + - [4, 24606.0] + - - [5056, 6784, 1, 256, 5056, 5056, 5056, 256] + - [18, 24230.0] + - - [1408, 4288, 1, 3328, 1408, 1408, 1408, 3328] + - [4, 24060.0] + - - [6784, 256, 1, 1280, 6784, 6784, 6784, 1280] + - [33, 20121.0] + - - [2368, 704, 1, 3328, 2368, 2368, 2368, 3328] + - [33, 19747.0] + - - [3584, 6784, 1, 256, 3584, 3584, 3584, 256] + - [24, 24475.0] + - - [5056, 1856, 1, 256, 5056, 5056, 5056, 256] + - [18, 22343.0] + - - [1024, 3000, 1, 2816, 1024, 1024, 1024, 2816] + - [7, 22582.0] + - - [704, 4288, 1, 256, 704, 704, 704, 256] + - [3, 17405.0] + - - [1408, 6784, 1, 1280, 1408, 1408, 1408, 1280] + - [18, 24304.0] + - - [7680, 24000, 1, 2560, 7680, 7680, 7680, 2560] + - [5, 25257.0] + - - [4608, 48000, 1, 1536, 4608, 4608, 4608, 1536] + - [25, 25627.0] + - - [1024, 24000, 1, 1536, 1024, 1024, 1024, 1536] + - [34, 24910.0] + - - [5056, 2368, 1, 3328, 5056, 5056, 5056, 3328] + - [18, 24212.0] + - - [2944, 4288, 1, 256, 2944, 2944, 2944, 256] + - [18, 23273.0] + - - [1408, 3584, 1, 1280, 1408, 1408, 1408, 1280] + - [21, 23765.0] + - - [1024, 1500, 1, 2816, 1024, 1024, 1024, 2816] + - [4, 21209.0] + - - [1024, 6000, 1, 2048, 1024, 1024, 1024, 2048] + - [24, 23946.0] + - - [512, 24000, 1, 2560, 512, 512, 512, 2560] + - [34, 24778.0] + - - [6144, 3000, 1, 2560, 6144, 6144, 6144, 2560] + - [2, 24613.0] + - - [2368, 6784, 1, 3328, 2368, 2368, 2368, 3328] + - [4, 24311.0] + - - [5056, 704, 1, 1280, 5056, 5056, 5056, 1280] + - [3, 21336.0] + - - [1856, 4288, 1, 3328, 1856, 1856, 1856, 3328] + - [37, 23432.0] + - - [1408, 5888, 1, 256, 1408, 1408, 1408, 256] + - [4, 23379.0] + - - [704, 2944, 1, 1280, 704, 704, 704, 1280] + - [4, 20919.0] + - - [3584, 704, 1, 1280, 3584, 3584, 3584, 1280] + - [29, 20498.0] + - - [5888, 5056, 1, 256, 5888, 5888, 5888, 256] + - [10, 24040.0] + - - [3584, 448, 1, 3328, 3584, 3584, 3584, 3328] + - [17, 20973.0] + - - [704, 2368, 1, 3328, 704, 704, 704, 3328] + - [7, 20029.0] + - - [448, 5056, 1, 3328, 448, 448, 448, 3328] + - [33, 20556.0] + - - [4288, 448, 1, 256, 4288, 4288, 4288, 256] + - [0, 17365.0] + - - [5888, 2368, 1, 256, 5888, 5888, 5888, 256] + - [24, 23208.0] + - - [6784, 704, 1, 3328, 6784, 6784, 6784, 3328] + - [7, 22531.0] + - - [1408, 2944, 1, 3328, 1408, 1408, 1408, 3328] + - [4, 23976.0] + - - [4288, 4288, 1, 256, 4288, 4288, 4288, 256] + - [10, 23507.0] + - - [2368, 704, 1, 256, 2368, 2368, 2368, 256] + - [14, 17871.0] + - - [3584, 2368, 1, 256, 3584, 3584, 3584, 256] + - [30, 22325.0] + - - [5888, 5056, 1, 1280, 5888, 5888, 5888, 1280] + - [16, 25045.0] + - - [8448, 24000, 1, 2816, 8448, 8448, 8448, 2816] + - [26, 25349.0] + - - [3584, 3584, 1, 3328, 3584, 3584, 3584, 3328] + - [2, 24780.0] + - - [3072, 1500, 1, 128, 3072, 3072, 3072, 128] + - [27, 13647.0] + - - [2048, 3136, 1, 512, 2048, 2048, 2048, 512] + - [4, 22213.0] + - - [5888, 6784, 1, 256, 5888, 5888, 5888, 256] + - [24, 24921.0] + - - [4288, 2944, 1, 3328, 4288, 4288, 4288, 3328] + - [2, 24353.0] + - - [256, 5056, 1, 1280, 256, 256, 256, 1280] + - [3, 20043.0] + - - [6784, 5888, 1, 1280, 6784, 6784, 6784, 1280] + - [4, 25343.0] + - - [5888, 4288, 1, 1280, 5888, 5888, 5888, 1280] + - [4, 24641.0] + - - [1024, 24000, 1, 2048, 1024, 1024, 1024, 2048] + - [24, 24871.0] + - - [1408, 1856, 1, 1280, 1408, 1408, 1408, 1280] + - [3, 21368.0] + - - [5888, 448, 1, 3328, 5888, 5888, 5888, 3328] + - [3, 20839.0] + - - [704, 5888, 1, 1280, 704, 704, 704, 1280] + - [7, 21847.0] + - - [5056, 2944, 1, 3328, 5056, 5056, 5056, 3328] + - [16, 24979.0] + - - [448, 4288, 1, 1280, 448, 448, 448, 1280] + - [4, 18868.0] + - - [3584, 704, 1, 256, 3584, 3584, 3584, 256] + - [3, 18465.0] + - - [3584, 1408, 1, 3328, 3584, 3584, 3584, 3328] + - [4, 23698.0] + - - [2368, 1024, 1, 1280, 2368, 2368, 2368, 1280] + - [21, 21297.0] + - - [2944, 6784, 1, 1280, 2944, 2944, 2944, 1280] + - [4, 25179.0] + - - [1856, 6784, 1, 256, 1856, 1856, 1856, 256] + - [10, 22978.0] + - - [4288, 448, 1, 3328, 4288, 4288, 4288, 3328] + - [3, 21069.0] + - - [6784, 704, 1, 1280, 6784, 6784, 6784, 1280] + - [4, 21827.0] + - - [5888, 1024, 1, 3328, 5888, 5888, 5888, 3328] + - [4, 23929.0] + - - [704, 6784, 1, 1280, 704, 704, 704, 1280] + - [5, 20918.0] + - - [512, 3000, 1, 2048, 512, 512, 512, 2048] + - [10, 20324.0] + - - [5056, 1024, 1, 3328, 5056, 5056, 5056, 3328] + - [21, 24217.0] + - - [704, 5056, 1, 1280, 704, 704, 704, 1280] + - [7, 21211.0] + - - [2944, 1856, 1, 256, 2944, 2944, 2944, 256] + - [17, 20990.0] + - - [5888, 5056, 1, 3328, 5888, 5888, 5888, 3328] + - [5, 25175.0] + - - [3584, 6784, 1, 1280, 3584, 3584, 3584, 1280] + - [4, 25018.0] + - - [1856, 5888, 1, 256, 1856, 1856, 1856, 256] + - [10, 22897.0] + - - [4288, 4288, 1, 3328, 4288, 4288, 4288, 3328] + - [16, 24542.0] + - - [4288, 1408, 1, 1280, 4288, 4288, 4288, 1280] + - [34, 23450.0] + - - [4288, 2368, 1, 256, 4288, 4288, 4288, 256] + - [10, 22390.0] + - - [2944, 5056, 1, 1280, 2944, 2944, 2944, 1280] + - [4, 24782.0] + - - [6784, 2368, 1, 3328, 6784, 6784, 6784, 3328] + - [18, 24305.0] + - - [4288, 1856, 1, 3328, 4288, 4288, 4288, 3328] + - [4, 23410.0] + - - [1856, 2944, 1, 1280, 1856, 1856, 1856, 1280] + - [30, 22639.0] + - - [4288, 6784, 1, 3328, 4288, 4288, 4288, 3328] + - [18, 24795.0] + - - [3584, 1024, 1, 1280, 3584, 3584, 3584, 1280] + - [33, 22400.0] + - - [1024, 4288, 1, 256, 1024, 1024, 1024, 256] + - [9, 20008.0] + - - [5888, 3584, 1, 3328, 5888, 5888, 5888, 3328] + - [4, 25136.0] + - - [5056, 3584, 1, 3328, 5056, 5056, 5056, 3328] + - [16, 25025.0] + - - [2368, 1408, 1, 1280, 2368, 2368, 2368, 1280] + - [4, 21502.0] + - - [5056, 2944, 1, 1280, 5056, 5056, 5056, 1280] + - [34, 24756.0] + - - [8448, 6000, 1, 2816, 8448, 8448, 8448, 2816] + - [19, 25395.0] + - - [1024, 6784, 1, 256, 1024, 1024, 1024, 256] + - [4, 21918.0] + - - [2944, 5056, 1, 3328, 2944, 2944, 2944, 3328] + - [21, 24969.0] + - - [3584, 2944, 1, 256, 3584, 3584, 3584, 256] + - [24, 22922.0] + - - [5056, 6784, 1, 3328, 5056, 5056, 5056, 3328] + - [19, 25211.0] + - - [3584, 4288, 1, 256, 3584, 3584, 3584, 256] + - [18, 22874.0] + - - [1856, 6784, 1, 3328, 1856, 1856, 1856, 3328] + - [30, 24212.0] + - - [512, 6000, 1, 1536, 512, 512, 512, 1536] + - [24, 21556.0] + - - [5056, 1408, 1, 1280, 5056, 5056, 5056, 1280] + - [4, 24328.0] + - - [5888, 5888, 1, 256, 5888, 5888, 5888, 256] + - [34, 24851.0] + - - [4288, 1024, 1, 1280, 4288, 4288, 4288, 1280] + - [7, 23405.0] + - - [448, 6784, 1, 3328, 448, 448, 448, 3328] + - [4, 20454.0] + - - [2944, 1408, 1, 1280, 2944, 2944, 2944, 1280] + - [4, 23255.0] + - - [3072, 6000, 1, 1024, 3072, 3072, 3072, 1024] + - [24, 24780.0] + - - [2944, 1856, 1, 3328, 2944, 2944, 2944, 3328] + - [4, 23073.0] + - - [448, 5056, 1, 256, 448, 448, 448, 256] + - [17, 16197.0] + - - [3584, 5888, 1, 1280, 3584, 3584, 3584, 1280] + - [4, 25078.0] + - - [6784, 1856, 1, 1280, 6784, 6784, 6784, 1280] + - [4, 24104.0] + - - [5888, 256, 1, 3328, 5888, 5888, 5888, 3328] + - [18, 21276.0] + - - [1856, 5888, 1, 3328, 1856, 1856, 1856, 3328] + - [4, 24009.0] + - - [3584, 1408, 1, 256, 3584, 3584, 3584, 256] + - [18, 21199.0] + - - [704, 3584, 1, 3328, 704, 704, 704, 3328] + - [34, 20324.0] + - - [5056, 448, 1, 1280, 5056, 5056, 5056, 1280] + - [9, 20560.0] + - - [3584, 1856, 1, 3328, 3584, 3584, 3584, 3328] + - [24, 24035.0] + - - [1024, 3000, 1, 2048, 1024, 1024, 1024, 2048] + - [24, 22109.0] + - - [2944, 1024, 1, 256, 2944, 2944, 2944, 256] + - [18, 19141.0] + - - [2368, 4288, 1, 3328, 2368, 2368, 2368, 3328] + - [18, 23656.0] + - - [1024, 1408, 1, 1280, 1024, 1024, 1024, 1280] + - [17, 20130.0] + - - [6784, 5056, 1, 256, 6784, 6784, 6784, 256] + - [4, 24383.0] + - - [4288, 5888, 1, 256, 4288, 4288, 4288, 256] + - [4, 24085.0] + - - [2944, 6784, 1, 256, 2944, 2944, 2944, 256] + - [4, 24724.0] + - - [2368, 2368, 1, 1280, 2368, 2368, 2368, 1280] + - [4, 22261.0] + - - [1856, 3584, 1, 1280, 1856, 1856, 1856, 1280] + - [24, 23625.0] + - - [3584, 1408, 1, 1280, 3584, 3584, 3584, 1280] + - [4, 23646.0] + - - [5056, 3584, 1, 1280, 5056, 5056, 5056, 1280] + - [4, 24881.0] + - - [256, 5888, 1, 256, 256, 256, 256, 256] + - [12, 18393.0] + - - [1856, 1408, 1, 3328, 1856, 1856, 1856, 3328] + - [18, 21067.0] + - - [1024, 4288, 1, 3328, 1024, 1024, 1024, 3328] + - [4, 23555.0] + - - [2944, 2368, 1, 3328, 2944, 2944, 2944, 3328] + - [4, 24131.0] + - - [704, 4288, 1, 3328, 704, 704, 704, 3328] + - [4, 20410.0] + - - [1024, 48000, 1, 2816, 1024, 1024, 1024, 2816] + - [5, 25579.0] + - - [1024, 1856, 1, 1280, 1024, 1024, 1024, 1280] + - [18, 21092.0] + - - [6784, 1856, 1, 256, 6784, 6784, 6784, 256] + - [24, 22922.0] + - - [512, 48000, 1, 2816, 512, 512, 512, 2816] + - [18, 25383.0] + - - [512, 3000, 1, 2816, 512, 512, 512, 2816] + - [30, 20912.0] + - - [1024, 5888, 1, 256, 1024, 1024, 1024, 256] + - [4, 21654.0] + - - [1408, 2368, 1, 256, 1408, 1408, 1408, 256] + - [3, 19224.0] + - - [2944, 704, 1, 3328, 2944, 2944, 2944, 3328] + - [4, 21337.0] + - - [2944, 2944, 1, 1280, 2944, 2944, 2944, 1280] + - [4, 24508.0] + - - [6784, 256, 1, 3328, 6784, 6784, 6784, 3328] + - [7, 21288.0] + - - [1408, 5056, 1, 256, 1408, 1408, 1408, 256] + - [18, 21826.0] + - - [512, 50176, 1, 128, 512, 512, 512, 128] + - [4, 23223.0] + - - [1408, 4288, 1, 256, 1408, 1408, 1408, 256] + - [17, 21575.0] + - - [5888, 2368, 1, 1280, 5888, 5888, 5888, 1280] + - [30, 24301.0] + - - [2368, 5888, 1, 1280, 2368, 2368, 2368, 1280] + - [7, 24330.0] + - - [5888, 256, 1, 1280, 5888, 5888, 5888, 1280] + - [10, 20953.0] + - - [2368, 1856, 1, 3328, 2368, 2368, 2368, 3328] + - [4, 21927.0] + - - [2944, 704, 1, 256, 2944, 2944, 2944, 256] + - [3, 18196.0] + - - [2368, 6784, 1, 1280, 2368, 2368, 2368, 1280] + - [18, 24215.0] + - - [2368, 1024, 1, 3328, 2368, 2368, 2368, 3328] + - [7, 22365.0] + - - [1856, 4288, 1, 1280, 1856, 1856, 1856, 1280] + - [18, 23167.0] + - - [704, 3584, 1, 256, 704, 704, 704, 256] + - [3, 18540.0] + - - [704, 2944, 1, 3328, 704, 704, 704, 3328] + - [4, 21441.0] + - - [1856, 5056, 1, 3328, 1856, 1856, 1856, 3328] + - [33, 24002.0] + - - [196, 256, 64, 1024, 196, 196, 196, 1024] + - [36, 16148.0] + - - [3584, 5056, 1, 1280, 3584, 3584, 3584, 1280] + - [16, 24904.0] + - - [2944, 1024, 1, 3328, 2944, 2944, 2944, 3328] + - [7, 22723.0] + - - [1408, 6784, 1, 256, 1408, 1408, 1408, 256] + - [10, 23422.0] + - - [6784, 1408, 1, 3328, 6784, 6784, 6784, 3328] + - [4, 24496.0] + - - [1024, 2368, 1, 1280, 1024, 1024, 1024, 1280] + - [18, 21034.0] + - - [6784, 2944, 1, 1280, 6784, 6784, 6784, 1280] + - [18, 25265.0] + - - [3584, 448, 1, 1280, 3584, 3584, 3584, 1280] + - [3, 20643.0] + - - [2944, 6784, 1, 3328, 2944, 2944, 2944, 3328] + - [4, 25309.0] + - - [448, 5056, 1, 1280, 448, 448, 448, 1280] + - [18, 19619.0] + - - [4288, 5056, 1, 1280, 4288, 4288, 4288, 1280] + - [16, 24592.0] + - - [4288, 704, 1, 256, 4288, 4288, 4288, 256] + - [0, 20157.0] + - - [5888, 704, 1, 256, 5888, 5888, 5888, 256] + - [3, 20840.0] + - - [256, 5888, 1, 3328, 256, 256, 256, 3328] + - [4, 21258.0] + - - [6784, 4288, 1, 256, 6784, 6784, 6784, 256] + - [18, 24132.0] + - - [5888, 256, 1, 256, 5888, 5888, 5888, 256] + - [32, 18714.0] + - - [6784, 1024, 1, 1280, 6784, 6784, 6784, 1280] + - [7, 24018.0] + - - [2944, 704, 1, 1280, 2944, 2944, 2944, 1280] + - [17, 20998.0] + - - [6784, 3584, 1, 1280, 6784, 6784, 6784, 1280] + - [18, 25089.0] + - - [1408, 2944, 1, 1280, 1408, 1408, 1408, 1280] + - [4, 23718.0] + - - [1408, 2368, 1, 3328, 1408, 1408, 1408, 3328] + - [4, 22577.0] + - - [2368, 2944, 1, 256, 2368, 2368, 2368, 256] + - [24, 21343.0] + - - [3584, 1856, 1, 256, 3584, 3584, 3584, 256] + - [10, 21534.0] + - - [4288, 3584, 1, 1280, 4288, 4288, 4288, 1280] + - [21, 24601.0] + - - [4288, 2944, 1, 1280, 4288, 4288, 4288, 1280] + - [18, 24165.0] + - - [5056, 448, 1, 3328, 5056, 5056, 5056, 3328] + - [3, 21607.0] + - - [5124, 1500, 1, 2048, 5124, 5124, 5124, 2048] + - [24, 23311.0] + - - [4288, 5056, 1, 3328, 4288, 4288, 4288, 3328] + - [16, 24741.0] + - - [256, 5056, 1, 3328, 256, 256, 256, 3328] + - [6, 21157.0] + - - [5056, 2368, 1, 256, 5056, 5056, 5056, 256] + - [24, 22890.0] + - - [4288, 704, 1, 3328, 4288, 4288, 4288, 3328] + - [3, 21394.0] + - - [448, 3584, 1, 256, 448, 448, 448, 256] + - [17, 14947.0] + - - [6144, 1500, 1, 2560, 6144, 6144, 6144, 2560] + - [24, 24057.0] + - - [1024, 1408, 1, 3328, 1024, 1024, 1024, 3328] + - [3, 20718.0] + - - [2944, 5888, 1, 1280, 2944, 2944, 2944, 1280] + - [34, 25168.0] + - - [5888, 3584, 1, 256, 5888, 5888, 5888, 256] + - [24, 24478.0] + - - [1408, 1856, 1, 3328, 1408, 1408, 1408, 3328] + - [3, 21892.0] + - - [7680, 6000, 1, 2560, 7680, 7680, 7680, 2560] + - [18, 25368.0] + - - [6784, 1408, 1, 1280, 6784, 6784, 6784, 1280] + - [4, 24217.0] + - - [512, 3000, 1, 2560, 512, 512, 512, 2560] + - [34, 21057.0] + - - [704, 2944, 1, 256, 704, 704, 704, 256] + - [20, 17865.0] + - - [2944, 5888, 1, 3328, 2944, 2944, 2944, 3328] + - [18, 25251.0] + - - [1024, 1500, 1, 1536, 1024, 1024, 1024, 1536] + - [36, 19269.0] + - - [1408, 1408, 1, 1280, 1408, 1408, 1408, 1280] + - [29, 19974.0] + - - [3072, 3000, 1, 1024, 3072, 3072, 3072, 1024] + - [24, 23995.0] + - - [448, 4288, 1, 3328, 448, 448, 448, 3328] + - [4, 19719.0] + - - [704, 2368, 1, 256, 704, 704, 704, 256] + - [20, 15396.0] + - - [5888, 2368, 1, 3328, 5888, 5888, 5888, 3328] + - [16, 24462.0] + - - [5124, 9124, 1, 1760, 5124, 5124, 5124, 1760] + - [4, 24517.0] + - - [4288, 5056, 1, 256, 4288, 4288, 4288, 256] + - [18, 23418.0] + - - [4288, 448, 1, 1280, 4288, 4288, 4288, 1280] + - [3, 20688.0] + - - [5888, 704, 1, 3328, 5888, 5888, 5888, 3328] + - [10, 22223.0] + - - [4288, 3584, 1, 3328, 4288, 4288, 4288, 3328] + - [21, 24740.0] + - - [1024, 6784, 1, 3328, 1024, 1024, 1024, 3328] + - [4, 24043.0] + - - [512, 3136, 1, 2048, 512, 512, 512, 2048] + - [9, 20191.0] + - - [1408, 1024, 1, 256, 1408, 1408, 1408, 256] + - [14, 16978.0] + - - [8448, 1500, 1, 2816, 8448, 8448, 8448, 2816] + - [16, 24355.0] + - - [2560, 7000, 1, 2560, 2560, 2560, 2560, 2560] + - [24, 25120.0] + - - [6784, 6784, 1, 3328, 6784, 6784, 6784, 3328] + - [4, 25387.0] + - - [704, 5056, 1, 3328, 704, 704, 704, 3328] + - [7, 22037.0] + - - [3584, 5056, 1, 3328, 3584, 3584, 3584, 3328] + - [28, 25045.0] + - - [2368, 2944, 1, 3328, 2368, 2368, 2368, 3328] + - [4, 24091.0] + - - [2368, 3584, 1, 256, 2368, 2368, 2368, 256] + - [4, 22519.0] + - - [4608, 3000, 1, 1536, 4608, 4608, 4608, 1536] + - [4, 24190.0] + - - [5124, 9124, 1, 4096, 5124, 5124, 5124, 4096] + - [4, 24502.0] + - - [7680, 48000, 1, 2560, 7680, 7680, 7680, 2560] + - [35, 25499.0] + - - [4608, 1500, 1, 1536, 4608, 4608, 4608, 1536] + - [18, 23552.0] + - - [3584, 2368, 1, 1280, 3584, 3584, 3584, 1280] + - [4, 23802.0] + - - [5124, 9124, 1, 2560, 5124, 5124, 5124, 2560] + - [4, 24519.0] + - - [1856, 1856, 1, 256, 1856, 1856, 1856, 256] + - [9, 19104.0] + - - [4288, 1408, 1, 3328, 4288, 4288, 4288, 3328] + - [4, 24099.0] + - - [5124, 9124, 1, 2048, 5124, 5124, 5124, 2048] + - [24, 24466.0] + - - [5124, 700, 1, 2048, 5124, 5124, 5124, 2048] + - [9, 20764.0] + - - [256, 12544, 1, 1024, 256, 256, 256, 1024] + - [26, 23154.0] + - - [5888, 1408, 1, 3328, 5888, 5888, 5888, 3328] + - [16, 24430.0] + - - [256, 5056, 1, 256, 256, 256, 256, 256] + - [23, 16116.0] + - - [2368, 5056, 1, 256, 2368, 2368, 2368, 256] + - [34, 22643.0] + - - [1024, 6000, 1, 2560, 1024, 1024, 1024, 2560] + - [10, 24312.0] + - - [1024, 5056, 1, 256, 1024, 1024, 1024, 256] + - [18, 21678.0] + - - [4224, 1500, 1, 176, 4224, 4224, 4224, 176] + - [1, 21056.0] + - - [2368, 1408, 1, 3328, 2368, 2368, 2368, 3328] + - [4, 22605.0] + - - [1024, 48000, 1, 1536, 1024, 1024, 1024, 1536] + - [25, 25454.0] + - - [5888, 448, 1, 256, 5888, 5888, 5888, 256] + - [14, 19394.0] + - - [6784, 5056, 1, 1280, 6784, 6784, 6784, 1280] + - [4, 25011.0] + - - [1024, 48000, 1, 2560, 1024, 1024, 1024, 2560] + - [31, 25547.0] + - - [4288, 6784, 1, 1280, 4288, 4288, 4288, 1280] + - [30, 24688.0] + - - [3072, 48000, 1, 1024, 3072, 3072, 3072, 1024] + - [11, 25542.0] + - - [6784, 1408, 1, 256, 6784, 6784, 6784, 256] + - [10, 22798.0] + - - [5888, 4288, 1, 256, 5888, 5888, 5888, 256] + - [4, 23877.0] + - - [5056, 5888, 1, 256, 5056, 5056, 5056, 256] + - [7, 24162.0] + - - [2368, 1024, 1, 256, 2368, 2368, 2368, 256] + - [6, 19007.0] + - - [1856, 6784, 1, 1280, 1856, 1856, 1856, 1280] + - [4, 24033.0] + - - [8448, 3000, 1, 2816, 8448, 8448, 8448, 2816] + - [2, 24635.0] + - - [6784, 448, 1, 3328, 6784, 6784, 6784, 3328] + - [3, 21508.0] + - - [5056, 1856, 1, 1280, 5056, 5056, 5056, 1280] + - [18, 23764.0] + - - [1408, 1024, 1, 3328, 1408, 1408, 1408, 3328] + - [3, 21104.0] + - - [7680, 1500, 1, 2560, 7680, 7680, 7680, 2560] + - [4, 24514.0] + - - [5888, 3584, 1, 1280, 5888, 5888, 5888, 1280] + - [24, 25076.0] + - - [1024, 2944, 1, 256, 1024, 1024, 1024, 256] + - [4, 20471.0] + - - [448, 6784, 1, 1280, 448, 448, 448, 1280] + - [34, 19443.0] + - - [704, 5056, 1, 256, 704, 704, 704, 256] + - [32, 18268.0] + - - [3584, 1024, 1, 3328, 3584, 3584, 3584, 3328] + - [7, 23160.0] + - - [2944, 1856, 1, 1280, 2944, 2944, 2944, 1280] + - [4, 22761.0] + - - [5056, 256, 1, 256, 5056, 5056, 5056, 256] + - [3, 18266.0] + - - [2368, 3584, 1, 3328, 2368, 2368, 2368, 3328] + - [4, 24068.0] + - - [3584, 5888, 1, 3328, 3584, 3584, 3584, 3328] + - [4, 25123.0] + - - [2944, 3584, 1, 1280, 2944, 2944, 2944, 1280] + - [24, 24402.0] + - - [1856, 5888, 1, 1280, 1856, 1856, 1856, 1280] + - [4, 23805.0] + - - [4608, 24000, 1, 1536, 4608, 4608, 4608, 1536] + - [5, 25439.0] + - - [4288, 1408, 1, 256, 4288, 4288, 4288, 256] + - [10, 21034.0] + - - [4288, 2368, 1, 1280, 4288, 4288, 4288, 1280] + - [10, 23461.0] + - - [2944, 5056, 1, 256, 2944, 2944, 2944, 256] + - [18, 23369.0] + - - [6784, 2368, 1, 256, 6784, 6784, 6784, 256] + - [24, 23245.0] + - - [1024, 24000, 1, 2816, 1024, 1024, 1024, 2816] + - [19, 25126.0] + - - [4288, 1856, 1, 256, 4288, 4288, 4288, 256] + - [4, 21383.0] + - - [1856, 2944, 1, 256, 1856, 1856, 1856, 256] + - [4, 21200.0] + - - [4608, 6000, 1, 1536, 4608, 4608, 4608, 1536] + - [34, 25146.0] + - - [7680, 3000, 1, 2560, 7680, 7680, 7680, 2560] + - [2, 24802.0] + - - [5124, 700, 1, 2560, 5124, 5124, 5124, 2560] + - [17, 21020.0] + - - [1856, 1408, 1, 1280, 1856, 1856, 1856, 1280] + - [4, 21059.0] + - - [1024, 4288, 1, 1280, 1024, 1024, 1024, 1280] + - [18, 22696.0] + - - [2368, 5056, 1, 3328, 2368, 2368, 2368, 3328] + - [18, 24240.0] + - - [4288, 1024, 1, 3328, 4288, 4288, 4288, 3328] + - [7, 23783.0] + - - [6144, 48000, 1, 2560, 6144, 6144, 6144, 2560] + - [5, 25432.0] + - - [1024, 5056, 1, 3328, 1024, 1024, 1024, 3328] + - [7, 24181.0] + - - [1024, 1856, 1, 3328, 1024, 1024, 1024, 3328] + - [24, 22098.0] + - - [5124, 1500, 1, 2560, 5124, 5124, 5124, 2560] + - [4, 23443.0] + - - [3584, 2944, 1, 3328, 3584, 3584, 3584, 3328] + - [4, 24628.0] + - - [5888, 2944, 1, 256, 5888, 5888, 5888, 256] + - [34, 24409.0] + - - [5056, 4288, 1, 256, 5056, 5056, 5056, 256] + - [30, 23700.0] + - - [1024, 3584, 1, 256, 1024, 1024, 1024, 256] + - [34, 21151.0] + - - [5056, 1408, 1, 256, 5056, 5056, 5056, 256] + - [30, 22160.0] + - - [5888, 5888, 1, 1280, 5888, 5888, 5888, 1280] + - [16, 25434.0] + - - [448, 5888, 1, 1280, 448, 448, 448, 1280] + - [18, 18954.0] + - - [1024, 3000, 1, 2560, 1024, 1024, 1024, 2560] + - [24, 22209.0] + - - [4288, 704, 1, 1280, 4288, 4288, 4288, 1280] + - [17, 20490.0] + - - [2944, 1408, 1, 256, 2944, 2944, 2944, 256] + - [4, 21030.0] + - - [2368, 5888, 1, 3328, 2368, 2368, 2368, 3328] + - [7, 24443.0] + - - [2368, 1856, 1, 1280, 2368, 2368, 2368, 1280] + - [24, 21317.0] + - - [1024, 6000, 1, 2816, 1024, 1024, 1024, 2816] + - [24, 24438.0] + - - [5888, 4288, 1, 3328, 5888, 5888, 5888, 3328] + - [4, 24778.0] + - - [5056, 448, 1, 256, 5056, 5056, 5056, 256] + - [14, 18408.0] + - - [1856, 5056, 1, 1280, 1856, 1856, 1856, 1280] + - [30, 23724.0] + - - [2944, 1024, 1, 1280, 2944, 2944, 2944, 1280] + - [7, 22261.0] + - - [2368, 4288, 1, 256, 2368, 2368, 2368, 256] + - [18, 21723.0] + - - [1024, 2368, 1, 3328, 1024, 1024, 1024, 3328] + - [21, 22024.0] + - - [4288, 5888, 1, 3328, 4288, 4288, 4288, 3328] + - [4, 24774.0] + - - [1024, 2944, 1, 3328, 1024, 1024, 1024, 3328] + - [4, 22308.0] + - - [256, 6784, 1, 1280, 256, 256, 256, 1280] + - [3, 19966.0] + - - [1856, 3584, 1, 256, 1856, 1856, 1856, 256] + - [37, 21748.0] + - - [1024, 1500, 1, 2048, 1024, 1024, 1024, 2048] + - [10, 20701.0] + - - [512, 24000, 1, 2816, 512, 512, 512, 2816] + - [18, 24759.0] + - - [256, 5888, 1, 1280, 256, 256, 256, 1280] + - [4, 20165.0] + - - [2944, 2368, 1, 256, 2944, 2944, 2944, 256] + - [34, 22006.0] + - - [1024, 1856, 1, 256, 1024, 1024, 1024, 256] + - [3, 19587.0] + - - [6784, 3584, 1, 3328, 6784, 6784, 6784, 3328] + - [18, 25133.0] + - - [1760, 7000, 1, 1760, 1760, 1760, 1760, 1760] + - [18, 24174.0] + - - [1024, 5888, 1, 3328, 1024, 1024, 1024, 3328] + - [4, 24169.0] + - - [1408, 2368, 1, 1280, 1408, 1408, 1408, 1280] + - [18, 21675.0] + - - [2944, 2944, 1, 256, 2944, 2944, 2944, 256] + - [4, 22766.0] + - - [6784, 256, 1, 256, 6784, 6784, 6784, 256] + - [3, 17246.0] + - - [1024, 3000, 1, 1536, 1024, 1024, 1024, 1536] + - [24, 21849.0] + - - [5888, 1408, 1, 256, 5888, 5888, 5888, 256] + - [30, 22520.0] + - - [5888, 6784, 1, 3328, 5888, 5888, 5888, 3328] + - [19, 25577.0] + - - [704, 4288, 1, 1280, 704, 704, 704, 1280] + - [4, 19702.0] + - - [128, 50176, 1, 512, 128, 128, 128, 512] + - [24, 22772.0] + - - [1024, 48000, 1, 2048, 1024, 1024, 1024, 2048] + - [25, 25281.0] + - - [784, 512, 64, 128, 784, 784, 784, 128] + - [15, 20452.0] + - - [3136, 256, 64, 64, 3136, 3136, 3136, 64] + - [0, 20097.0] + - - [12544, 1024, 1, 256, 12544, 12544, 12544, 256] + - [15, 23664.0] + - - [784, 128, 128, 512, 784, 784, 784, 512] + - [22, 19092.0] + - - [784, 512, 256, 128, 784, 784, 784, 128] + - [4, 21188.0] + - - [3136, 512, 1, 2048, 3136, 3136, 3136, 2048] + - [4, 21600.0] + - - [12544, 256, 1, 1024, 12544, 12544, 12544, 1024] + - [13, 23381.0] + - - [3136, 2048, 1, 512, 3136, 3136, 3136, 512] + - [13, 22703.0] + - - [3136, 256, 256, 64, 3136, 3136, 3136, 64] + - [38, 13356.0] + - - [784, 128, 64, 512, 784, 784, 784, 512] + - [10, 20364.0] + - - [784, 512, 128, 128, 784, 784, 784, 128] + - [4, 20913.0] + - - [784, 128, 256, 512, 784, 784, 784, 512] + - [18, 19816.0] + - - [3136, 256, 128, 64, 3136, 3136, 3136, 64] + - [0, 19339.0] + - - [1024, 1024, 1, 3328, 1024, 1024, 1024, 3328] + - [42, 19157.0] + - - [128, 6784, 1, 3328, 128, 128, 128, 3328] + - [56, 19070.0] + - - [256, 4288, 1, 3328, 256, 256, 256, 3328] + - [50, 19885.0] + - - [704, 1856, 1, 3328, 704, 704, 704, 3328] + - [50, 18684.0] + - - [448, 1024, 1, 1280, 448, 448, 448, 1280] + - [45, 14732.0] + - - [1024, 704, 1, 256, 1024, 1024, 1024, 256] + - [48, 12606.0] + - - [256, 1856, 1, 1280, 256, 256, 256, 1280] + - [62, 17219.0] + - - [256, 2944, 1, 3328, 256, 256, 256, 3328] + - [55, 18668.0] + - - [128, 3584, 1, 1280, 128, 128, 128, 1280] + - [64, 15469.0] + - - [4288, 256, 1, 256, 4288, 4288, 4288, 256] + - [41, 15859.0] + - - [5888, 64, 1, 3328, 5888, 5888, 5888, 3328] + - [46, 14880.0] + - - [2944, 256, 1, 3328, 2944, 2944, 2944, 3328] + - [42, 18890.0] + - - [1408, 448, 1, 1280, 1408, 1408, 1408, 1280] + - [50, 18144.0] + - - [1408, 256, 1, 1280, 1408, 1408, 1408, 1280] + - [40, 12627.0] + - - [3072, 128, 1, 1024, 3072, 3072, 3072, 1024] + - [40, 14168.0] + - - [6784, 64, 1, 256, 6784, 6784, 6784, 256] + - [46, 10812.0] + - - [2368, 128, 1, 3328, 2368, 2368, 2368, 3328] + - [56, 15644.0] + - - [2944, 128, 1, 256, 2944, 2944, 2944, 256] + - [59, 9608.0] + - - [448, 1408, 1, 256, 448, 448, 448, 256] + - [54, 12796.0] + - - [64, 5056, 1, 3328, 64, 64, 64, 3328] + - [45, 15036.0] + - - [512, 1500, 1, 2816, 512, 512, 512, 2816] + - [59, 18378.0] + - - [256, 3584, 1, 3328, 256, 256, 256, 3328] + - [51, 20044.0] + - - [256, 1408, 1, 256, 256, 256, 256, 256] + - [57, 9943.0] + - - [5056, 64, 1, 1280, 5056, 5056, 5056, 1280] + - [45, 14523.0] + - - [2368, 128, 1, 256, 2368, 2368, 2368, 256] + - [44, 6576.0] + - - [4288, 128, 1, 1280, 4288, 4288, 4288, 1280] + - [51, 14772.0] + - - [5888, 64, 1, 256, 5888, 5888, 5888, 256] + - [45, 9205.0] + - - [1856, 256, 1, 1280, 1856, 1856, 1856, 1280] + - [40, 15988.0] + - - [64, 5888, 1, 3328, 64, 64, 64, 3328] + - [45, 13436.0] + - - [1024, 704, 1, 1280, 1024, 1024, 1024, 1280] + - [40, 16662.0] + - - [256, 1408, 1, 3328, 256, 256, 256, 3328] + - [42, 13669.0] + - - [6784, 128, 1, 3328, 6784, 6784, 6784, 3328] + - [51, 18886.0] + - - [704, 704, 1, 3328, 704, 704, 704, 3328] + - [41, 15143.0] + - - [3584, 256, 1, 3328, 3584, 3584, 3584, 3328] + - [60, 19864.0] + - - [128, 3584, 1, 3328, 128, 128, 128, 3328] + - [64, 17150.0] + - - [128, 2944, 1, 1280, 128, 128, 128, 1280] + - [42, 12828.0] + - - [448, 1856, 1, 1280, 448, 448, 448, 1280] + - [58, 17051.0] + - - [3584, 128, 1, 256, 3584, 3584, 3584, 256] + - [40, 9986.0] + - - [448, 1408, 1, 3328, 448, 448, 448, 3328] + - [54, 16968.0] + - - [704, 1024, 1, 256, 704, 704, 704, 256] + - [40, 13855.0] + - - [256, 3584, 1, 256, 256, 256, 256, 256] + - [40, 13784.0] + - - [1408, 704, 1, 256, 1408, 1408, 1408, 256] + - [40, 13912.0] + - - [448, 2944, 1, 3328, 448, 448, 448, 3328] + - [56, 17789.0] + - - [64, 5888, 1, 256, 64, 64, 64, 256] + - [45, 7253.0] + - - [512, 1500, 1, 2048, 512, 512, 512, 2048] + - [64, 17689.0] + - - [448, 2368, 1, 1280, 448, 448, 448, 1280] + - [41, 16540.0] + - - [704, 704, 1, 256, 704, 704, 704, 256] + - [44, 9289.0] + - - [64, 193600, 1, 64, 64, 64, 64, 64] + - [45, 12907.0] + - - [128, 4288, 1, 3328, 128, 128, 128, 3328] + - [56, 16567.0] + - - [256, 2368, 1, 256, 256, 256, 256, 256] + - [40, 11704.0] + - - [1024, 448, 1, 3328, 1024, 1024, 1024, 3328] + - [64, 17452.0] + - - [1856, 704, 1, 1280, 1856, 1856, 1856, 1280] + - [41, 18227.0] + - - [1024, 1024, 1, 1280, 1024, 1024, 1024, 1280] + - [40, 18830.0] + - - [256, 2944, 1, 256, 256, 256, 256, 256] + - [40, 15216.0] + - - [1024, 700, 1, 512, 1024, 1024, 1024, 512] + - [53, 13725.0] + - - [128, 6784, 1, 1280, 128, 128, 128, 1280] + - [47, 17548.0] + - - [1408, 704, 1, 3328, 1408, 1408, 1408, 3328] + - [49, 18211.0] + - - [128, 5888, 1, 1280, 128, 128, 128, 1280] + - [64, 17165.0] + - - [704, 1408, 1, 3328, 704, 704, 704, 3328] + - [42, 17938.0] + - - [7680, 64, 1, 2560, 7680, 7680, 7680, 2560] + - [50, 18194.0] + - - [448, 704, 1, 1280, 448, 448, 448, 1280] + - [58, 12998.0] + - - [6784, 128, 1, 256, 6784, 6784, 6784, 256] + - [50, 14761.0] + - - [704, 448, 1, 256, 704, 704, 704, 256] + - [45, 6808.0] + - - [256, 1856, 1, 3328, 256, 256, 256, 3328] + - [42, 17767.0] + - - [1024, 704, 1, 3328, 1024, 1024, 1024, 3328] + - [50, 17963.0] + - - [128, 4288, 1, 256, 128, 128, 128, 256] + - [55, 11867.0] + - - [64, 6784, 1, 3328, 64, 64, 64, 3328] + - [45, 15737.0] + - - [2944, 256, 1, 1280, 2944, 2944, 2944, 1280] + - [42, 17898.0] + - - [1856, 704, 1, 256, 1856, 1856, 1856, 256] + - [40, 16792.0] + - - [704, 1856, 1, 256, 704, 704, 704, 256] + - [45, 16478.0] + - - [2944, 448, 1, 256, 2944, 2944, 2944, 256] + - [40, 15560.0] + - - [2368, 128, 1, 1280, 2368, 2368, 2368, 1280] + - [45, 13866.0] + - - [64, 6784, 1, 256, 64, 64, 64, 256] + - [58, 9186.0] + - - [64, 5056, 1, 1280, 64, 64, 64, 1280] + - [45, 13589.0] + - - [704, 448, 1, 3328, 704, 704, 704, 3328] + - [45, 14910.0] + - - [2368, 256, 1, 1280, 2368, 2368, 2368, 1280] + - [60, 17106.0] + - - [2368, 448, 1, 1280, 2368, 2368, 2368, 1280] + - [50, 18460.0] + - - [128, 3584, 1, 256, 128, 128, 128, 256] + - [44, 10021.0] + - - [1856, 448, 1, 3328, 1856, 1856, 1856, 3328] + - [49, 18367.0] + - - [128, 5056, 1, 256, 128, 128, 128, 256] + - [42, 13647.0] + - - [4288, 256, 1, 1280, 4288, 4288, 4288, 1280] + - [42, 18855.0] + - - [4288, 128, 1, 3328, 4288, 4288, 4288, 3328] + - [56, 16854.0] + - - [7680, 128, 1, 2560, 7680, 7680, 7680, 2560] + - [51, 20829.0] + - - [448, 2368, 1, 3328, 448, 448, 448, 3328] + - [51, 17356.0] + - - [256, 1408, 1, 1280, 256, 256, 256, 1280] + - [62, 12544.0] + - - [128, 2368, 1, 256, 128, 128, 128, 256] + - [44, 7667.0] + - - [6784, 64, 1, 3328, 6784, 6784, 6784, 3328] + - [55, 16323.0] + - - [128, 2944, 1, 3328, 128, 128, 128, 3328] + - [64, 14271.0] + - - [2944, 448, 1, 3328, 2944, 2944, 2944, 3328] + - [59, 19048.0] + - - [5888, 128, 1, 256, 5888, 5888, 5888, 256] + - [40, 12760.0] + - - [5056, 64, 1, 256, 5056, 5056, 5056, 256] + - [45, 6915.0] + - - [512, 1500, 1, 1536, 512, 512, 512, 1536] + - [44, 17846.0] + - - [128, 5056, 1, 3328, 128, 128, 128, 3328] + - [56, 19383.0] + - - [256, 4288, 1, 1280, 256, 256, 256, 1280] + - [59, 18911.0] + - - [4288, 128, 1, 256, 4288, 4288, 4288, 256] + - [49, 9558.0] + - - [3584, 256, 1, 256, 3584, 3584, 3584, 256] + - [44, 13736.0] + - - [128, 2944, 1, 256, 128, 128, 128, 256] + - [53, 8259.0] + - - [3584, 128, 1, 3328, 3584, 3584, 3584, 3328] + - [50, 17170.0] + - - [5888, 128, 1, 3328, 5888, 5888, 5888, 3328] + - [50, 18752.0] + - - [64, 193600, 1, 256, 64, 64, 64, 256] + - [66, 15408.0] + - - [1408, 704, 1, 1280, 1408, 1408, 1408, 1280] + - [57, 17865.0] + - - [448, 1408, 1, 1280, 448, 448, 448, 1280] + - [41, 16779.0] + - - [704, 1408, 1, 1280, 704, 704, 704, 1280] + - [49, 18235.0] + - - [448, 2944, 1, 256, 448, 448, 448, 256] + - [63, 17018.0] + - - [448, 2368, 1, 256, 448, 448, 448, 256] + - [41, 15292.0] + - - [64, 6784, 1, 1280, 64, 64, 64, 1280] + - [63, 14796.0] + - - [128, 2368, 1, 3328, 128, 128, 128, 3328] + - [50, 15816.0] + - - [5056, 64, 1, 3328, 5056, 5056, 5056, 3328] + - [50, 16213.0] + - - [5056, 128, 1, 3328, 5056, 5056, 5056, 3328] + - [43, 19998.0] + - - [448, 704, 1, 256, 448, 448, 448, 256] + - [45, 6948.0] + - - [1856, 256, 1, 3328, 1856, 1856, 1856, 3328] + - [50, 17312.0] + - - [2944, 128, 1, 3328, 2944, 2944, 2944, 3328] + - [41, 14512.0] + - - [1024, 1024, 1, 256, 1024, 1024, 1024, 256] + - [40, 14448.0] + - - [704, 1024, 1, 1280, 704, 704, 704, 1280] + - [40, 16495.0] + - - [256, 4288, 1, 256, 256, 256, 256, 256] + - [62, 14964.0] + - - [2368, 256, 1, 256, 2368, 2368, 2368, 256] + - [40, 10852.0] + - - [256, 2368, 1, 3328, 256, 256, 256, 3328] + - [65, 18284.0] + - - [704, 448, 1, 1280, 704, 704, 704, 1280] + - [49, 13133.0] + - - [256, 1856, 1, 256, 256, 256, 256, 256] + - [53, 10450.0] + - - [64, 5056, 1, 256, 64, 64, 64, 256] + - [45, 6648.0] + - - [1408, 256, 1, 3328, 1408, 1408, 1408, 3328] + - [50, 13728.0] + - - [2368, 448, 1, 256, 2368, 2368, 2368, 256] + - [40, 14028.0] + - - [4288, 256, 1, 3328, 4288, 4288, 4288, 3328] + - [59, 19622.0] + - - [2944, 256, 1, 256, 2944, 2944, 2944, 256] + - [57, 12777.0] + - - [6784, 64, 1, 1280, 6784, 6784, 6784, 1280] + - [42, 14828.0] + - - [704, 1856, 1, 1280, 704, 704, 704, 1280] + - [50, 18042.0] + - - [448, 1024, 1, 3328, 448, 448, 448, 3328] + - [45, 16322.0] + - - [2944, 448, 1, 1280, 2944, 2944, 2944, 1280] + - [42, 18438.0] + - - [448, 1024, 1, 256, 448, 448, 448, 256] + - [49, 9204.0] + - - [1024, 448, 1, 1280, 1024, 1024, 1024, 1280] + - [48, 16991.0] + - - [256, 2368, 1, 1280, 256, 256, 256, 1280] + - [48, 17708.0] + - - [128, 5056, 1, 1280, 128, 128, 128, 1280] + - [55, 18368.0] + - - [1408, 256, 1, 256, 1408, 1408, 1408, 256] + - [40, 10162.0] + - - [128, 5888, 1, 3328, 128, 128, 128, 3328] + - [46, 18811.0] + - - [2368, 448, 1, 3328, 2368, 2368, 2368, 3328] + - [42, 18971.0] + - - [3584, 128, 1, 1280, 3584, 3584, 3584, 1280] + - [40, 16644.0] + - - [1408, 448, 1, 256, 1408, 1408, 1408, 256] + - [48, 12123.0] + - - [2368, 256, 1, 3328, 2368, 2368, 2368, 3328] + - [51, 18408.0] + - - [5888, 128, 1, 1280, 5888, 5888, 5888, 1280] + - [42, 18025.0] + - - [256, 3584, 1, 1280, 256, 256, 256, 1280] + - [56, 18973.0] + - - [128, 5888, 1, 256, 128, 128, 128, 256] + - [62, 14819.0] + - - [1024, 1024, 1, 1024, 1024, 1024, 1024, 1024] + - [46, 18500.0] + - - [1408, 448, 1, 3328, 1408, 1408, 1408, 3328] + - [42, 19067.0] + - - [64, 5888, 1, 1280, 64, 64, 64, 1280] + - [45, 12980.0] + - - [704, 704, 1, 1280, 704, 704, 704, 1280] + - [40, 14537.0] + - - [128, 2368, 1, 1280, 128, 128, 128, 1280] + - [42, 14211.0] + - - [3584, 256, 1, 1280, 3584, 3584, 3584, 1280] + - [51, 19450.0] + - - [5888, 64, 1, 1280, 5888, 5888, 5888, 1280] + - [50, 13680.0] + - - [5056, 128, 1, 1280, 5056, 5056, 5056, 1280] + - [42, 18246.0] + - - [448, 1856, 1, 3328, 448, 448, 448, 3328] + - [51, 17839.0] + - - [1024, 448, 1, 256, 1024, 1024, 1024, 256] + - [53, 12336.0] + - - [2944, 128, 1, 1280, 2944, 2944, 2944, 1280] + - [42, 13680.0] + - - [256, 2944, 1, 1280, 256, 256, 256, 1280] + - [40, 17326.0] + - - [2560, 128, 1, 2560, 2560, 2560, 2560, 2560] + - [59, 15875.0] + - - [704, 1024, 1, 3328, 704, 704, 704, 3328] + - [42, 17507.0] + - - [1856, 448, 1, 1280, 1856, 1856, 1856, 1280] + - [58, 17040.0] + - - [128, 6784, 1, 256, 128, 128, 128, 256] + - [44, 13061.0] + - - [704, 1408, 1, 256, 704, 704, 704, 256] + - [49, 13570.0] + - - [4096, 128, 1, 4096, 4096, 4096, 4096, 4096] + - [56, 16436.0] + - - [448, 2944, 1, 1280, 448, 448, 448, 1280] + - [58, 17933.0] + - - [1856, 256, 1, 256, 1856, 1856, 1856, 256] + - [40, 9623.0] + - - [5056, 128, 1, 256, 5056, 5056, 5056, 256] + - [44, 11379.0] + - - [6784, 128, 1, 1280, 6784, 6784, 6784, 1280] + - [65, 17784.0] + - - [1856, 448, 1, 256, 1856, 1856, 1856, 256] + - [49, 12901.0] + - - [128, 4288, 1, 1280, 128, 128, 128, 1280] + - [44, 15135.0] + - - [448, 704, 1, 3328, 448, 448, 448, 3328] + - [54, 15107.0] + - - [448, 1856, 1, 256, 448, 448, 448, 256] + - [59, 13188.0] + - - [1856, 704, 1, 3328, 1856, 1856, 1856, 3328] + - [42, 18777.0] + - - [512, 1500, 1, 2560, 512, 512, 512, 2560] + - [59, 18255.0] + - - [3136, 64, 128, 64, 3136, 3136, 3136, 64] + - [62, 17836.0] + - - [3136, 64, 64, 256, 3136, 3136, 3136, 256] + - [61, 16891.0] + - - [3136, 64, 128, 256, 3136, 3136, 3136, 256] + - [61, 16336.0] + - - [3136, 64, 256, 64, 3136, 3136, 3136, 64] + - [39, 12115.0] + - - [3136, 64, 64, 64, 3136, 3136, 3136, 64] + - [57, 19481.0] + - - [3136, 64, 256, 256, 3136, 3136, 3136, 256] + - [52, 16536.0] + - - [2368, 64, 1, 3328, 2368, 2368, 2368, 3328] + - [71, 8457.0] + - - [256, 704, 1, 1280, 256, 256, 256, 1280] + - [101, 8866.0] + - - [1408, 64, 1, 1280, 1408, 1408, 1408, 1280] + - [70, 5790.0] + - - [4096, 32, 1, 4096, 4096, 4096, 4096, 4096] + - [96, 7283.0] + - - [3072, 64, 1, 1024, 3072, 3072, 3072, 1024] + - [101, 9036.0] + - - [1024, 256, 1, 3328, 1024, 1024, 1024, 3328] + - [82, 9091.0] + - - [6144, 32, 1, 2560, 6144, 6144, 6144, 2560] + - [67, 9138.0] + - - [704, 128, 1, 1280, 704, 704, 704, 1280] + - [70, 5196.0] + - - [64, 3584, 1, 3328, 64, 64, 64, 3328] + - [88, 9504.0] + - - [1024, 256, 1, 256, 1024, 1024, 1024, 256] + - [82, 6179.0] + - - [448, 448, 1, 256, 448, 448, 448, 256] + - [71, 6132.0] + - - [7680, 32, 1, 2560, 7680, 7680, 7680, 2560] + - [67, 10141.0] + - - [128, 1024, 1, 3328, 128, 128, 128, 3328] + - [94, 7086.0] + - - [64, 1856, 1, 1280, 64, 64, 64, 1280] + - [105, 7460.0] + - - [448, 256, 1, 256, 448, 448, 448, 256] + - [88, 5478.0] + - - [256, 1024, 1, 256, 256, 256, 256, 256] + - [102, 7216.0] + - - [1024, 128, 1, 1280, 1024, 1024, 1024, 1280] + - [71, 6921.0] + - - [3072, 32, 1, 1024, 3072, 3072, 3072, 1024] + - [96, 5248.0] + - - [448, 256, 1, 3328, 448, 448, 448, 3328] + - [70, 7876.0] + - - [128, 704, 1, 1280, 128, 128, 128, 1280] + - [75, 6084.0] + - - [1856, 128, 1, 3328, 1856, 1856, 1856, 3328] + - [67, 9952.0] + - - [256, 448, 1, 256, 256, 256, 256, 256] + - [91, 3581.0] + - - [8448, 32, 1, 2816, 8448, 8448, 8448, 2816] + - [67, 9030.0] + - - [448, 448, 1, 3328, 448, 448, 448, 3328] + - [82, 9872.0] + - - [1408, 128, 1, 1280, 1408, 1408, 1408, 1280] + - [82, 8198.0] + - - [128, 1856, 1, 1280, 128, 128, 128, 1280] + - [67, 9204.0] + - - [2048, 128, 1, 2048, 2048, 2048, 2048, 2048] + - [88, 8643.0] + - - [64, 1408, 1, 3328, 64, 64, 64, 3328] + - [70, 5855.0] + - - [256, 704, 1, 256, 256, 256, 256, 256] + - [94, 5059.0] + - - [128, 1408, 1, 256, 128, 128, 128, 256] + - [82, 5059.0] + - - [256, 448, 1, 3328, 256, 256, 256, 3328] + - [85, 7522.0] + - - [64, 2368, 1, 1280, 64, 64, 64, 1280] + - [76, 7778.0] + - - [2368, 64, 1, 256, 2368, 2368, 2368, 256] + - [88, 4501.0] + - - [704, 128, 1, 3328, 704, 704, 704, 3328] + - [70, 5974.0] + - - [4288, 64, 1, 1280, 4288, 4288, 4288, 1280] + - [88, 9323.0] + - - [2560, 64, 1, 2560, 2560, 2560, 2560, 2560] + - [75, 8584.0] + - - [128, 1024, 1, 1280, 128, 128, 128, 1280] + - [88, 6916.0] + - - [128, 1024, 1, 256, 128, 128, 128, 256] + - [75, 4033.0] + - - [1856, 64, 1, 256, 1856, 1856, 1856, 256] + - [75, 3754.0] + - - [704, 128, 1, 256, 704, 704, 704, 256] + - [93, 3076.0] + - - [448, 256, 1, 1280, 448, 448, 448, 1280] + - [70, 6571.0] + - - [1856, 128, 1, 1280, 1856, 1856, 1856, 1280] + - [82, 9137.0] + - - [64, 3584, 1, 256, 64, 64, 64, 256] + - [76, 5780.0] + - - [64, 1856, 1, 256, 64, 64, 64, 256] + - [88, 3811.0] + - - [256, 1024, 1, 1280, 256, 256, 256, 1280] + - [67, 8693.0] + - - [3584, 64, 1, 1280, 3584, 3584, 3584, 1280] + - [94, 8865.0] + - - [1408, 128, 1, 3328, 1408, 1408, 1408, 3328] + - [94, 9014.0] + - - [64, 2944, 1, 3328, 64, 64, 64, 3328] + - [75, 9007.0] + - - [64, 4288, 1, 3328, 64, 64, 64, 3328] + - [88, 9578.0] + - - [128, 1500, 1, 1280, 128, 128, 128, 1280] + - [88, 8440.0] + - - [64, 2944, 1, 256, 64, 64, 64, 256] + - [88, 5312.0] + - - [64, 1408, 1, 1280, 64, 64, 64, 1280] + - [75, 5252.0] + - - [64, 2944, 1, 1280, 64, 64, 64, 1280] + - [101, 8231.0] + - - [704, 256, 1, 256, 704, 704, 704, 256] + - [88, 5196.0] + - - [256, 448, 1, 1280, 256, 256, 256, 1280] + - [70, 6530.0] + - - [704, 256, 1, 1280, 704, 704, 704, 1280] + - [67, 8334.0] + - - [64, 2368, 1, 3328, 64, 64, 64, 3328] + - [80, 8047.0] + - - [256, 704, 1, 3328, 256, 256, 256, 3328] + - [82, 9085.0] + - - [4096, 64, 1, 4096, 4096, 4096, 4096, 4096] + - [75, 8970.0] + - - [1760, 128, 1, 1760, 1760, 1760, 1760, 1760] + - [68, 9157.0] + - - [2944, 64, 1, 1280, 2944, 2944, 2944, 1280] + - [94, 8462.0] + - - [128, 1408, 1, 3328, 128, 128, 128, 3328] + - [101, 8925.0] + - - [1408, 64, 1, 256, 1408, 1408, 1408, 256] + - [74, 2965.0] + - - [64, 2368, 1, 256, 64, 64, 64, 256] + - [76, 4619.0] + - - [1024, 128, 1, 3328, 1024, 1024, 1024, 3328] + - [71, 7009.0] + - - [2368, 64, 1, 1280, 2368, 2368, 2368, 1280] + - [71, 7282.0] + - - [4288, 64, 1, 256, 4288, 4288, 4288, 256] + - [67, 6505.0] + - - [64, 4288, 1, 1280, 64, 64, 64, 1280] + - [75, 8956.0] + - - [1408, 64, 1, 3328, 1408, 1408, 1408, 3328] + - [79, 5948.0] + - - [2944, 64, 1, 256, 2944, 2944, 2944, 256] + - [94, 5432.0] + - - [448, 448, 1, 1280, 448, 448, 448, 1280] + - [82, 8976.0] + - - [1024, 256, 1, 1280, 1024, 1024, 1024, 1280] + - [82, 8766.0] + - - [3584, 64, 1, 3328, 3584, 3584, 3584, 3328] + - [67, 9724.0] + - - [256, 1024, 1, 3328, 256, 256, 256, 3328] + - [67, 9481.0] + - - [1856, 64, 1, 3328, 1856, 1856, 1856, 3328] + - [70, 7682.0] + - - [1856, 64, 1, 1280, 1856, 1856, 1856, 1280] + - [85, 6698.0] + - - [4608, 32, 1, 1536, 4608, 4608, 4608, 1536] + - [85, 7199.0] + - - [1024, 128, 1, 256, 1024, 1024, 1024, 256] + - [68, 5146.0] + - - [64, 3584, 1, 1280, 64, 64, 64, 1280] + - [76, 8775.0] + - - [3584, 64, 1, 256, 3584, 3584, 3584, 256] + - [82, 5992.0] + - - [64, 1856, 1, 3328, 64, 64, 64, 3328] + - [70, 7643.0] + - - [1408, 128, 1, 256, 1408, 1408, 1408, 256] + - [94, 5196.0] + - - [128, 704, 1, 256, 128, 128, 128, 256] + - [108, 3060.0] + - - [128, 704, 1, 3328, 128, 128, 128, 3328] + - [85, 5915.0] + - - [128, 1856, 1, 256, 128, 128, 128, 256] + - [76, 6193.0] + - - [64, 4288, 1, 256, 64, 64, 64, 256] + - [75, 6318.0] + - - [2560, 32, 1, 2560, 2560, 2560, 2560, 2560] + - [105, 5560.0] + - - [704, 256, 1, 3328, 704, 704, 704, 3328] + - [67, 8947.0] + - - [176, 1500, 1, 1408, 176, 176, 176, 1408] + - [68, 8077.0] + - - [1856, 128, 1, 256, 1856, 1856, 1856, 256] + - [94, 6181.0] + - - [4288, 64, 1, 3328, 4288, 4288, 4288, 3328] + - [88, 9268.0] + - - [2048, 64, 1, 2048, 2048, 2048, 2048, 2048] + - [98, 6724.0] + - - [64, 1408, 1, 256, 64, 64, 64, 256] + - [84, 2988.0] + - - [2944, 64, 1, 3328, 2944, 2944, 2944, 3328] + - [94, 9287.0] + - - [128, 1408, 1, 1280, 128, 128, 128, 1280] + - [67, 8094.0] + - - [128, 1856, 1, 3328, 128, 128, 128, 3328] + - [83, 9913.0] + - - [1760, 64, 1, 1760, 1760, 1760, 1760, 1760] + - [70, 6734.0] + - - [1760, 32, 1, 1760, 1760, 1760, 1760, 1760] + - [110, 5273.0] + - - [2560, 16, 1, 2560, 2560, 2560, 2560, 2560] + - [122, 4702.0] + - - [1760, 16, 1, 1760, 1760, 1760, 1760, 1760] + - [122, 3157.0] + - - [8448, 4, 1, 2816, 8448, 8448, 8448, 2816] + - [124, 3008.0] + - - [7680, 16, 1, 2560, 7680, 7680, 7680, 2560] + - [121, 7544.0] + - - [4608, 1, 1, 1536, 4608, 4608, 4608, 1536] + - [113, 423.0] + - - [7680, 4, 1, 2560, 7680, 7680, 7680, 2560] + - [122, 2845.0] + - - [8448, 16, 1, 2816, 8448, 8448, 8448, 2816] + - [118, 7120.0] + - - [3072, 2, 1, 1024, 3072, 3072, 3072, 1024] + - [116, 509.0] + - - [6144, 16, 1, 2560, 6144, 6144, 6144, 2560] + - [119, 7149.0] + - - [7680, 1, 1, 2560, 7680, 7680, 7680, 2560] + - [122, 711.0] + - - [4608, 4, 1, 1536, 4608, 4608, 4608, 1536] + - [123, 1685.0] + - - [3072, 1, 1, 128, 3072, 3072, 3072, 128] + - [112, 71.0] + - - [2048, 32, 1, 2048, 2048, 2048, 2048, 2048] + - [111, 5284.0] + - - [2048, 16, 1, 2048, 2048, 2048, 2048, 2048] + - [116, 3283.0] + - - [8448, 1, 1, 2816, 8448, 8448, 8448, 2816] + - [114, 753.0] + - - [6144, 4, 1, 2560, 6144, 6144, 6144, 2560] + - [120, 2521.0] + - - [3072, 1, 1, 1024, 3072, 3072, 3072, 1024] + - [116, 252.0] + - - [3072, 16, 1, 1024, 3072, 3072, 3072, 1024] + - [122, 3784.0] + - - [4096, 16, 1, 4096, 4096, 4096, 4096, 4096] + - [109, 5780.0] + - - [6144, 1, 1, 2560, 6144, 6144, 6144, 2560] + - [122, 648.0] + - - [3072, 4, 1, 1024, 3072, 3072, 3072, 1024] + - [123, 1002.0] + - - [7680, 2, 1, 2560, 7680, 7680, 7680, 2560] + - [117, 1330.0] + - - [4224, 1, 1, 128, 4224, 4224, 4224, 128] + - [112, 100.0] + - - [8448, 2, 1, 2816, 8448, 8448, 8448, 2816] + - [114, 1504.0] + - - [4608, 2, 1, 1536, 4608, 4608, 4608, 1536] + - [122, 838.0] + - - [4608, 16, 1, 1536, 4608, 4608, 4608, 1536] + - [115, 5362.0] + - - [6144, 2, 1, 2560, 6144, 6144, 6144, 2560] + - [123, 1270.0] + - - [1024, 1, 1, 500000, 1024, 1024, 1024, 500000] + - [87, 136.0] + - - [1024, 16, 1, 500000, 1024, 1024, 1024, 500000] + - [73, 2167.0] + - - [1024, 2, 1, 500000, 1024, 1024, 1024, 500000] + - [73, 271.0] + - - [512, 1, 1, 500000, 512, 512, 512, 500000] + - [73, 73.0] + - - [1024, 8, 1, 500000, 1024, 1024, 1024, 500000] + - [73, 1085.0] + - - [1024, 4, 1, 500000, 1024, 1024, 1024, 500000] + - [73, 543.0] + - - [512, 16, 1, 500000, 512, 512, 512, 500000] + - [73, 1107.0] + - - [512, 2, 1, 500000, 512, 512, 512, 500000] + - [87, 147.0] + - - [512, 8, 1, 500000, 512, 512, 512, 500000] + - [99, 586.0] + - - [512, 4, 1, 500000, 512, 512, 512, 500000] + - [73, 293.0] + - - [512, 4, 1, 512, 512, 512, 512, 512] + - [77, 180.0] + - - [448, 64, 1, 1280, 448, 448, 448, 1280] + - [78, 2936.0] + - - [64, 1024, 1, 1280, 64, 64, 64, 1280] + - [84, 4280.0] + - - [64, 704, 1, 1280, 64, 64, 64, 1280] + - [69, 3740.0] + - - [256, 128, 1, 256, 256, 256, 256, 256] + - [95, 1357.0] + - - [64, 1024, 1, 3328, 64, 64, 64, 3328] + - [70, 4977.0] + - - [128, 1, 1, 1408, 128, 128, 128, 1408] + - [72, 12.0] + - - [1024, 64, 1, 1280, 1024, 1024, 1024, 1280] + - [69, 4284.0] + - - [256, 256, 1, 3328, 256, 256, 256, 3328] + - [70, 5049.0] + - - [64, 448, 1, 1280, 64, 64, 64, 1280] + - [90, 2552.0] + - - [512, 32, 1, 512, 512, 512, 512, 512] + - [104, 1407.0] + - - [64, 64, 1, 3328, 64, 64, 64, 3328] + - [86, 535.0] + - - [512, 1, 1, 512, 512, 512, 512, 512] + - [77, 45.0] + - - [512, 2, 1, 512, 512, 512, 512, 512] + - [69, 87.0] + - - [704, 64, 1, 3328, 704, 704, 704, 3328] + - [95, 4382.0] + - - [64, 128, 1, 256, 64, 64, 64, 256] + - [78, 570.0] + - - [704, 64, 1, 1280, 704, 704, 704, 1280] + - [95, 3678.0] + - - [128, 448, 1, 256, 128, 128, 128, 256] + - [97, 3097.0] + - - [448, 64, 1, 3328, 448, 448, 448, 3328] + - [73, 3168.0] + - - [64, 128, 1, 3328, 64, 64, 64, 3328] + - [73, 973.0] + - - [128, 128, 1, 3328, 128, 128, 128, 3328] + - [73, 1837.0] + - - [64, 1, 1, 1216, 64, 64, 64, 1216] + - [72, 6.0] + - - [256, 256, 1, 256, 256, 256, 256, 256] + - [95, 2482.0] + - - [128, 64, 1, 1280, 128, 128, 128, 1280] + - [73, 761.0] + - - [64, 1024, 1, 256, 64, 64, 64, 256] + - [90, 2390.0] + - - [64, 704, 1, 256, 64, 64, 64, 256] + - [69, 1775.0] + - - [1024, 2, 1, 512, 1024, 1024, 1024, 512] + - [89, 123.0] + - - [256, 64, 1, 3328, 256, 256, 256, 3328] + - [73, 1822.0] + - - [448, 128, 1, 256, 448, 448, 448, 256] + - [84, 2178.0] + - - [64, 704, 1, 3328, 64, 64, 64, 3328] + - [95, 4351.0] + - - [64, 448, 1, 3328, 64, 64, 64, 3328] + - [90, 3029.0] + - - [448, 128, 1, 3328, 448, 448, 448, 3328] + - [69, 4863.0] + - - [128, 256, 1, 1280, 128, 128, 128, 1280] + - [84, 2857.0] + - - [64, 448, 1, 256, 64, 64, 64, 256] + - [97, 1188.0] + - - [64, 256, 1, 1280, 64, 64, 64, 1280] + - [73, 1485.0] + - - [64, 128, 1, 1280, 64, 64, 64, 1280] + - [99, 771.0] + - - [1024, 32, 1, 512, 1024, 1024, 1024, 512] + - [69, 1924.0] + - - [64, 64, 1, 256, 64, 64, 64, 256] + - [104, 172.0] + - - [256, 128, 1, 1280, 256, 256, 256, 1280] + - [95, 2789.0] + - - [128, 256, 1, 3328, 128, 128, 128, 3328] + - [78, 3383.0] + - - [256, 64, 1, 256, 256, 256, 256, 256] + - [90, 679.0] + - - [128, 128, 1, 1280, 128, 128, 128, 1280] + - [73, 1477.0] + - - [128, 256, 1, 256, 128, 128, 128, 256] + - [106, 1357.0] + - - [256, 64, 1, 1280, 256, 256, 256, 1280] + - [73, 1475.0] + - - [704, 64, 1, 256, 704, 704, 704, 256] + - [95, 1786.0] + - - [128, 448, 1, 1280, 128, 128, 128, 1280] + - [84, 4199.0] + - - [64, 64, 1, 1280, 64, 64, 64, 1280] + - [107, 383.0] + - - [128, 64, 1, 3328, 128, 128, 128, 3328] + - [73, 972.0] + - - [448, 64, 1, 256, 448, 448, 448, 256] + - [81, 1510.0] + - - [1024, 16, 1, 512, 1024, 1024, 1024, 512] + - [73, 1021.0] + - - [512, 16, 1, 512, 512, 512, 512, 512] + - [73, 514.0] + - - [1024, 64, 1, 256, 1024, 1024, 1024, 256] + - [78, 2370.0] + - - [128, 1, 1, 1024, 128, 128, 128, 1024] + - [89, 13.0] + - - [448, 128, 1, 1280, 448, 448, 448, 1280] + - [95, 4223.0] + - - [1024, 64, 1, 3328, 1024, 1024, 1024, 3328] + - [95, 4862.0] + - - [128, 64, 1, 256, 128, 128, 128, 256] + - [104, 343.0] + - - [64, 256, 1, 3328, 64, 64, 64, 3328] + - [90, 1928.0] + - - [256, 256, 1, 1280, 256, 256, 256, 1280] + - [84, 4258.0] + - - [256, 128, 1, 3328, 256, 256, 256, 3328] + - [84, 3366.0] + - - [64, 256, 1, 256, 64, 64, 64, 256] + - [100, 685.0] + - - [1024, 4, 1, 512, 1024, 1024, 1024, 512] + - [77, 245.0] + - - [128, 448, 1, 3328, 128, 128, 128, 3328] + - [95, 4866.0] + - - [1024, 1, 1, 512, 1024, 1024, 1024, 512] + - [103, 62.0] + - - [128, 128, 1, 256, 128, 128, 128, 256] + - [92, 777.0] +- null +- null +- DeviceEfficiency +... diff --git a/library/src/blas3/Tensile/Logic/asm_full/navi22_Cijk_Ailk_Bljk_HBH.yaml b/library/src/blas3/Tensile/Logic/asm_full/navi22_Cijk_Ailk_Bljk_HBH.yaml new file mode 100644 index 000000000..c80cdadce --- /dev/null +++ b/library/src/blas3/Tensile/Logic/asm_full/navi22_Cijk_Ailk_Bljk_HBH.yaml @@ -0,0 +1,42985 @@ +--- +- {MinimumRequiredVersion: 4.28.0} +- navi22 +- gfx1031 +- [Device 73df] +- AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] +- - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 0 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_MT128x64x8_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 128 + LVCA: 16 + LVCB: 1 + LVPA: 1 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 1 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_MT128x128x8_SN_SU0_SUM0_TT8_16_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 8 + LSPB: 64 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 2 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_MT128x64x16_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 8 + LSPB: 64 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 3 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_MT128x128x16_SN_SU0_SUM0_TT8_16_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 16 + LSPB: 128 + LVCA: 16 + LVCB: 2 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 4 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_MT128x128x16_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 16 + LSPB: 128 + LVCA: 16 + LVCB: 2 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 256 + MacroTileA: 128 + MacroTileB: 256 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 5 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_MT128x256x16_SN_SU0_SUM0_TT8_16_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 8 + LSPB: 32 + LVCA: 16 + LVCB: 4 + LVPA: 1 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 6 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_MT128x128x32_SN_SU0_SUM0_TT8_16_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 16 + LSPB: 64 + LVCA: 16 + LVCB: 4 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 7 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_MT128x128x32_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 8 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_MT128x64x8_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 128 + LVCA: 16 + LVCB: 1 + LVPA: 1 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 9 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_MT128x128x8_SN_SU32_SUM3_TT8_16_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 8 + LSPB: 64 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 10 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_MT128x64x16_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 8 + LSPB: 64 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 11 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_MT128x128x16_SN_SU32_SUM3_TT8_16_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 16 + LSPB: 128 + LVCA: 16 + LVCB: 2 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 256 + MacroTileA: 128 + MacroTileB: 256 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 12 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_MT128x256x16_SN_SU32_SUM3_TT8_16_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 13 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_MT128x64x8_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 128 + LVCA: 16 + LVCB: 1 + LVPA: 1 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 14 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_MT128x128x8_SN_SU0_SUM0_TT8_16_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 8 + LSPB: 64 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 15 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_MT128x64x16_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 8 + LSPB: 64 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 16 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_MT128x128x16_SN_SU0_SUM0_TT8_16_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 16 + LSPB: 128 + LVCA: 16 + LVCB: 2 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 17 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_MT128x128x16_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 16 + LSPB: 128 + LVCA: 16 + LVCB: 2 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 256 + MacroTileA: 128 + MacroTileB: 256 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 18 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_MT128x256x16_SN_SU0_SUM0_TT8_16_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 19 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_MT128x64x8_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 128 + LVCA: 16 + LVCB: 1 + LVPA: 1 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 20 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_MT128x128x8_SN_SU32_SUM3_TT8_16_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 8 + LSPB: 64 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 21 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_MT128x128x16_SN_SU32_SUM3_TT8_16_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 16 + LSPB: 128 + LVCA: 16 + LVCB: 2 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 256 + MacroTileA: 128 + MacroTileB: 256 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 22 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_MT128x256x16_SN_SU32_SUM3_TT8_16_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 23 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_MT128x64x8_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 128 + LVCA: 16 + LVCB: 1 + LVPA: 1 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 24 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_MT128x128x8_SN_SU0_SUM0_TT8_16_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 8 + LSPB: 64 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 25 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_MT128x64x16_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 8 + LSPB: 64 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 26 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_MT128x128x16_SN_SU0_SUM0_TT8_16_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 16 + LSPB: 128 + LVCA: 16 + LVCB: 2 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 27 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_MT128x128x16_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 16 + LSPB: 128 + LVCA: 16 + LVCB: 2 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 256 + MacroTileA: 128 + MacroTileB: 256 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 28 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_MT128x256x16_SN_SU0_SUM0_TT8_16_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 29 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_MT128x64x8_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 128 + LVCA: 16 + LVCB: 1 + LVPA: 1 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 30 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_MT128x128x8_SN_SU32_SUM3_TT8_16_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 8 + LSPB: 64 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 31 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_MT128x128x16_SN_SU32_SUM3_TT8_16_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 16 + LSPB: 128 + LVCA: 16 + LVCB: 2 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 32 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_MT128x128x16_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 16 + LSPB: 128 + LVCA: 16 + LVCB: 2 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 256 + MacroTileA: 128 + MacroTileB: 256 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 33 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_MT128x256x16_SN_SU32_SUM3_TT8_16_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 8 + LSPB: 64 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 34 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_MT128x64x16_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 16 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 8 + LSPB: 64 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 35 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_MT128x128x16_SN_SU0_SUM0_TT8_16_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 16 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 8 + LSPB: 64 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 36 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_MT128x128x16_SN_SU32_SUM3_TT8_16_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 16 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 16 + LSPB: 128 + LVCA: 16 + LVCB: 2 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 256 + MacroTileA: 128 + MacroTileB: 256 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 37 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_MT128x256x16_SN_SU0_SUM0_TT8_16_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 16 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 16 + LSPB: 128 + LVCA: 16 + LVCB: 2 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 256 + MacroTileA: 128 + MacroTileB: 256 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 38 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_MT128x256x16_SN_SU32_SUM3_TT8_16_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 16 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 8 + LSPB: 64 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 39 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_MT128x64x16_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 16 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 16 + LSPB: 128 + LVCA: 16 + LVCB: 2 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 256 + MacroTileA: 128 + MacroTileB: 256 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 40 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_MT128x256x16_SN_SU0_SUM0_TT8_16_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 16 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 8 + LSPB: 64 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 41 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_MT128x64x16_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 16 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 8 + LSPB: 64 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 42 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_MT128x128x16_SN_SU32_SUM3_TT8_16_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 16 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 8 + LVCB: 1 + LVPA: 1 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 43 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_MT64x64x8_SN_SU0_SUM0_TT8_8_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 44 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_MT128x64x8_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 8 + LSPB: 32 + LVCA: 8 + LVCB: 2 + LVPA: 1 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 45 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_MT64x64x16_SN_SU0_SUM0_TT8_8_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 8 + LSPB: 64 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 46 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_MT128x64x16_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 16 + LSPB: 128 + LVCA: 16 + LVCB: 2 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 47 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_MT128x128x16_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 8 + LSPB: 16 + LVCA: 8 + LVCB: 4 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 48 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_MT64x64x32_SN_SU0_SUM0_TT8_8_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 8 + LSPB: 32 + LVCA: 16 + LVCB: 4 + LVPA: 1 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 49 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_MT128x64x32_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 50 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_MT128x64x8_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 8 + LSPB: 32 + LVCA: 8 + LVCB: 2 + LVPA: 1 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 51 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_MT64x64x16_SN_SU32_SUM3_TT8_8_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 8 + LSPB: 64 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 52 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_MT128x64x16_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 16 + LSPB: 128 + LVCA: 16 + LVCB: 2 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 53 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_MT128x128x16_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 8 + LSPB: 16 + LVCA: 8 + LVCB: 4 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 54 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_MT64x64x32_SN_SU32_SUM3_TT8_8_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 8 + LVCB: 1 + LVPA: 1 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 55 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_MT64x64x8_SN_SU0_SUM0_TT8_8_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 56 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_MT128x64x8_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 8 + LSPB: 32 + LVCA: 8 + LVCB: 2 + LVPA: 1 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 57 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_MT64x64x16_SN_SU0_SUM0_TT8_8_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 8 + LSPB: 64 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 58 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_MT128x64x16_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 16 + LSPB: 128 + LVCA: 16 + LVCB: 2 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 59 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_MT128x128x16_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 8 + LSPB: 16 + LVCA: 8 + LVCB: 4 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 60 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_MT64x64x32_SN_SU0_SUM0_TT8_8_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 8 + LVCB: 1 + LVPA: 1 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 61 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_MT64x64x8_SN_SU32_SUM3_TT8_8_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 62 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_MT128x64x8_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 8 + LSPB: 32 + LVCA: 8 + LVCB: 2 + LVPA: 1 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 63 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_MT64x64x16_SN_SU32_SUM3_TT8_8_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 8 + LSPB: 64 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 64 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_MT128x64x16_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 16 + LSPB: 128 + LVCA: 16 + LVCB: 2 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 65 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_MT128x128x16_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 8 + LSPB: 16 + LVCA: 8 + LVCB: 4 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 66 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_MT64x64x32_SN_SU32_SUM3_TT8_8_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 8 + LVCB: 1 + LVPA: 1 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 67 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_MT64x64x8_SN_SU0_SUM0_TT8_8_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 68 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_MT128x64x8_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 8 + LSPB: 32 + LVCA: 8 + LVCB: 2 + LVPA: 1 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 69 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_MT64x64x16_SN_SU0_SUM0_TT8_8_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 8 + LSPB: 64 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 70 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_MT128x64x16_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 16 + LSPB: 128 + LVCA: 16 + LVCB: 2 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 71 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_MT128x128x16_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 8 + LVCB: 1 + LVPA: 1 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 72 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_MT64x64x8_SN_SU32_SUM3_TT8_8_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 73 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_MT128x64x8_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 8 + LSPB: 32 + LVCA: 8 + LVCB: 2 + LVPA: 1 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 74 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_MT64x64x16_SN_SU32_SUM3_TT8_8_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 8 + LSPB: 64 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 75 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_MT128x64x16_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 16 + LSPB: 128 + LVCA: 16 + LVCB: 2 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 76 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_MT128x128x16_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 8 + LSPB: 16 + LVCA: 8 + LVCB: 4 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 77 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_MT64x64x32_SN_SU32_SUM3_TT8_8_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 8 + LSPB: 32 + LVCA: 8 + LVCB: 2 + LVPA: 1 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 78 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_MT64x64x16_SN_SU0_SUM0_TT8_8_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 16 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 8 + LSPB: 64 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 79 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_MT128x64x16_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 16 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 16 + LSPB: 128 + LVCA: 16 + LVCB: 2 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 80 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_MT128x128x16_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 16 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 16 + LSPB: 128 + LVCA: 16 + LVCB: 2 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 81 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_MT128x128x16_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 16 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 8 + LSPB: 64 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 82 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_MT128x64x16_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 16 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 16 + LSPB: 128 + LVCA: 16 + LVCB: 2 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 83 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_MT128x128x16_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 16 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 8 + LSPB: 64 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 84 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_MT128x64x16_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 16 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 16 + LSPB: 128 + LVCA: 16 + LVCB: 2 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 85 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_MT128x128x16_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 16 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 8 + LSPA: 8 + LSPB: 16 + LVCA: 8 + LVCB: 4 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 819 + LdsNumElementsAlignedA: 128 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 256 + LdsOffsetB: 128 + LdsOffsetB_Blk: 384 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 86 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_MT16x16x8_SN_SU0_SUM0_TT2_2_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 8 + LSPA: 4 + LSPB: 16 + LVCA: 16 + LVCB: 4 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 87 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_MT32x32x8_SN_SU0_SUM0_TT4_4_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 8 + LSPA: 8 + LSPB: 16 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 896 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 88 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_MT32x16x8_SN_SU0_SUM0_TT2_2_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 4 + LSPB: 32 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 89 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_MT64x32x8_SN_SU0_SUM0_TT4_4_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 8 + LSPA: 8 + LSPB: 32 + LVCA: 32 + LVCB: 8 + LVPA: 8 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 90 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_MT32x32x8_SN_SU0_SUM0_TT2_2_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 91 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_MT16x16x16_SN_SU0_SUM0_TT2_2_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 4 + LSPB: 8 + LVCA: 16 + LVCB: 8 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 92 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_MT32x32x16_SN_SU0_SUM0_TT4_4_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 8 + LSPB: 16 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 93 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_MT32x16x16_SN_SU0_SUM0_TT2_2_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 4 + LSPB: 16 + LVCA: 32 + LVCB: 8 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 94 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_MT64x32x16_SN_SU0_SUM0_TT4_4_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 8 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 95 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_MT32x32x16_SN_SU0_SUM0_TT2_2_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 32 + LSPA: 8 + LSPB: 4 + LVCA: 8 + LVCB: 16 + LVPA: 4 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 96 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_MT16x16x32_SN_SU0_SUM0_TT2_2_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 97 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_MT32x16x32_SN_SU0_SUM0_TT2_2_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 8 + LSPA: 8 + LSPB: 16 + LVCA: 8 + LVCB: 4 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 819 + LdsNumElementsAlignedA: 128 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 256 + LdsOffsetB: 128 + LdsOffsetB_Blk: 384 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 98 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_MT16x16x8_SN_SU32_SUM3_TT2_2_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 8 + LSPA: 8 + LSPB: 16 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 896 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 99 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_MT32x16x8_SN_SU32_SUM3_TT2_2_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 4 + LSPB: 32 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 100 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_MT64x32x8_SN_SU32_SUM3_TT4_4_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 8 + LSPA: 8 + LSPB: 32 + LVCA: 32 + LVCB: 8 + LVPA: 8 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 101 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_MT32x32x8_SN_SU32_SUM3_TT2_2_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 4 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 102 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_MT64x64x8_SN_SU32_SUM3_TT4_4_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 103 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_MT16x16x16_SN_SU32_SUM3_TT2_2_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 4 + LSPB: 8 + LVCA: 16 + LVCB: 8 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 104 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_MT32x32x16_SN_SU32_SUM3_TT4_4_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 8 + LSPB: 16 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 105 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_MT32x16x16_SN_SU32_SUM3_TT2_2_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 4 + LSPB: 16 + LVCA: 32 + LVCB: 8 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 106 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_MT64x32x16_SN_SU32_SUM3_TT4_4_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 8 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 107 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_MT32x32x16_SN_SU32_SUM3_TT2_2_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 32 + LSPA: 8 + LSPB: 4 + LVCA: 8 + LVCB: 16 + LVPA: 4 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 108 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_MT16x16x32_SN_SU32_SUM3_TT2_2_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 109 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_MT32x16x32_SN_SU32_SUM3_TT2_2_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 8 + LSPA: 8 + LSPB: 16 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 896 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 110 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_MT32x16x8_SN_SU0_SUM0_TT2_2_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 4 + LSPB: 32 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 111 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_MT64x32x8_SN_SU0_SUM0_TT4_4_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 8 + LSPA: 8 + LSPB: 32 + LVCA: 32 + LVCB: 8 + LVPA: 8 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 112 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_MT32x32x8_SN_SU0_SUM0_TT2_2_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 4 + LSPB: 8 + LVCA: 16 + LVCB: 8 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 113 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_MT32x32x16_SN_SU0_SUM0_TT4_4_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 8 + LSPB: 16 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 114 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_MT32x16x16_SN_SU0_SUM0_TT2_2_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 4 + LSPB: 16 + LVCA: 32 + LVCB: 8 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 115 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_MT64x32x16_SN_SU0_SUM0_TT4_4_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 116 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_MT32x16x32_SN_SU0_SUM0_TT2_2_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 8 + LSPA: 4 + LSPB: 16 + LVCA: 16 + LVCB: 4 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 117 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_MT32x32x8_SN_SU32_SUM3_TT4_4_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 8 + LSPA: 8 + LSPB: 16 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 896 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 118 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_MT32x16x8_SN_SU32_SUM3_TT2_2_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 4 + LSPB: 32 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 119 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_MT64x32x8_SN_SU32_SUM3_TT4_4_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 8 + LSPA: 8 + LSPB: 32 + LVCA: 32 + LVCB: 8 + LVPA: 8 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 120 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_MT32x32x8_SN_SU32_SUM3_TT2_2_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 121 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_MT16x16x16_SN_SU32_SUM3_TT2_2_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 4 + LSPB: 8 + LVCA: 16 + LVCB: 8 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 122 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_MT32x32x16_SN_SU32_SUM3_TT4_4_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 8 + LSPB: 16 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 123 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_MT32x16x16_SN_SU32_SUM3_TT2_2_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 4 + LSPB: 16 + LVCA: 32 + LVCB: 8 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 124 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_MT64x32x16_SN_SU32_SUM3_TT4_4_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 125 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_MT32x16x32_SN_SU32_SUM3_TT2_2_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 4 + LSPB: 32 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 126 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_MT64x32x8_SN_SU0_SUM0_TT4_4_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 8 + LSPB: 16 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 127 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_MT32x16x16_SN_SU0_SUM0_TT2_2_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 4 + LSPB: 16 + LVCA: 32 + LVCB: 8 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 128 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_MT64x32x16_SN_SU0_SUM0_TT4_4_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 32 + LSPA: 8 + LSPB: 4 + LVCA: 8 + LVCB: 16 + LVPA: 4 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 129 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_MT16x16x32_SN_SU0_SUM0_TT2_2_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 130 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_MT32x16x32_SN_SU0_SUM0_TT2_2_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 4 + LSPB: 32 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 131 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_MT64x32x8_SN_SU32_SUM3_TT4_4_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 8 + LSPA: 8 + LSPB: 32 + LVCA: 32 + LVCB: 8 + LVPA: 8 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 132 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_MT32x32x8_SN_SU32_SUM3_TT2_2_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 133 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_MT16x16x16_SN_SU32_SUM3_TT2_2_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 4 + LSPB: 8 + LVCA: 16 + LVCB: 8 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 134 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_MT32x32x16_SN_SU32_SUM3_TT4_4_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 8 + LSPB: 16 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 135 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_MT32x16x16_SN_SU32_SUM3_TT2_2_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 4 + LSPB: 16 + LVCA: 32 + LVCB: 8 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 136 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_MT64x32x16_SN_SU32_SUM3_TT4_4_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 137 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_MT32x16x32_SN_SU32_SUM3_TT2_2_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 8 + LSPA: 4 + LSPB: 8 + LVCA: 16 + LVCB: 8 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 832 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 138 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_MT32x8x8_SN_SU0_SUM0_TT2_2_WG16_4_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 2 + LSPB: 8 + LVCA: 32 + LVCB: 8 + LVPA: 1 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1600 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 8 + MacroTileA: 64 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 139 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_MT64x8x8_SN_SU0_SUM0_TT4_2_WG16_4_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 2 + LSPB: 8 + LVCA: 32 + LVCB: 8 + LVPA: 1 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3200 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 8 + MacroTileA: 64 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 140 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_MT64x8x16_SN_SU0_SUM0_TT4_2_WG16_4_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 4 + LSPB: 16 + LVCA: 32 + LVCB: 8 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3328 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 141 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_MT64x16x16_SN_SU0_SUM0_TT4_2_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3200 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 8 + MacroTileA: 64 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 142 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_MT64x8x16_SN_SU0_SUM0_TT2_2_WG32_4_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 32 + SubGroup1: 4 + SubGroupA: 32 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 2 + LSPB: 8 + LVCA: 64 + LVCB: 16 + LVPA: 1 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 6272 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 8 + MacroTileA: 128 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 143 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_MT128x8x16_SN_SU0_SUM0_TT4_2_WG32_4_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 32 + SubGroup1: 4 + SubGroupA: 32 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 6656 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 144 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_MT64x16x32_SN_SU0_SUM0_TT4_2_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 6400 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 8 + MacroTileA: 64 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 8 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 145 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_MT64x8x32_SN_SU0_SUM0_TT2_2_WG32_4_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 32 + SubGroup1: 4 + SubGroupA: 32 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 2 + LSPB: 8 + LVCA: 64 + LVCB: 16 + LVPA: 1 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 12544 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 8 + MacroTileA: 128 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 16 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 146 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_MT128x8x32_SN_SU0_SUM0_TT4_2_WG32_4_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 32 + SubGroup1: 4 + SubGroupA: 32 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 4 + LSPB: 16 + LVCA: 32 + LVCB: 8 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3328 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 147 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_MT64x16x16_SN_SU32_SUM3_TT4_2_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 2 + LSPB: 8 + LVCA: 64 + LVCB: 16 + LVPA: 1 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 6272 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 8 + MacroTileA: 128 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 148 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_MT128x8x16_SN_SU32_SUM3_TT4_2_WG32_4_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 32 + SubGroup1: 4 + SubGroupA: 32 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 6400 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 8 + MacroTileA: 64 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 8 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 149 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_MT64x8x32_SN_SU32_SUM3_TT2_2_WG32_4_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 32 + SubGroup1: 4 + SubGroupA: 32 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 2 + LSPB: 8 + LVCA: 32 + LVCB: 8 + LVPA: 1 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1600 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 8 + MacroTileA: 64 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 150 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_MT64x8x8_SN_SU0_SUM0_TT4_2_WG16_4_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 4 + LSPB: 16 + LVCA: 32 + LVCB: 8 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1664 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 151 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_MT64x16x8_SN_SU0_SUM0_TT4_2_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 2 + LSPB: 8 + LVCA: 32 + LVCB: 8 + LVPA: 1 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3200 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 8 + MacroTileA: 64 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 152 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_MT64x8x16_SN_SU0_SUM0_TT4_2_WG16_4_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 4 + LSPB: 16 + LVCA: 32 + LVCB: 8 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3328 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 153 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_MT64x16x16_SN_SU0_SUM0_TT4_2_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3200 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 8 + MacroTileA: 64 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 154 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_MT64x8x16_SN_SU0_SUM0_TT2_2_WG32_4_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 32 + SubGroup1: 4 + SubGroupA: 32 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 2 + LSPB: 8 + LVCA: 64 + LVCB: 16 + LVPA: 1 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 6272 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 8 + MacroTileA: 128 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 155 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_MT128x8x16_SN_SU0_SUM0_TT4_2_WG32_4_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 32 + SubGroup1: 4 + SubGroupA: 32 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 4 + LSPB: 16 + LVCA: 32 + LVCB: 8 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3328 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 156 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_MT64x16x16_SN_SU32_SUM3_TT4_2_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 6400 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 8 + MacroTileA: 64 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 8 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 157 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_MT64x8x32_SN_SU32_SUM3_TT2_2_WG32_4_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 32 + SubGroup1: 4 + SubGroupA: 32 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 2 + LSPB: 8 + LVCA: 32 + LVCB: 8 + LVPA: 1 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1600 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 8 + MacroTileA: 64 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 158 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_MT64x8x8_SN_SU0_SUM0_TT4_2_WG16_4_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 4 + LSPB: 8 + LVCA: 16 + LVCB: 8 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1664 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 159 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_MT32x8x16_SN_SU0_SUM0_TT2_2_WG16_4_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 4 + LSPB: 16 + LVCA: 32 + LVCB: 8 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3328 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 160 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_MT64x16x16_SN_SU0_SUM0_TT4_2_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3200 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 8 + MacroTileA: 64 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 161 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_MT64x8x16_SN_SU0_SUM0_TT2_2_WG32_4_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 32 + SubGroup1: 4 + SubGroupA: 32 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 2 + LSPB: 8 + LVCA: 64 + LVCB: 16 + LVPA: 1 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 6272 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 8 + MacroTileA: 128 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 162 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_MT128x8x16_SN_SU0_SUM0_TT4_2_WG32_4_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 32 + SubGroup1: 4 + SubGroupA: 32 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 6400 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 8 + MacroTileA: 64 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 8 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 163 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_MT64x8x32_SN_SU0_SUM0_TT2_2_WG32_4_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 32 + SubGroup1: 4 + SubGroupA: 32 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 2 + LSPB: 8 + LVCA: 64 + LVCB: 16 + LVPA: 1 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 12544 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 8 + MacroTileA: 128 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 16 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 164 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_MT128x8x32_SN_SU0_SUM0_TT4_2_WG32_4_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 32 + SubGroup1: 4 + SubGroupA: 32 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 2 + LSPB: 8 + LVCA: 64 + LVCB: 16 + LVPA: 1 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 6272 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 8 + MacroTileA: 128 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 165 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_MT128x8x16_SN_SU32_SUM3_TT4_2_WG32_4_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 32 + SubGroup1: 4 + SubGroupA: 32 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 8 + LSPB: 16 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 166 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_MT32x16x16_SN_SU0_SUM0_TT2_2_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 16 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 167 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_MT32x16x32_SN_SU0_SUM0_TT2_2_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 16 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 4 + LSPB: 32 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 168 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_MT64x32x8_SN_SU32_SUM3_TT4_4_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 16 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 8 + LSPB: 16 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 169 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_MT32x16x16_SN_SU0_SUM0_TT2_2_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 16 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 170 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_MT32x16x32_SN_SU0_SUM0_TT2_2_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 16 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 8 + LSPB: 16 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 171 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_MT32x16x16_SN_SU32_SUM3_TT2_2_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 16 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 8 + LSPB: 16 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 172 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_MT32x16x16_SN_SU0_SUM0_TT2_2_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 16 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 8 + LSPB: 16 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 173 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_MT32x16x16_SN_SU32_SUM3_TT2_2_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 16 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 174 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_MT32x16x32_SN_SU32_SUM3_TT2_2_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 16 + _staggerStrideShift: 1 +- [2, 3, 0, 1] +- - - [2944, 4288, 1, 1280, 2944, 2944, 2944, 1280] + - [26, 21908.0] + - - [2368, 5888, 1, 256, 2368, 2368, 2368, 256] + - [16, 20886.0] + - - [5888, 1856, 1, 3328, 5888, 5888, 5888, 3328] + - [3, 21616.0] + - - [5888, 2944, 1, 3328, 5888, 5888, 5888, 3328] + - [31, 22867.0] + - - [1856, 4288, 1, 256, 1856, 1856, 1856, 256] + - [3, 19489.0] + - - [5056, 5056, 1, 3328, 5056, 5056, 5056, 3328] + - [16, 22392.0] + - - [1408, 5888, 1, 1280, 1408, 1408, 1408, 1280] + - [16, 21984.0] + - - [1024, 3584, 1, 3328, 1024, 1024, 1024, 3328] + - [3, 21066.0] + - - [448, 3584, 1, 3328, 448, 448, 448, 3328] + - [18, 17290.0] + - - [5888, 1408, 1, 1280, 5888, 5888, 5888, 1280] + - [26, 22111.0] + - - [1024, 2368, 1, 256, 1024, 1024, 1024, 256] + - [16, 17546.0] + - - [5056, 6784, 1, 1280, 5056, 5056, 5056, 1280] + - [3, 22609.0] + - - [5056, 5056, 1, 1280, 5056, 5056, 5056, 1280] + - [3, 22282.0] + - - [4288, 6784, 1, 256, 4288, 4288, 4288, 256] + - [16, 21672.0] + - - [6784, 448, 1, 256, 6784, 6784, 6784, 256] + - [15, 17145.0] + - - [5056, 256, 1, 1280, 5056, 5056, 5056, 1280] + - [3, 20403.0] + - - [5888, 704, 1, 1280, 5888, 5888, 5888, 1280] + - [26, 20025.0] + - - [3584, 1024, 1, 256, 3584, 3584, 3584, 256] + - [11, 19573.0] + - - [6784, 4288, 1, 3328, 6784, 6784, 6784, 3328] + - [16, 22573.0] + - - [1856, 2368, 1, 3328, 1856, 1856, 1856, 3328] + - [3, 20208.0] + - - [5888, 2944, 1, 1280, 5888, 5888, 5888, 1280] + - [16, 22829.0] + - - [5888, 1024, 1, 256, 5888, 5888, 5888, 256] + - [3, 20802.0] + - - [1408, 2944, 1, 256, 1408, 1408, 1408, 256] + - [3, 20446.0] + - - [6784, 5056, 1, 3328, 6784, 6784, 6784, 3328] + - [21, 22721.0] + - - [5056, 5056, 1, 256, 5056, 5056, 5056, 256] + - [16, 21240.0] + - - [1024, 3584, 1, 1280, 1024, 1024, 1024, 1280] + - [11, 20727.0] + - - [2368, 2944, 1, 1280, 2368, 2368, 2368, 1280] + - [11, 21776.0] + - - [6784, 6784, 1, 1280, 6784, 6784, 6784, 1280] + - [16, 22982.0] + - - [1408, 4288, 1, 1280, 1408, 1408, 1408, 1280] + - [26, 21560.0] + - - [3584, 4288, 1, 1280, 3584, 3584, 3584, 1280] + - [3, 22329.0] + - - [2368, 704, 1, 1280, 2368, 2368, 2368, 1280] + - [16, 17893.0] + - - [5056, 4288, 1, 3328, 5056, 5056, 5056, 3328] + - [11, 22310.0] + - - [3584, 2368, 1, 3328, 3584, 3584, 3584, 3328] + - [16, 21614.0] + - - [5888, 6784, 1, 1280, 5888, 5888, 5888, 1280] + - [3, 22974.0] + - - [6784, 448, 1, 1280, 6784, 6784, 6784, 1280] + - [11, 18376.0] + - - [2944, 5888, 1, 256, 2944, 2944, 2944, 256] + - [3, 22071.0] + - - [4288, 2944, 1, 256, 4288, 4288, 4288, 256] + - [3, 20993.0] + - - [5056, 2368, 1, 1280, 5056, 5056, 5056, 1280] + - [31, 21814.0] + - - [448, 3584, 1, 1280, 448, 448, 448, 1280] + - [16, 17262.0] + - - [6784, 5888, 1, 256, 6784, 6784, 6784, 256] + - [3, 22332.0] + - - [1024, 1408, 1, 256, 1024, 1024, 1024, 256] + - [3, 16419.0] + - - [2368, 2368, 1, 3328, 2368, 2368, 2368, 3328] + - [3, 20558.0] + - - [5056, 704, 1, 3328, 5056, 5056, 5056, 3328] + - [3, 20224.0] + - - [1408, 1856, 1, 256, 1408, 1408, 1408, 256] + - [31, 16405.0] + - - [5888, 1856, 1, 256, 5888, 5888, 5888, 256] + - [26, 20595.0] + - - [704, 5888, 1, 256, 704, 704, 704, 256] + - [21, 18353.0] + - - [3584, 704, 1, 3328, 3584, 3584, 3584, 3328] + - [3, 18958.0] + - - [1408, 1408, 1, 256, 1408, 1408, 1408, 256] + - [16, 16132.0] + - - [448, 4288, 1, 256, 448, 448, 448, 256] + - [16, 14456.0] + - - [704, 2368, 1, 1280, 704, 704, 704, 1280] + - [26, 17402.0] + - - [1856, 2368, 1, 1280, 1856, 1856, 1856, 1280] + - [21, 19736.0] + - - [1408, 1408, 1, 3328, 1408, 1408, 1408, 3328] + - [16, 18893.0] + - - [1408, 1024, 1, 1280, 1408, 1408, 1408, 1280] + - [3, 18012.0] + - - [704, 6784, 1, 256, 704, 704, 704, 256] + - [21, 17225.0] + - - [6784, 704, 1, 256, 6784, 6784, 6784, 256] + - [3, 17471.0] + - - [5056, 704, 1, 256, 5056, 5056, 5056, 256] + - [2, 16424.0] + - - [1408, 3584, 1, 256, 1408, 1408, 1408, 256] + - [16, 19597.0] + - - [3584, 4288, 1, 3328, 3584, 3584, 3584, 3328] + - [16, 21961.0] + - - [5888, 1856, 1, 1280, 5888, 5888, 5888, 1280] + - [3, 21688.0] + - - [2368, 3584, 1, 1280, 2368, 2368, 2368, 1280] + - [16, 21702.0] + - - [2944, 3584, 1, 3328, 2944, 2944, 2944, 3328] + - [3, 22441.0] + - - [6784, 2944, 1, 256, 6784, 6784, 6784, 256] + - [3, 22095.0] + - - [1856, 2368, 1, 256, 1856, 1856, 1856, 256] + - [26, 18542.0] + - - [3584, 6784, 1, 3328, 3584, 3584, 3584, 3328] + - [3, 22877.0] + - - [5056, 4288, 1, 1280, 5056, 5056, 5056, 1280] + - [3, 22217.0] + - - [6784, 1856, 1, 3328, 6784, 6784, 6784, 3328] + - [3, 22052.0] + - - [1408, 5056, 1, 1280, 1408, 1408, 1408, 1280] + - [16, 22186.0] + - - [6784, 5888, 1, 3328, 6784, 6784, 6784, 3328] + - [16, 23067.0] + - - [2368, 5056, 1, 1280, 2368, 2368, 2368, 1280] + - [3, 21725.0] + - - [1024, 5056, 1, 1280, 1024, 1024, 1024, 1280] + - [16, 21897.0] + - - [4288, 1024, 1, 256, 4288, 4288, 4288, 256] + - [3, 19624.0] + - - [2368, 1408, 1, 256, 2368, 2368, 2368, 256] + - [11, 18718.0] + - - [5888, 448, 1, 1280, 5888, 5888, 5888, 1280] + - [16, 17715.0] + - - [704, 5888, 1, 3328, 704, 704, 704, 3328] + - [5, 20403.0] + - - [1024, 6784, 1, 1280, 1024, 1024, 1024, 1280] + - [16, 21831.0] + - - [3584, 2944, 1, 1280, 3584, 3584, 3584, 1280] + - [31, 22313.0] + - - [1408, 5056, 1, 3328, 1408, 1408, 1408, 3328] + - [3, 22384.0] + - - [1856, 1856, 1, 3328, 1856, 1856, 1856, 3328] + - [3, 19488.0] + - - [2368, 2368, 1, 256, 2368, 2368, 2368, 256] + - [16, 19074.0] + - - [4288, 4288, 1, 1280, 4288, 4288, 4288, 1280] + - [16, 22128.0] + - - [5888, 1024, 1, 1280, 5888, 5888, 5888, 1280] + - [16, 21826.0] + - - [704, 6784, 1, 3328, 704, 704, 704, 3328] + - [3, 20568.0] + - - [5888, 5888, 1, 3328, 5888, 5888, 5888, 3328] + - [26, 23060.0] + - - [5056, 1024, 1, 1280, 5056, 5056, 5056, 1280] + - [3, 21832.0] + - - [448, 5888, 1, 3328, 448, 448, 448, 3328] + - [5, 18023.0] + - - [1024, 2944, 1, 1280, 1024, 1024, 1024, 1280] + - [3, 19940.0] + - - [5056, 5888, 1, 1280, 5056, 5056, 5056, 1280] + - [3, 22609.0] + - - [448, 6784, 1, 256, 448, 448, 448, 256] + - [21, 16498.0] + - - [3584, 5888, 1, 256, 3584, 3584, 3584, 256] + - [21, 22027.0] + - - [2944, 3584, 1, 256, 2944, 2944, 2944, 256] + - [3, 21373.0] + - - [6784, 1024, 1, 3328, 6784, 6784, 6784, 3328] + - [5, 22143.0] + - - [6784, 2944, 1, 3328, 6784, 6784, 6784, 3328] + - [5, 21789.0] + - - [6784, 2368, 1, 1280, 6784, 6784, 6784, 1280] + - [11, 21980.0] + - - [4288, 3584, 1, 256, 4288, 4288, 4288, 256] + - [11, 21305.0] + - - [4288, 5888, 1, 1280, 4288, 4288, 4288, 1280] + - [3, 22436.0] + - - [4288, 1856, 1, 1280, 4288, 4288, 4288, 1280] + - [3, 21146.0] + - - [1856, 2944, 1, 3328, 1856, 1856, 1856, 3328] + - [3, 21028.0] + - - [256, 6784, 1, 3328, 256, 256, 256, 3328] + - [21, 19283.0] + - - [5056, 1024, 1, 256, 5056, 5056, 5056, 256] + - [12, 18652.0] + - - [5056, 1856, 1, 3328, 5056, 5056, 5056, 3328] + - [3, 21787.0] + - - [5056, 256, 1, 3328, 5056, 5056, 5056, 3328] + - [18, 20464.0] + - - [1024, 5888, 1, 1280, 1024, 1024, 1024, 1280] + - [3, 21467.0] + - - [5056, 3584, 1, 256, 5056, 5056, 5056, 256] + - [11, 21586.0] + - - [1856, 1024, 1, 1280, 1856, 1856, 1856, 1280] + - [5, 20239.0] + - - [1856, 1856, 1, 1280, 1856, 1856, 1856, 1280] + - [3, 18984.0] + - - [1856, 1024, 1, 3328, 1856, 1856, 1856, 3328] + - [5, 20612.0] + - - [6784, 1024, 1, 256, 6784, 6784, 6784, 256] + - [16, 20873.0] + - - [5056, 5888, 1, 3328, 5056, 5056, 5056, 3328] + - [3, 22710.0] + - - [1856, 1024, 1, 256, 1856, 1856, 1856, 256] + - [4, 16572.0] + - - [5056, 1408, 1, 3328, 5056, 5056, 5056, 3328] + - [16, 22211.0] + - - [448, 5888, 1, 256, 448, 448, 448, 256] + - [16, 14913.0] + - - [1408, 6784, 1, 3328, 1408, 1408, 1408, 3328] + - [3, 22289.0] + - - [2944, 1408, 1, 3328, 2944, 2944, 2944, 3328] + - [26, 21656.0] + - - [2944, 4288, 1, 3328, 2944, 2944, 2944, 3328] + - [5, 22239.0] + - - [5056, 2944, 1, 256, 5056, 5056, 5056, 256] + - [16, 21424.0] + - - [2368, 1856, 1, 256, 2368, 2368, 2368, 256] + - [11, 18475.0] + - - [1408, 3584, 1, 3328, 1408, 1408, 1408, 3328] + - [3, 21796.0] + - - [2368, 6784, 1, 256, 2368, 2368, 2368, 256] + - [16, 21055.0] + - - [4288, 2368, 1, 3328, 4288, 4288, 4288, 3328] + - [3, 21551.0] + - - [704, 3584, 1, 1280, 704, 704, 704, 1280] + - [3, 18201.0] + - - [1408, 5888, 1, 3328, 1408, 1408, 1408, 3328] + - [18, 22095.0] + - - [1856, 5056, 1, 256, 1856, 1856, 1856, 256] + - [21, 19585.0] + - - [6784, 6784, 1, 256, 6784, 6784, 6784, 256] + - [16, 22377.0] + - - [2368, 4288, 1, 1280, 2368, 2368, 2368, 1280] + - [3, 21402.0] + - - [3584, 1856, 1, 1280, 3584, 3584, 3584, 1280] + - [16, 21696.0] + - - [3584, 448, 1, 256, 3584, 3584, 3584, 256] + - [2, 16574.0] + - - [3584, 3584, 1, 1280, 3584, 3584, 3584, 1280] + - [3, 22476.0] + - - [256, 6784, 1, 256, 256, 256, 256, 256] + - [3, 17179.0] + - - [1856, 3584, 1, 3328, 1856, 1856, 1856, 3328] + - [3, 21878.0] + - - [3584, 3584, 1, 256, 3584, 3584, 3584, 256] + - [21, 21543.0] + - - [6784, 4288, 1, 1280, 6784, 6784, 6784, 1280] + - [11, 22479.0] + - - [3584, 5056, 1, 256, 3584, 3584, 3584, 256] + - [11, 21324.0] + - - [2944, 2368, 1, 1280, 2944, 2944, 2944, 1280] + - [21, 21778.0] + - - [6784, 3584, 1, 256, 6784, 6784, 6784, 256] + - [3, 22080.0] + - - [1856, 1408, 1, 256, 1856, 1856, 1856, 256] + - [11, 17485.0] + - - [2944, 2944, 1, 3328, 2944, 2944, 2944, 3328] + - [3, 22438.0] + - - [5056, 6784, 1, 256, 5056, 5056, 5056, 256] + - [21, 21877.0] + - - [1408, 4288, 1, 3328, 1408, 1408, 1408, 3328] + - [3, 22009.0] + - - [6784, 256, 1, 1280, 6784, 6784, 6784, 1280] + - [3, 18543.0] + - - [2368, 704, 1, 3328, 2368, 2368, 2368, 3328] + - [3, 18027.0] + - - [3584, 6784, 1, 256, 3584, 3584, 3584, 256] + - [26, 22071.0] + - - [5056, 1856, 1, 256, 5056, 5056, 5056, 256] + - [11, 20181.0] + - - [704, 4288, 1, 256, 704, 704, 704, 256] + - [21, 16859.0] + - - [1408, 6784, 1, 1280, 1408, 1408, 1408, 1280] + - [16, 22207.0] + - - [5056, 2368, 1, 3328, 5056, 5056, 5056, 3328] + - [3, 21964.0] + - - [2944, 4288, 1, 256, 2944, 2944, 2944, 256] + - [16, 21029.0] + - - [1408, 3584, 1, 1280, 1408, 1408, 1408, 1280] + - [3, 21578.0] + - - [2368, 6784, 1, 3328, 2368, 2368, 2368, 3328] + - [16, 20977.0] + - - [5056, 704, 1, 1280, 5056, 5056, 5056, 1280] + - [16, 19350.0] + - - [1856, 4288, 1, 3328, 1856, 1856, 1856, 3328] + - [18, 21214.0] + - - [1408, 5888, 1, 256, 1408, 1408, 1408, 256] + - [3, 21219.0] + - - [704, 2944, 1, 1280, 704, 704, 704, 1280] + - [3, 19246.0] + - - [3584, 704, 1, 1280, 3584, 3584, 3584, 1280] + - [26, 18304.0] + - - [5888, 5056, 1, 256, 5888, 5888, 5888, 256] + - [11, 21817.0] + - - [3584, 448, 1, 3328, 3584, 3584, 3584, 3328] + - [25, 17676.0] + - - [704, 2368, 1, 3328, 704, 704, 704, 3328] + - [18, 18080.0] + - - [448, 5056, 1, 3328, 448, 448, 448, 3328] + - [5, 19059.0] + - - [4288, 448, 1, 256, 4288, 4288, 4288, 256] + - [0, 15151.0] + - - [5888, 2368, 1, 256, 5888, 5888, 5888, 256] + - [31, 21068.0] + - - [6784, 704, 1, 3328, 6784, 6784, 6784, 3328] + - [3, 20560.0] + - - [1408, 2944, 1, 3328, 1408, 1408, 1408, 3328] + - [26, 21684.0] + - - [4288, 4288, 1, 256, 4288, 4288, 4288, 256] + - [26, 21218.0] + - - [2368, 704, 1, 256, 2368, 2368, 2368, 256] + - [10, 15485.0] + - - [3584, 2368, 1, 256, 3584, 3584, 3584, 256] + - [16, 20493.0] + - - [5888, 5056, 1, 1280, 5888, 5888, 5888, 1280] + - [3, 22614.0] + - - [3584, 3584, 1, 3328, 3584, 3584, 3584, 3328] + - [3, 22580.0] + - - [5888, 6784, 1, 256, 5888, 5888, 5888, 256] + - [16, 22327.0] + - - [4288, 2944, 1, 3328, 4288, 4288, 4288, 3328] + - [3, 22191.0] + - - [256, 5056, 1, 1280, 256, 256, 256, 1280] + - [31, 19436.0] + - - [6784, 5888, 1, 1280, 6784, 6784, 6784, 1280] + - [3, 22978.0] + - - [5888, 4288, 1, 1280, 5888, 5888, 5888, 1280] + - [26, 22424.0] + - - [1408, 1856, 1, 1280, 1408, 1408, 1408, 1280] + - [3, 19326.0] + - - [5888, 448, 1, 3328, 5888, 5888, 5888, 3328] + - [16, 18030.0] + - - [704, 5888, 1, 1280, 704, 704, 704, 1280] + - [3, 19616.0] + - - [5056, 2944, 1, 3328, 5056, 5056, 5056, 3328] + - [26, 22408.0] + - - [448, 4288, 1, 1280, 448, 448, 448, 1280] + - [31, 17012.0] + - - [3584, 704, 1, 256, 3584, 3584, 3584, 256] + - [25, 15731.0] + - - [3584, 1408, 1, 3328, 3584, 3584, 3584, 3328] + - [21, 21683.0] + - - [2368, 1024, 1, 1280, 2368, 2368, 2368, 1280] + - [18, 19107.0] + - - [2944, 6784, 1, 1280, 2944, 2944, 2944, 1280] + - [11, 22814.0] + - - [1856, 6784, 1, 256, 1856, 1856, 1856, 256] + - [16, 20798.0] + - - [4288, 448, 1, 3328, 4288, 4288, 4288, 3328] + - [21, 18175.0] + - - [6784, 704, 1, 1280, 6784, 6784, 6784, 1280] + - [26, 19958.0] + - - [5888, 1024, 1, 3328, 5888, 5888, 5888, 3328] + - [3, 22048.0] + - - [704, 6784, 1, 1280, 704, 704, 704, 1280] + - [11, 20257.0] + - - [5056, 1024, 1, 3328, 5056, 5056, 5056, 3328] + - [3, 22034.0] + - - [704, 5056, 1, 1280, 704, 704, 704, 1280] + - [3, 19519.0] + - - [2944, 1856, 1, 256, 2944, 2944, 2944, 256] + - [31, 18711.0] + - - [5888, 5056, 1, 3328, 5888, 5888, 5888, 3328] + - [16, 22684.0] + - - [3584, 6784, 1, 1280, 3584, 3584, 3584, 1280] + - [3, 22743.0] + - - [1856, 5888, 1, 256, 1856, 1856, 1856, 256] + - [3, 20680.0] + - - [4288, 4288, 1, 3328, 4288, 4288, 4288, 3328] + - [16, 22249.0] + - - [4288, 1408, 1, 1280, 4288, 4288, 4288, 1280] + - [26, 21668.0] + - - [4288, 2368, 1, 256, 4288, 4288, 4288, 256] + - [16, 20261.0] + - - [2944, 5056, 1, 1280, 2944, 2944, 2944, 1280] + - [21, 22434.0] + - - [6784, 2368, 1, 3328, 6784, 6784, 6784, 3328] + - [21, 21605.0] + - - [4288, 1856, 1, 3328, 4288, 4288, 4288, 3328] + - [16, 21123.0] + - - [1856, 2944, 1, 1280, 1856, 1856, 1856, 1280] + - [31, 20721.0] + - - [4288, 6784, 1, 3328, 4288, 4288, 4288, 3328] + - [21, 22589.0] + - - [3584, 1024, 1, 1280, 3584, 3584, 3584, 1280] + - [11, 20507.0] + - - [1024, 4288, 1, 256, 1024, 1024, 1024, 256] + - [2, 16903.0] + - - [5888, 3584, 1, 3328, 5888, 5888, 5888, 3328] + - [3, 22848.0] + - - [5056, 3584, 1, 3328, 5056, 5056, 5056, 3328] + - [26, 21678.0] + - - [2368, 1408, 1, 1280, 2368, 2368, 2368, 1280] + - [16, 19607.0] + - - [5056, 2944, 1, 1280, 5056, 5056, 5056, 1280] + - [3, 22532.0] + - - [1024, 6784, 1, 256, 1024, 1024, 1024, 256] + - [3, 20814.0] + - - [2944, 5056, 1, 3328, 2944, 2944, 2944, 3328] + - [16, 21858.0] + - - [3584, 2944, 1, 256, 3584, 3584, 3584, 256] + - [31, 20597.0] + - - [5056, 6784, 1, 3328, 5056, 5056, 5056, 3328] + - [3, 22716.0] + - - [3584, 4288, 1, 256, 3584, 3584, 3584, 256] + - [26, 20978.0] + - - [1856, 6784, 1, 3328, 1856, 1856, 1856, 3328] + - [3, 22035.0] + - - [5056, 1408, 1, 1280, 5056, 5056, 5056, 1280] + - [3, 22215.0] + - - [5888, 5888, 1, 256, 5888, 5888, 5888, 256] + - [26, 22314.0] + - - [4288, 1024, 1, 1280, 4288, 4288, 4288, 1280] + - [21, 21082.0] + - - [448, 6784, 1, 3328, 448, 448, 448, 3328] + - [3, 18828.0] + - - [2944, 1408, 1, 1280, 2944, 2944, 2944, 1280] + - [16, 21244.0] + - - [2944, 1856, 1, 3328, 2944, 2944, 2944, 3328] + - [3, 21018.0] + - - [448, 5056, 1, 256, 448, 448, 448, 256] + - [11, 14695.0] + - - [3584, 5888, 1, 1280, 3584, 3584, 3584, 1280] + - [16, 22775.0] + - - [6784, 1856, 1, 1280, 6784, 6784, 6784, 1280] + - [3, 21883.0] + - - [5888, 256, 1, 3328, 5888, 5888, 5888, 3328] + - [11, 19635.0] + - - [1856, 5888, 1, 3328, 1856, 1856, 1856, 3328] + - [16, 21680.0] + - - [3584, 1408, 1, 256, 3584, 3584, 3584, 256] + - [16, 19450.0] + - - [704, 3584, 1, 3328, 704, 704, 704, 3328] + - [26, 18779.0] + - - [5056, 448, 1, 1280, 5056, 5056, 5056, 1280] + - [26, 18051.0] + - - [3584, 1856, 1, 3328, 3584, 3584, 3584, 3328] + - [26, 21698.0] + - - [2944, 1024, 1, 256, 2944, 2944, 2944, 256] + - [26, 17939.0] + - - [2368, 4288, 1, 3328, 2368, 2368, 2368, 3328] + - [3, 21363.0] + - - [1024, 1408, 1, 1280, 1024, 1024, 1024, 1280] + - [11, 18222.0] + - - [6784, 5056, 1, 256, 6784, 6784, 6784, 256] + - [11, 21949.0] + - - [4288, 5888, 1, 256, 4288, 4288, 4288, 256] + - [11, 21701.0] + - - [2944, 6784, 1, 256, 2944, 2944, 2944, 256] + - [11, 22130.0] + - - [2368, 2368, 1, 1280, 2368, 2368, 2368, 1280] + - [11, 20292.0] + - - [1856, 3584, 1, 1280, 1856, 1856, 1856, 1280] + - [3, 21672.0] + - - [3584, 1408, 1, 1280, 3584, 3584, 3584, 1280] + - [16, 21633.0] + - - [5056, 3584, 1, 1280, 5056, 5056, 5056, 1280] + - [11, 22529.0] + - - [256, 5888, 1, 256, 256, 256, 256, 256] + - [11, 17289.0] + - - [1856, 1408, 1, 3328, 1856, 1856, 1856, 3328] + - [3, 19626.0] + - - [1024, 4288, 1, 3328, 1024, 1024, 1024, 3328] + - [5, 21547.0] + - - [2944, 2368, 1, 3328, 2944, 2944, 2944, 3328] + - [3, 21936.0] + - - [704, 4288, 1, 3328, 704, 704, 704, 3328] + - [16, 18580.0] + - - [1024, 1856, 1, 1280, 1024, 1024, 1024, 1280] + - [16, 19574.0] + - - [6784, 1856, 1, 256, 6784, 6784, 6784, 256] + - [11, 20809.0] + - - [1024, 5888, 1, 256, 1024, 1024, 1024, 256] + - [11, 20707.0] + - - [1408, 2368, 1, 256, 1408, 1408, 1408, 256] + - [3, 18317.0] + - - [2944, 704, 1, 3328, 2944, 2944, 2944, 3328] + - [3, 19574.0] + - - [2944, 2944, 1, 1280, 2944, 2944, 2944, 1280] + - [16, 22229.0] + - - [6784, 256, 1, 3328, 6784, 6784, 6784, 3328] + - [11, 19290.0] + - - [1408, 5056, 1, 256, 1408, 1408, 1408, 256] + - [21, 19987.0] + - - [1408, 4288, 1, 256, 1408, 1408, 1408, 256] + - [21, 19491.0] + - - [5888, 2368, 1, 1280, 5888, 5888, 5888, 1280] + - [3, 22065.0] + - - [2368, 5888, 1, 1280, 2368, 2368, 2368, 1280] + - [26, 22066.0] + - - [5888, 256, 1, 1280, 5888, 5888, 5888, 1280] + - [3, 19275.0] + - - [2368, 1856, 1, 3328, 2368, 2368, 2368, 3328] + - [3, 20007.0] + - - [2944, 704, 1, 256, 2944, 2944, 2944, 256] + - [2, 15514.0] + - - [2368, 6784, 1, 1280, 2368, 2368, 2368, 1280] + - [3, 22029.0] + - - [2368, 1024, 1, 3328, 2368, 2368, 2368, 3328] + - [5, 20414.0] + - - [1856, 4288, 1, 1280, 1856, 1856, 1856, 1280] + - [16, 21110.0] + - - [704, 3584, 1, 256, 704, 704, 704, 256] + - [3, 16812.0] + - - [704, 2944, 1, 3328, 704, 704, 704, 3328] + - [11, 19707.0] + - - [1856, 5056, 1, 3328, 1856, 1856, 1856, 3328] + - [3, 21720.0] + - - [3584, 5056, 1, 1280, 3584, 3584, 3584, 1280] + - [16, 22505.0] + - - [2944, 1024, 1, 3328, 2944, 2944, 2944, 3328] + - [16, 20472.0] + - - [1408, 6784, 1, 256, 1408, 1408, 1408, 256] + - [26, 21164.0] + - - [6784, 1408, 1, 3328, 6784, 6784, 6784, 3328] + - [3, 22315.0] + - - [1024, 2368, 1, 1280, 1024, 1024, 1024, 1280] + - [31, 18387.0] + - - [6784, 2944, 1, 1280, 6784, 6784, 6784, 1280] + - [3, 22806.0] + - - [3584, 448, 1, 1280, 3584, 3584, 3584, 1280] + - [25, 17470.0] + - - [2944, 6784, 1, 3328, 2944, 2944, 2944, 3328] + - [26, 22890.0] + - - [448, 5056, 1, 1280, 448, 448, 448, 1280] + - [16, 18246.0] + - - [4288, 5056, 1, 1280, 4288, 4288, 4288, 1280] + - [16, 22151.0] + - - [4288, 704, 1, 256, 4288, 4288, 4288, 256] + - [16, 16910.0] + - - [5888, 704, 1, 256, 5888, 5888, 5888, 256] + - [26, 18290.0] + - - [256, 5888, 1, 3328, 256, 256, 256, 3328] + - [11, 19527.0] + - - [6784, 4288, 1, 256, 6784, 6784, 6784, 256] + - [11, 21587.0] + - - [5888, 256, 1, 256, 5888, 5888, 5888, 256] + - [11, 17335.0] + - - [6784, 1024, 1, 1280, 6784, 6784, 6784, 1280] + - [11, 21863.0] + - - [2944, 704, 1, 1280, 2944, 2944, 2944, 1280] + - [16, 19144.0] + - - [6784, 3584, 1, 1280, 6784, 6784, 6784, 1280] + - [3, 22787.0] + - - [1408, 2944, 1, 1280, 1408, 1408, 1408, 1280] + - [16, 21574.0] + - - [1408, 2368, 1, 3328, 1408, 1408, 1408, 3328] + - [3, 20592.0] + - - [2368, 2944, 1, 256, 2368, 2368, 2368, 256] + - [3, 19742.0] + - - [3584, 1856, 1, 256, 3584, 3584, 3584, 256] + - [31, 19194.0] + - - [4288, 3584, 1, 1280, 4288, 4288, 4288, 1280] + - [3, 22270.0] + - - [4288, 2944, 1, 1280, 4288, 4288, 4288, 1280] + - [26, 22016.0] + - - [5056, 448, 1, 3328, 5056, 5056, 5056, 3328] + - [3, 18920.0] + - - [4288, 5056, 1, 3328, 4288, 4288, 4288, 3328] + - [16, 22252.0] + - - [256, 5056, 1, 3328, 256, 256, 256, 3328] + - [12, 20481.0] + - - [5056, 2368, 1, 256, 5056, 5056, 5056, 256] + - [26, 19759.0] + - - [4288, 704, 1, 3328, 4288, 4288, 4288, 3328] + - [11, 18799.0] + - - [448, 3584, 1, 256, 448, 448, 448, 256] + - [16, 14362.0] + - - [1024, 1408, 1, 3328, 1024, 1024, 1024, 3328] + - [3, 18660.0] + - - [2944, 5888, 1, 1280, 2944, 2944, 2944, 1280] + - [11, 22752.0] + - - [5888, 3584, 1, 256, 5888, 5888, 5888, 256] + - [3, 22023.0] + - - [1408, 1856, 1, 3328, 1408, 1408, 1408, 3328] + - [16, 19751.0] + - - [6784, 1408, 1, 1280, 6784, 6784, 6784, 1280] + - [3, 22190.0] + - - [704, 2944, 1, 256, 704, 704, 704, 256] + - [17, 16196.0] + - - [2944, 5888, 1, 3328, 2944, 2944, 2944, 3328] + - [16, 21499.0] + - - [1408, 1408, 1, 1280, 1408, 1408, 1408, 1280] + - [26, 18071.0] + - - [448, 4288, 1, 3328, 448, 448, 448, 3328] + - [11, 17886.0] + - - [704, 2368, 1, 256, 704, 704, 704, 256] + - [31, 14216.0] + - - [5888, 2368, 1, 3328, 5888, 5888, 5888, 3328] + - [16, 21695.0] + - - [4288, 5056, 1, 256, 4288, 4288, 4288, 256] + - [11, 21137.0] + - - [4288, 448, 1, 1280, 4288, 4288, 4288, 1280] + - [3, 17795.0] + - - [5888, 704, 1, 3328, 5888, 5888, 5888, 3328] + - [3, 20320.0] + - - [4288, 3584, 1, 3328, 4288, 4288, 4288, 3328] + - [21, 22126.0] + - - [1024, 6784, 1, 3328, 1024, 1024, 1024, 3328] + - [3, 22024.0] + - - [1408, 1024, 1, 256, 1408, 1408, 1408, 256] + - [29, 15706.0] + - - [6784, 6784, 1, 3328, 6784, 6784, 6784, 3328] + - [16, 23029.0] + - - [704, 5056, 1, 3328, 704, 704, 704, 3328] + - [5, 20177.0] + - - [3584, 5056, 1, 3328, 3584, 3584, 3584, 3328] + - [26, 21195.0] + - - [2368, 2944, 1, 3328, 2368, 2368, 2368, 3328] + - [16, 21644.0] + - - [2368, 3584, 1, 256, 2368, 2368, 2368, 256] + - [3, 20028.0] + - - [3584, 2368, 1, 1280, 3584, 3584, 3584, 1280] + - [3, 21681.0] + - - [1856, 1856, 1, 256, 1856, 1856, 1856, 256] + - [16, 17787.0] + - - [4288, 1408, 1, 3328, 4288, 4288, 4288, 3328] + - [3, 21992.0] + - - [5888, 1408, 1, 3328, 5888, 5888, 5888, 3328] + - [3, 22315.0] + - - [256, 5056, 1, 256, 256, 256, 256, 256] + - [15, 14076.0] + - - [2368, 5056, 1, 256, 2368, 2368, 2368, 256] + - [16, 20654.0] + - - [1024, 5056, 1, 256, 1024, 1024, 1024, 256] + - [21, 20106.0] + - - [2368, 1408, 1, 3328, 2368, 2368, 2368, 3328] + - [3, 20580.0] + - - [5888, 448, 1, 256, 5888, 5888, 5888, 256] + - [2, 15595.0] + - - [6784, 5056, 1, 1280, 6784, 6784, 6784, 1280] + - [3, 22560.0] + - - [4288, 6784, 1, 1280, 4288, 4288, 4288, 1280] + - [16, 22455.0] + - - [6784, 1408, 1, 256, 6784, 6784, 6784, 256] + - [3, 21026.0] + - - [5888, 4288, 1, 256, 5888, 5888, 5888, 256] + - [16, 21640.0] + - - [5056, 5888, 1, 256, 5056, 5056, 5056, 256] + - [16, 21719.0] + - - [2368, 1024, 1, 256, 2368, 2368, 2368, 256] + - [3, 17615.0] + - - [1856, 6784, 1, 1280, 1856, 1856, 1856, 1280] + - [31, 21786.0] + - - [6784, 448, 1, 3328, 6784, 6784, 6784, 3328] + - [3, 18805.0] + - - [5056, 1856, 1, 1280, 5056, 5056, 5056, 1280] + - [16, 21552.0] + - - [1408, 1024, 1, 3328, 1408, 1408, 1408, 3328] + - [3, 18802.0] + - - [5888, 3584, 1, 1280, 5888, 5888, 5888, 1280] + - [3, 22775.0] + - - [1024, 2944, 1, 256, 1024, 1024, 1024, 256] + - [3, 18972.0] + - - [448, 6784, 1, 1280, 448, 448, 448, 1280] + - [3, 18442.0] + - - [704, 5056, 1, 256, 704, 704, 704, 256] + - [22, 16312.0] + - - [3584, 1024, 1, 3328, 3584, 3584, 3584, 3328] + - [26, 20763.0] + - - [2944, 1856, 1, 1280, 2944, 2944, 2944, 1280] + - [16, 20821.0] + - - [5056, 256, 1, 256, 5056, 5056, 5056, 256] + - [3, 16888.0] + - - [2368, 3584, 1, 3328, 2368, 2368, 2368, 3328] + - [3, 21883.0] + - - [3584, 5888, 1, 3328, 3584, 3584, 3584, 3328] + - [21, 22878.0] + - - [2944, 3584, 1, 1280, 2944, 2944, 2944, 1280] + - [26, 22221.0] + - - [1856, 5888, 1, 1280, 1856, 1856, 1856, 1280] + - [11, 21683.0] + - - [4288, 1408, 1, 256, 4288, 4288, 4288, 256] + - [3, 20178.0] + - - [4288, 2368, 1, 1280, 4288, 4288, 4288, 1280] + - [3, 21378.0] + - - [2944, 5056, 1, 256, 2944, 2944, 2944, 256] + - [3, 21391.0] + - - [6784, 2368, 1, 256, 6784, 6784, 6784, 256] + - [3, 21034.0] + - - [4288, 1856, 1, 256, 4288, 4288, 4288, 256] + - [16, 19858.0] + - - [1856, 2944, 1, 256, 1856, 1856, 1856, 256] + - [26, 19310.0] + - - [1856, 1408, 1, 1280, 1856, 1856, 1856, 1280] + - [3, 19324.0] + - - [1024, 4288, 1, 1280, 1024, 1024, 1024, 1280] + - [16, 20787.0] + - - [2368, 5056, 1, 3328, 2368, 2368, 2368, 3328] + - [21, 21381.0] + - - [4288, 1024, 1, 3328, 4288, 4288, 4288, 3328] + - [3, 21513.0] + - - [1024, 5056, 1, 3328, 1024, 1024, 1024, 3328] + - [5, 21994.0] + - - [1024, 1856, 1, 3328, 1024, 1024, 1024, 3328] + - [16, 20432.0] + - - [3584, 2944, 1, 3328, 3584, 3584, 3584, 3328] + - [31, 22416.0] + - - [5888, 2944, 1, 256, 5888, 5888, 5888, 256] + - [3, 21785.0] + - - [5056, 4288, 1, 256, 5056, 5056, 5056, 256] + - [16, 21184.0] + - - [1024, 3584, 1, 256, 1024, 1024, 1024, 256] + - [3, 19549.0] + - - [5056, 1408, 1, 256, 5056, 5056, 5056, 256] + - [31, 20258.0] + - - [5888, 5888, 1, 1280, 5888, 5888, 5888, 1280] + - [16, 22916.0] + - - [448, 5888, 1, 1280, 448, 448, 448, 1280] + - [3, 17500.0] + - - [4288, 704, 1, 1280, 4288, 4288, 4288, 1280] + - [16, 18172.0] + - - [2944, 1408, 1, 256, 2944, 2944, 2944, 256] + - [11, 19238.0] + - - [2368, 5888, 1, 3328, 2368, 2368, 2368, 3328] + - [26, 21400.0] + - - [2368, 1856, 1, 1280, 2368, 2368, 2368, 1280] + - [16, 19501.0] + - - [5888, 4288, 1, 3328, 5888, 5888, 5888, 3328] + - [3, 22555.0] + - - [5056, 448, 1, 256, 5056, 5056, 5056, 256] + - [2, 16371.0] + - - [1856, 5056, 1, 1280, 1856, 1856, 1856, 1280] + - [21, 21582.0] + - - [2944, 1024, 1, 1280, 2944, 2944, 2944, 1280] + - [3, 20288.0] + - - [2368, 4288, 1, 256, 2368, 2368, 2368, 256] + - [16, 19968.0] + - - [1024, 2368, 1, 3328, 1024, 1024, 1024, 3328] + - [5, 20179.0] + - - [4288, 5888, 1, 3328, 4288, 4288, 4288, 3328] + - [26, 22534.0] + - - [1024, 2944, 1, 3328, 1024, 1024, 1024, 3328] + - [3, 20574.0] + - - [256, 6784, 1, 1280, 256, 256, 256, 1280] + - [16, 18360.0] + - - [1856, 3584, 1, 256, 1856, 1856, 1856, 256] + - [11, 18904.0] + - - [256, 5888, 1, 1280, 256, 256, 256, 1280] + - [31, 18656.0] + - - [2944, 2368, 1, 256, 2944, 2944, 2944, 256] + - [16, 20364.0] + - - [1024, 1856, 1, 256, 1024, 1024, 1024, 256] + - [21, 17084.0] + - - [6784, 3584, 1, 3328, 6784, 6784, 6784, 3328] + - [16, 22884.0] + - - [1024, 5888, 1, 3328, 1024, 1024, 1024, 3328] + - [3, 21960.0] + - - [1408, 2368, 1, 1280, 1408, 1408, 1408, 1280] + - [26, 19765.0] + - - [2944, 2944, 1, 256, 2944, 2944, 2944, 256] + - [26, 21277.0] + - - [6784, 256, 1, 256, 6784, 6784, 6784, 256] + - [16, 17408.0] + - - [5888, 1408, 1, 256, 5888, 5888, 5888, 256] + - [11, 21164.0] + - - [5888, 6784, 1, 3328, 5888, 5888, 5888, 3328] + - [16, 23056.0] + - - [704, 4288, 1, 1280, 704, 704, 704, 1280] + - [16, 18199.0] + - - [4096, 7000, 1, 4096, 4096, 4096, 4096, 4096] + - [31, 21987.0] + - - [5124, 9124, 1, 1760, 5124, 5124, 5124, 1760] + - [14, 22771.0] + - - [1024, 1500, 1, 1536, 1024, 1024, 1024, 1536] + - [16, 19141.0] + - - [512, 24000, 1, 2048, 512, 512, 512, 2048] + - [21, 22226.0] + - - [3072, 24000, 1, 1024, 3072, 3072, 3072, 1024] + - [3, 22910.0] + - - [1024, 3000, 1, 2560, 1024, 1024, 1024, 2560] + - [16, 20649.0] + - - [512, 3136, 1, 2048, 512, 512, 512, 2048] + - [21, 19646.0] + - - [8448, 1500, 1, 2816, 8448, 8448, 8448, 2816] + - [16, 22199.0] + - - [2560, 7000, 1, 2560, 2560, 2560, 2560, 2560] + - [31, 22819.0] + - - [512, 48000, 1, 2048, 512, 512, 512, 2048] + - [31, 22587.0] + - - [196, 256, 64, 1024, 196, 196, 196, 1024] + - [11, 16228.0] + - - [512, 48000, 1, 1536, 512, 512, 512, 1536] + - [31, 22912.0] + - - [4608, 1500, 1, 1536, 4608, 4608, 4608, 1536] + - [16, 21563.0] + - - [1024, 24000, 1, 2560, 1024, 1024, 1024, 2560] + - [11, 22661.0] + - - [4608, 3000, 1, 1536, 4608, 4608, 4608, 1536] + - [16, 21949.0] + - - [5124, 9124, 1, 2048, 5124, 5124, 5124, 2048] + - [3, 22175.0] + - - [5124, 700, 1, 2560, 5124, 5124, 5124, 2560] + - [11, 18837.0] + - - [6144, 6000, 1, 2560, 6144, 6144, 6144, 2560] + - [3, 22961.0] + - - [1024, 1500, 1, 2816, 1024, 1024, 1024, 2816] + - [3, 19229.0] + - - [8448, 48000, 1, 2816, 8448, 8448, 8448, 2816] + - [5, 23149.0] + - - [512, 6000, 1, 2048, 512, 512, 512, 2048] + - [21, 19433.0] + - - [4224, 1500, 1, 176, 4224, 4224, 4224, 176] + - [14, 18882.0] + - - [1024, 6000, 1, 2816, 1024, 1024, 1024, 2816] + - [3, 22093.0] + - - [1024, 48000, 1, 1536, 1024, 1024, 1024, 1536] + - [11, 22840.0] + - - [1024, 48000, 1, 2560, 1024, 1024, 1024, 2560] + - [3, 22955.0] + - - [4608, 24000, 1, 1536, 4608, 4608, 4608, 1536] + - [3, 22977.0] + - - [7680, 48000, 1, 2560, 7680, 7680, 7680, 2560] + - [11, 22951.0] + - - [3072, 48000, 1, 1024, 3072, 3072, 3072, 1024] + - [3, 22977.0] + - - [1024, 1500, 1, 2048, 1024, 1024, 1024, 2048] + - [21, 19282.0] + - - [1024, 3000, 1, 2048, 1024, 1024, 1024, 2048] + - [11, 20148.0] + - - [1024, 6000, 1, 2048, 1024, 1024, 1024, 2048] + - [21, 22152.0] + - - [512, 24000, 1, 2816, 512, 512, 512, 2816] + - [16, 22537.0] + - - [6144, 48000, 1, 2560, 6144, 6144, 6144, 2560] + - [3, 22903.0] + - - [1760, 7000, 1, 1760, 1760, 1760, 1760, 1760] + - [24, 22506.0] + - - [8448, 3000, 1, 2816, 8448, 8448, 8448, 2816] + - [26, 22302.0] + - - [4608, 48000, 1, 1536, 4608, 4608, 4608, 1536] + - [31, 22532.0] + - - [7680, 1500, 1, 2560, 7680, 7680, 7680, 2560] + - [3, 22232.0] + - - [512, 3000, 1, 1536, 512, 512, 512, 1536] + - [11, 19172.0] + - - [1024, 3000, 1, 2816, 1024, 1024, 1024, 2816] + - [18, 20483.0] + - - [5124, 9124, 1, 2560, 5124, 5124, 5124, 2560] + - [3, 22207.0] + - - [512, 48000, 1, 2816, 512, 512, 512, 2816] + - [16, 22886.0] + - - [512, 3000, 1, 2816, 512, 512, 512, 2816] + - [26, 19559.0] + - - [1024, 24000, 1, 1536, 1024, 1024, 1024, 1536] + - [21, 22606.0] + - - [7680, 6000, 1, 2560, 7680, 7680, 7680, 2560] + - [3, 22903.0] + - - [512, 6000, 1, 2560, 512, 512, 512, 2560] + - [11, 20436.0] + - - [512, 24000, 1, 2560, 512, 512, 512, 2560] + - [21, 22402.0] + - - [6144, 3000, 1, 2560, 6144, 6144, 6144, 2560] + - [11, 22237.0] + - - [1024, 24000, 1, 2816, 1024, 1024, 1024, 2816] + - [3, 22703.0] + - - [2048, 7000, 1, 2048, 2048, 2048, 2048, 2048] + - [11, 22604.0] + - - [7680, 3000, 1, 2560, 7680, 7680, 7680, 2560] + - [21, 22373.0] + - - [5124, 700, 1, 2048, 5124, 5124, 5124, 2048] + - [3, 18720.0] + - - [5124, 9124, 1, 4096, 5124, 5124, 5124, 4096] + - [31, 19940.0] + - - [256, 193600, 1, 64, 256, 256, 256, 64] + - [10, 17501.0] + - - [8448, 6000, 1, 2816, 8448, 8448, 8448, 2816] + - [16, 22971.0] + - - [5124, 1500, 1, 2560, 5124, 5124, 5124, 2560] + - [3, 21318.0] + - - [1024, 1500, 1, 2560, 1024, 1024, 1024, 2560] + - [11, 19752.0] + - - [1024, 6000, 1, 2560, 1024, 1024, 1024, 2560] + - [11, 22298.0] + - - [196, 1024, 64, 256, 196, 196, 196, 256] + - [16, 15916.0] + - - [512, 50176, 1, 128, 512, 512, 512, 128] + - [3, 21207.0] + - - [7680, 24000, 1, 2560, 7680, 7680, 7680, 2560] + - [11, 22675.0] + - - [512, 3000, 1, 2560, 512, 512, 512, 2560] + - [11, 19125.0] + - - [8448, 24000, 1, 2816, 8448, 8448, 8448, 2816] + - [21, 22856.0] + - - [512, 6000, 1, 1536, 512, 512, 512, 1536] + - [16, 20215.0] + - - [3072, 6000, 1, 1024, 3072, 3072, 3072, 1024] + - [21, 22570.0] + - - [3072, 1500, 1, 128, 3072, 3072, 3072, 128] + - [16, 17067.0] + - - [2048, 3136, 1, 512, 2048, 2048, 2048, 512] + - [3, 21122.0] + - - [1024, 3000, 1, 1536, 1024, 1024, 1024, 1536] + - [16, 20553.0] + - - [512, 6000, 1, 2816, 512, 512, 512, 2816] + - [5, 20829.0] + - - [128, 50176, 1, 512, 128, 128, 128, 512] + - [31, 21448.0] + - - [256, 12544, 1, 1024, 256, 256, 256, 1024] + - [11, 21342.0] + - - [1024, 12544, 1, 256, 1024, 1024, 1024, 256] + - [3, 21566.0] + - - [512, 48000, 1, 2560, 512, 512, 512, 2560] + - [11, 22773.0] + - - [512, 24000, 1, 1536, 512, 512, 512, 1536] + - [21, 22437.0] + - - [1024, 24000, 1, 2048, 1024, 1024, 1024, 2048] + - [21, 22601.0] + - - [5124, 1500, 1, 2048, 5124, 5124, 5124, 2048] + - [3, 21229.0] + - - [3072, 1500, 1, 1024, 3072, 3072, 3072, 1024] + - [16, 20708.0] + - - [6144, 1500, 1, 2560, 6144, 6144, 6144, 2560] + - [3, 22095.0] + - - [1024, 48000, 1, 2816, 1024, 1024, 1024, 2816] + - [11, 23043.0] + - - [1024, 6000, 1, 1536, 1024, 1024, 1024, 1536] + - [16, 22007.0] + - - [512, 3000, 1, 2048, 512, 512, 512, 2048] + - [11, 19517.0] + - - [6144, 24000, 1, 2560, 6144, 6144, 6144, 2560] + - [11, 22598.0] + - - [4608, 6000, 1, 1536, 4608, 4608, 4608, 1536] + - [31, 22816.0] + - - [3072, 3000, 1, 1024, 3072, 3072, 3072, 1024] + - [3, 21833.0] + - - [1024, 48000, 1, 2048, 1024, 1024, 1024, 2048] + - [11, 22407.0] + - - [784, 512, 64, 128, 784, 784, 784, 128] + - [16, 17901.0] + - - [3136, 256, 64, 64, 3136, 3136, 3136, 64] + - [0, 17305.0] + - - [12544, 1024, 1, 256, 12544, 12544, 12544, 256] + - [3, 21453.0] + - - [784, 128, 128, 512, 784, 784, 784, 512] + - [3, 18817.0] + - - [784, 512, 256, 128, 784, 784, 784, 128] + - [3, 18623.0] + - - [3136, 512, 1, 2048, 3136, 3136, 3136, 2048] + - [3, 20132.0] + - - [12544, 256, 1, 1024, 12544, 12544, 12544, 1024] + - [11, 21436.0] + - - [3136, 2048, 1, 512, 3136, 3136, 3136, 512] + - [11, 21375.0] + - - [3136, 256, 256, 64, 3136, 3136, 3136, 64] + - [8, 12779.0] + - - [784, 128, 64, 512, 784, 784, 784, 512] + - [3, 18644.0] + - - [784, 512, 128, 128, 784, 784, 784, 128] + - [11, 18449.0] + - - [784, 128, 256, 512, 784, 784, 784, 512] + - [26, 19147.0] + - - [3136, 256, 128, 64, 3136, 3136, 3136, 64] + - [0, 17686.0] + - - [128, 128, 512, 64, 128, 128, 128, 64] + - [23, 16279.0] + - - [512, 512, 64, 64, 512, 512, 512, 64] + - [16, 19113.0] + - - [1024, 2048, 1, 2, 1024, 1024, 1024, 2] + - [13, 711.0] + - - [1024, 2048, 1, 1024, 1024, 1024, 1024, 1024] + - [21, 19713.0] + - - [1024, 2048, 1, 4096, 1024, 1024, 1024, 4096] + - [11, 20160.0] + - - [1024, 2048, 1, 30528, 1024, 1024, 1024, 30528] + - [26, 20395.0] + - - [1024, 4096, 1, 1024, 1024, 1024, 1024, 1024] + - [11, 21618.0] + - - [1024, 4096, 1, 4096, 1024, 1024, 1024, 4096] + - [3, 21916.0] + - - [1024, 4096, 1, 30528, 1024, 1024, 1024, 30528] + - [16, 22332.0] + - - [4096, 2048, 1, 1024, 4096, 4096, 4096, 1024] + - [3, 22237.0] + - - [4096, 4096, 1, 1024, 4096, 4096, 4096, 1024] + - [3, 22566.0] + - - [256, 8976, 1, 1536, 256, 256, 256, 1536] + - [31, 19070.0] + - - [256, 8976, 1, 2048, 256, 256, 256, 2048] + - [31, 18868.0] + - - [256, 8976, 1, 2304, 256, 256, 256, 2304] + - [31, 19000.0] + - - [256, 8976, 1, 2560, 256, 256, 256, 2560] + - [21, 18888.0] + - - [256, 8976, 1, 2816, 256, 256, 256, 2816] + - [33, 19016.0] + - - [256, 8976, 1, 3072, 256, 256, 256, 3072] + - [21, 19072.0] + - - [256, 8976, 1, 4352, 256, 256, 256, 4352] + - [21, 19415.0] + - - [256, 8976, 1, 4864, 256, 256, 256, 4864] + - [16, 19390.0] + - - [256, 8976, 1, 5376, 256, 256, 256, 5376] + - [31, 19487.0] + - - [256, 8976, 1, 5632, 256, 256, 256, 5632] + - [31, 19410.0] + - - [256, 8976, 1, 5888, 256, 256, 256, 5888] + - [31, 19447.0] + - - [256, 8976, 1, 6144, 256, 256, 256, 6144] + - [31, 19526.0] + - - [256, 8976, 1, 6656, 256, 256, 256, 6656] + - [31, 19529.0] + - - [256, 8976, 1, 7168, 256, 256, 256, 7168] + - [31, 19542.0] + - - [256, 8976, 1, 7424, 256, 256, 256, 7424] + - [26, 19590.0] + - - [256, 8976, 1, 8192, 256, 256, 256, 8192] + - [21, 19479.0] + - - [256, 8976, 1, 8448, 256, 256, 256, 8448] + - [21, 19644.0] + - - [256, 8976, 1, 8960, 256, 256, 256, 8960] + - [3, 19557.0] + - - [256, 8976, 1, 9472, 256, 256, 256, 9472] + - [21, 19560.0] + - - [256, 8976, 1, 9728, 256, 256, 256, 9728] + - [21, 19578.0] + - - [256, 8976, 1, 9984, 256, 256, 256, 9984] + - [16, 19622.0] + - - [256, 8976, 1, 10240, 256, 256, 256, 10240] + - [21, 19383.0] + - - [256, 8976, 1, 10496, 256, 256, 256, 10496] + - [16, 19577.0] + - - [256, 8976, 1, 11008, 256, 256, 256, 11008] + - [21, 19602.0] + - - [256, 8976, 1, 11520, 256, 256, 256, 11520] + - [16, 19599.0] + - - [256, 8976, 1, 12288, 256, 256, 256, 12288] + - [31, 19285.0] + - - [256, 8976, 1, 14336, 256, 256, 256, 14336] + - [31, 19415.0] + - - [256, 8976, 1, 14848, 256, 256, 256, 14848] + - [21, 19690.0] + - - [256, 8976, 1, 15104, 256, 256, 256, 15104] + - [16, 19666.0] + - - [256, 8976, 1, 15872, 256, 256, 256, 15872] + - [31, 19622.0] + - - [256, 8976, 1, 17152, 256, 256, 256, 17152] + - [3, 19657.0] + - - [256, 8976, 1, 19712, 256, 256, 256, 19712] + - [21, 19716.0] + - - [256, 8976, 1, 19968, 256, 256, 256, 19968] + - [11, 19619.0] + - - [256, 8976, 1, 20480, 256, 256, 256, 20480] + - [21, 19160.0] + - - [256, 8976, 1, 20992, 256, 256, 256, 20992] + - [21, 19610.0] + - - [256, 8976, 1, 22016, 256, 256, 256, 22016] + - [11, 19676.0] + - - [256, 8976, 1, 26112, 256, 256, 256, 26112] + - [11, 19414.0] + - - [256, 8976, 1, 33536, 256, 256, 256, 33536] + - [3, 19679.0] + - - [256, 8976, 1, 44505, 256, 256, 256, 44505] + - [28, 19791.0] + - - [256, 32768, 1, 128, 256, 256, 256, 128] + - [14, 20259.0] + - - [480, 32768, 1, 1024, 480, 480, 480, 1024] + - [21, 21047.0] + - - [512, 32768, 1, 256, 512, 512, 512, 256] + - [11, 21762.0] + - - [1024, 1600, 1, 1, 1024, 1024, 1024, 1] + - [13, 350.0] + - - [1024, 1600, 1, 1024, 1024, 1024, 1024, 1024] + - [2, 17791.0] + - - [1024, 1792, 1, 256, 1024, 1024, 1024, 256] + - [11, 18180.0] + - - [1024, 2048, 1, 256, 1024, 1024, 1024, 256] + - [3, 18199.0] + - - [1024, 2560, 1, 256, 1024, 1024, 1024, 256] + - [11, 19914.0] + - - [1024, 3072, 1, 256, 1024, 1024, 1024, 256] + - [21, 19331.0] + - - [1024, 3328, 1, 256, 1024, 1024, 1024, 256] + - [16, 18867.0] + - - [1024, 3840, 1, 256, 1024, 1024, 1024, 256] + - [11, 19123.0] + - - [1024, 4096, 1, 256, 1024, 1024, 1024, 256] + - [11, 19167.0] + - - [1024, 4608, 1, 256, 1024, 1024, 1024, 256] + - [11, 19272.0] + - - [1024, 4864, 1, 256, 1024, 1024, 1024, 256] + - [11, 19220.0] + - - [1024, 5120, 1, 256, 1024, 1024, 1024, 256] + - [11, 19837.0] + - - [1024, 5632, 1, 256, 1024, 1024, 1024, 256] + - [11, 19860.0] + - - [1024, 6144, 1, 256, 1024, 1024, 1024, 256] + - [11, 19864.0] + - - [1024, 6400, 1, 256, 1024, 1024, 1024, 256] + - [11, 20336.0] + - - [1024, 7168, 1, 256, 1024, 1024, 1024, 256] + - [11, 20170.0] + - - [1024, 7424, 1, 256, 1024, 1024, 1024, 256] + - [26, 20273.0] + - - [1024, 7680, 1, 256, 1024, 1024, 1024, 256] + - [3, 21486.0] + - - [1024, 7936, 1, 256, 1024, 1024, 1024, 256] + - [3, 21430.0] + - - [1024, 8192, 1, 256, 1024, 1024, 1024, 256] + - [11, 21338.0] + - - [1024, 8448, 1, 256, 1024, 1024, 1024, 256] + - [3, 21278.0] + - - [1024, 8704, 1, 256, 1024, 1024, 1024, 256] + - [11, 21178.0] + - - [1024, 8960, 1, 256, 1024, 1024, 1024, 256] + - [11, 21592.0] + - - [1024, 9728, 1, 256, 1024, 1024, 1024, 256] + - [11, 21423.0] + - - [1024, 9984, 1, 256, 1024, 1024, 1024, 256] + - [3, 21372.0] + - - [1024, 10240, 1, 256, 1024, 1024, 1024, 256] + - [3, 21683.0] + - - [1024, 10496, 1, 256, 1024, 1024, 1024, 256] + - [11, 21716.0] + - - [1024, 11008, 1, 256, 1024, 1024, 1024, 256] + - [3, 21538.0] + - - [1024, 11264, 1, 256, 1024, 1024, 1024, 256] + - [11, 21497.0] + - - [1024, 11520, 1, 256, 1024, 1024, 1024, 256] + - [3, 21754.0] + - - [1024, 12288, 1, 256, 1024, 1024, 1024, 256] + - [3, 21671.0] + - - [1024, 13312, 1, 256, 1024, 1024, 1024, 256] + - [3, 21808.0] + - - [1024, 13568, 1, 256, 1024, 1024, 1024, 256] + - [11, 21754.0] + - - [1024, 14336, 1, 256, 1024, 1024, 1024, 256] + - [11, 21872.0] + - - [1024, 14592, 1, 256, 1024, 1024, 1024, 256] + - [11, 21848.0] + - - [1024, 14848, 1, 256, 1024, 1024, 1024, 256] + - [11, 21796.0] + - - [1024, 15104, 1, 256, 1024, 1024, 1024, 256] + - [3, 21793.0] + - - [1024, 16128, 1, 256, 1024, 1024, 1024, 256] + - [3, 21883.0] + - - [1024, 17152, 1, 256, 1024, 1024, 1024, 256] + - [11, 21948.0] + - - [1024, 18944, 1, 256, 1024, 1024, 1024, 256] + - [3, 21937.0] + - - [1024, 19712, 1, 256, 1024, 1024, 1024, 256] + - [3, 22040.0] + - - [1024, 19968, 1, 256, 1024, 1024, 1024, 256] + - [3, 22018.0] + - - [1024, 20480, 1, 256, 1024, 1024, 1024, 256] + - [3, 22028.0] + - - [1024, 20992, 1, 256, 1024, 1024, 1024, 256] + - [3, 22102.0] + - - [1024, 21504, 1, 256, 1024, 1024, 1024, 256] + - [3, 22020.0] + - - [1024, 22016, 1, 256, 1024, 1024, 1024, 256] + - [11, 22079.0] + - - [1024, 23552, 1, 256, 1024, 1024, 1024, 256] + - [3, 22142.0] + - - [1024, 28672, 1, 256, 1024, 1024, 1024, 256] + - [3, 22174.0] + - - [1024, 32768, 1, 512, 1024, 1024, 1024, 512] + - [11, 22525.0] + - - [1024, 32768, 1, 1024, 1024, 1024, 1024, 1024] + - [21, 22736.0] + - - [1024, 33536, 1, 256, 1024, 1024, 1024, 256] + - [16, 22214.0] + - - [1024, 40448, 1, 256, 1024, 1024, 1024, 256] + - [3, 22340.0] + - - [2048, 960, 1, 2048, 2048, 2048, 2048, 2048] + - [3, 18555.0] + - - [2048, 1024, 1, 1, 2048, 2048, 2048, 1] + - [4, 288.0] + - - [2048, 1024, 1, 256, 2048, 2048, 2048, 256] + - [16, 18174.0] + - - [3200, 1024, 1, 2048, 3200, 3200, 3200, 2048] + - [3, 21713.0] + - - [4096, 1024, 1, 1, 4096, 4096, 4096, 1] + - [19, 337.0] + - - [1024, 3840, 1, 1024, 1024, 1024, 1024, 1024] + - [3, 21289.0] + - - [1024, 3840, 1, 4096, 1024, 1024, 1024, 4096] + - [3, 22173.0] + - - [1024, 3968, 1, 1024, 1024, 1024, 1024, 1024] + - [21, 20529.0] + - - [1024, 3968, 1, 4096, 1024, 1024, 1024, 4096] + - [21, 21252.0] + - - [1024, 3968, 1, 42720, 1024, 1024, 1024, 42720] + - [11, 21696.0] + - - [1024, 6528, 1, 1024, 1024, 1024, 1024, 1024] + - [11, 21780.0] + - - [1024, 6528, 1, 4096, 1024, 1024, 1024, 4096] + - [3, 21974.0] + - - [1024, 6528, 1, 42720, 1024, 1024, 1024, 42720] + - [26, 22263.0] + - - [1024, 7104, 1, 1024, 1024, 1024, 1024, 1024] + - [21, 21567.0] + - - [1024, 7104, 1, 4096, 1024, 1024, 1024, 4096] + - [3, 21936.0] + - - [1024, 7104, 1, 42720, 1024, 1024, 1024, 42720] + - [16, 22006.0] + - - [1024, 7200, 1, 1024, 1024, 1024, 1024, 1024] + - [11, 21604.0] + - - [1024, 7200, 1, 4096, 1024, 1024, 1024, 4096] + - [3, 22091.0] + - - [1024, 7200, 1, 42720, 1024, 1024, 1024, 42720] + - [11, 22271.0] + - - [1024, 8064, 1, 1024, 1024, 1024, 1024, 1024] + - [11, 21830.0] + - - [1024, 8064, 1, 4096, 1024, 1024, 1024, 4096] + - [11, 22229.0] + - - [1024, 8160, 1, 1024, 1024, 1024, 1024, 1024] + - [21, 21991.0] + - - [1024, 8160, 1, 4096, 1024, 1024, 1024, 4096] + - [21, 22448.0] + - - [1024, 9216, 1, 1024, 1024, 1024, 1024, 1024] + - [21, 22442.0] + - - [1024, 9216, 1, 4096, 1024, 1024, 1024, 4096] + - [21, 22602.0] + - - [1024, 9520, 1, 1024, 1024, 1024, 1024, 1024] + - [21, 22234.0] + - - [1024, 9520, 1, 4096, 1024, 1024, 1024, 4096] + - [21, 22463.0] + - - [1024, 9520, 1, 42720, 1024, 1024, 1024, 42720] + - [14, 22477.0] + - - [1024, 10064, 1, 1024, 1024, 1024, 1024, 1024] + - [16, 22108.0] + - - [1024, 10064, 1, 4096, 1024, 1024, 1024, 4096] + - [21, 22344.0] + - - [1024, 10080, 1, 1024, 1024, 1024, 1024, 1024] + - [16, 22121.0] + - - [1024, 10080, 1, 4096, 1024, 1024, 1024, 4096] + - [21, 22437.0] + - - [1024, 10080, 1, 42720, 1024, 1024, 1024, 42720] + - [3, 22525.0] + - - [1024, 10200, 1, 1024, 1024, 1024, 1024, 1024] + - [21, 22339.0] + - - [1024, 10200, 1, 4096, 1024, 1024, 1024, 4096] + - [11, 22475.0] + - - [4096, 3840, 1, 1024, 4096, 4096, 4096, 1024] + - [3, 22715.0] + - - [4096, 3968, 1, 1024, 4096, 4096, 4096, 1024] + - [3, 22722.0] + - - [4096, 6528, 1, 1024, 4096, 4096, 4096, 1024] + - [3, 22864.0] + - - [4096, 7104, 1, 1024, 4096, 4096, 4096, 1024] + - [11, 22623.0] + - - [4096, 7200, 1, 1024, 4096, 4096, 4096, 1024] + - [21, 22486.0] + - - [4096, 8064, 1, 1024, 4096, 4096, 4096, 1024] + - [3, 22865.0] + - - [4096, 8160, 1, 1024, 4096, 4096, 4096, 1024] + - [3, 22759.0] + - - [4096, 9216, 1, 1024, 4096, 4096, 4096, 1024] + - [3, 22850.0] + - - [4096, 9520, 1, 1024, 4096, 4096, 4096, 1024] + - [3, 22699.0] + - - [4096, 10064, 1, 1024, 4096, 4096, 4096, 1024] + - [3, 22741.0] + - - [4096, 10080, 1, 1024, 4096, 4096, 4096, 1024] + - [3, 22778.0] + - - [4096, 10200, 1, 1024, 4096, 4096, 4096, 1024] + - [3, 22776.0] + - - [1024, 3240, 1, 1024, 1024, 1024, 1024, 1024] + - [21, 19835.0] + - - [1024, 3240, 1, 4096, 1024, 1024, 1024, 4096] + - [11, 20307.0] + - - [1024, 3960, 1, 1024, 1024, 1024, 1024, 1024] + - [21, 20287.0] + - - [1024, 3960, 1, 4096, 1024, 1024, 1024, 4096] + - [11, 21176.0] + - - [1024, 3960, 1, 42720, 1024, 1024, 1024, 42720] + - [11, 21658.0] + - - [4096, 3240, 1, 1024, 4096, 4096, 4096, 1024] + - [3, 21938.0] + - - [4096, 3960, 1, 1024, 4096, 4096, 4096, 1024] + - [16, 22511.0] + - - [289, 128, 64, 768, 289, 289, 289, 768] + - [11, 15398.0] + - - [289, 160, 64, 768, 289, 289, 289, 768] + - [10, 11268.0] + - - [289, 192, 64, 768, 289, 289, 289, 768] + - [15, 13274.0] + - - [3136, 256, 32, 64, 3136, 3136, 3136, 64] + - [17, 14757.0] + - - [784, 512, 32, 128, 784, 784, 784, 128] + - [3, 17756.0] + - - [784, 128, 32, 512, 784, 784, 784, 512] + - [3, 17502.0] + - - [196, 1024, 32, 256, 196, 196, 196, 256] + - [21, 15358.0] + - - [3136, 128, 64, 64, 3136, 3136, 3136, 64] + - [4, 17495.0] + - - [3136, 256, 64, 128, 3136, 3136, 3136, 128] + - [3, 20794.0] + - - [784, 512, 64, 256, 784, 784, 784, 256] + - [3, 19097.0] + - - [3136, 128, 64, 256, 3136, 3136, 3136, 256] + - [16, 20766.0] + - - [3136, 256, 64, 256, 3136, 3136, 3136, 256] + - [3, 21714.0] + - - [196, 1024, 64, 512, 196, 196, 196, 512] + - [11, 16545.0] + - - [784, 256, 64, 512, 784, 784, 784, 512] + - [3, 19359.0] + - - [784, 512, 64, 512, 784, 784, 784, 512] + - [3, 19638.0] + - - [196, 512, 64, 1024, 196, 196, 196, 1024] + - [21, 16686.0] + - - [196, 1024, 64, 1024, 196, 196, 196, 1024] + - [21, 16778.0] + - - [3136, 128, 32, 64, 3136, 3136, 3136, 64] + - [14, 17213.0] + - - [3136, 256, 32, 128, 3136, 3136, 3136, 128] + - [3, 20715.0] + - - [784, 512, 32, 256, 784, 784, 784, 256] + - [11, 18851.0] + - - [3136, 128, 32, 256, 3136, 3136, 3136, 256] + - [31, 21063.0] + - - [3136, 256, 32, 256, 3136, 3136, 3136, 256] + - [11, 21477.0] + - - [196, 1024, 32, 512, 196, 196, 196, 512] + - [3, 16365.0] + - - [784, 256, 32, 512, 784, 784, 784, 512] + - [26, 18699.0] + - - [784, 512, 32, 512, 784, 784, 784, 512] + - [3, 19383.0] + - - [196, 512, 32, 1024, 196, 196, 196, 1024] + - [21, 16321.0] + - - [196, 1024, 32, 1024, 196, 196, 196, 1024] + - [21, 16553.0] + - - [7680, 8192, 1, 8192, 7680, 7680, 7680, 8192] + - [21, 22858.0] + - - [3840, 4096, 1, 4096, 3840, 3840, 3840, 4096] + - [11, 21235.0] + - - [1920, 2048, 1, 2048, 1920, 1920, 1920, 2048] + - [16, 21478.0] + - - [8192, 7680, 1, 8192, 8192, 8192, 8192, 8192] + - [11, 22705.0] + - - [4096, 3840, 1, 4096, 4096, 4096, 4096, 4096] + - [3, 22915.0] + - - [2048, 1920, 1, 2048, 2048, 2048, 2048, 2048] + - [3, 21493.0] + - - [8192, 8192, 1, 8192, 8192, 8192, 8192, 8192] + - [5, 22791.0] + - - [4096, 4096, 1, 4096, 4096, 4096, 4096, 4096] + - [11, 20941.0] + - - [2048, 2048, 1, 2048, 2048, 2048, 2048, 2048] + - [11, 21245.0] + - - [1024, 4096, 1, 2, 1024, 1024, 1024, 2] + - [25, 573.0] + - - [4096, 512, 1, 1024, 4096, 4096, 4096, 1024] + - [3, 19225.0] + - - [1024, 1280, 1, 2, 1024, 1024, 1024, 2] + - [7, 440.0] + - - [1024, 1280, 1, 1024, 1024, 1024, 1024, 1024] + - [12, 19345.0] + - - [1024, 1280, 1, 4096, 1024, 1024, 1024, 4096] + - [12, 21207.0] + - - [4096, 1024, 1, 1024, 4096, 4096, 4096, 1024] + - [3, 21191.0] + - - [4096, 1280, 1, 1024, 4096, 4096, 4096, 1024] + - [16, 21799.0] + - - [1024, 4992, 1, 2, 1024, 1024, 1024, 2] + - [19, 755.0] + - - [1024, 4992, 1, 1024, 1024, 1024, 1024, 1024] + - [3, 21127.0] + - - [1024, 4992, 1, 4096, 1024, 1024, 1024, 4096] + - [21, 21860.0] + - - [4096, 4992, 1, 1024, 4096, 4096, 4096, 1024] + - [3, 22671.0] + - - [1024, 5120, 1, 2, 1024, 1024, 1024, 2] + - [13, 626.0] + - - [1024, 5120, 1, 1024, 1024, 1024, 1024, 1024] + - [16, 22084.0] + - - [1024, 5120, 1, 4096, 1024, 1024, 1024, 4096] + - [21, 22248.0] + - - [4096, 5120, 1, 1024, 4096, 4096, 4096, 1024] + - [3, 22675.0] + - - [1024, 5248, 1, 2, 1024, 1024, 1024, 2] + - [8, 622.0] + - - [1024, 5248, 1, 1024, 1024, 1024, 1024, 1024] + - [11, 21562.0] + - - [1024, 5248, 1, 4096, 1024, 1024, 1024, 4096] + - [3, 21737.0] + - - [4096, 5248, 1, 1024, 4096, 4096, 4096, 1024] + - [3, 22753.0] + - - [1024, 2560, 1, 2, 1024, 1024, 1024, 2] + - [13, 594.0] + - - [1024, 2560, 1, 1024, 1024, 1024, 1024, 1024] + - [11, 21417.0] + - - [1024, 2560, 1, 4096, 1024, 1024, 1024, 4096] + - [12, 21727.0] + - - [4096, 2560, 1, 1024, 4096, 4096, 4096, 1024] + - [11, 22530.0] + - - [1024, 3072, 1, 2, 1024, 1024, 1024, 2] + - [15, 645.0] + - - [1024, 3072, 1, 1024, 1024, 1024, 1024, 1024] + - [11, 20890.0] + - - [1024, 3072, 1, 4096, 1024, 1024, 1024, 4096] + - [21, 21148.0] + - - [4096, 3072, 1, 1024, 4096, 4096, 4096, 1024] + - [3, 22423.0] + - - [1024, 1152, 1, 2, 1024, 1024, 1024, 2] + - [24, 628.0] + - - [1024, 1152, 1, 1024, 1024, 1024, 1024, 1024] + - [11, 18550.0] + - - [1024, 1152, 1, 4096, 1024, 1024, 1024, 4096] + - [11, 18786.0] + - - [4096, 1152, 1, 1024, 4096, 4096, 4096, 1024] + - [3, 20957.0] + - - [479, 32768, 1, 1024, 479, 479, 479, 1024] + - [21, 20800.0] + - - [1024, 8192, 1, 1024, 1024, 1024, 1024, 1024] + - [21, 22138.0] + - - [1024, 8192, 1, 4096, 1024, 1024, 1024, 4096] + - [21, 22483.0] + - - [1024, 8192, 1, 33712, 1024, 1024, 1024, 33712] + - [26, 22398.0] + - - [1024, 9600, 1, 1024, 1024, 1024, 1024, 1024] + - [16, 22378.0] + - - [1024, 9600, 1, 4096, 1024, 1024, 1024, 4096] + - [21, 22682.0] + - - [1024, 9600, 1, 33712, 1024, 1024, 1024, 33712] + - [24, 22672.0] + - - [4096, 8192, 1, 1024, 4096, 4096, 4096, 1024] + - [3, 22793.0] + - - [4096, 9600, 1, 1024, 4096, 4096, 4096, 1024] + - [3, 22778.0] + - - [1024, 1024, 64, 64, 1024, 1024, 1024, 64] + - [2, 18201.0] + - - [1024, 16384, 1, 3072, 1024, 1024, 1024, 3072] + - [11, 22686.0] + - - [1024, 2048, 1, 30592, 1024, 1024, 1024, 30592] + - [3, 20385.0] + - - [640, 2048, 1, 2560, 640, 640, 640, 2560] + - [5, 21216.0] + - - [1024, 1024, 64, 96, 1024, 1024, 1024, 96] + - [3, 20718.0] + - - [1536, 4096, 1, 4608, 1536, 1536, 1536, 4608] + - [3, 21955.0] + - - [512, 512, 256, 64, 512, 512, 512, 64] + - [10, 16161.0] + - - [2048, 1024, 1, 8192, 2048, 2048, 2048, 8192] + - [11, 20266.0] + - - [4096, 16384, 1, 1024, 4096, 4096, 4096, 1024] + - [3, 22830.0] + - - [1024, 8192, 1, 50304, 1024, 1024, 1024, 50304] + - [12, 22386.0] + - - [1536, 8192, 1, 50304, 1536, 1536, 1536, 50304] + - [26, 22231.0] + - - [6144, 8192, 1, 1536, 6144, 6144, 6144, 1536] + - [3, 23026.0] + - - [1024, 4096, 1, 30592, 1024, 1024, 1024, 30592] + - [16, 22300.0] + - - [1536, 4096, 1, 6144, 1536, 1536, 1536, 6144] + - [3, 21870.0] + - - [1024, 16384, 1, 4096, 1024, 1024, 1024, 4096] + - [21, 22278.0] + - - [1024, 16384, 1, 50304, 1024, 1024, 1024, 50304] + - [12, 22467.0] + - - [1024, 4096, 1, 3072, 1024, 1024, 1024, 3072] + - [3, 21901.0] + - - [1536, 8192, 1, 1536, 1536, 1536, 1536, 1536] + - [11, 22576.0] + - - [1024, 2048, 1, 3072, 1024, 1024, 1024, 3072] + - [11, 20134.0] + - - [2560, 2048, 1, 7680, 2560, 2560, 2560, 7680] + - [3, 22533.0] + - - [2048, 1024, 1, 2048, 2048, 2048, 2048, 2048] + - [11, 19276.0] + - - [2048, 1024, 1, 30592, 2048, 2048, 2048, 30592] + - [11, 20344.0] + - - [8192, 1024, 1, 2048, 8192, 8192, 8192, 2048] + - [3, 22449.0] + - - [2560, 2048, 1, 2560, 2560, 2560, 2560, 2560] + - [5, 22571.0] + - - [1536, 8192, 1, 4608, 1536, 1536, 1536, 4608] + - [16, 22641.0] + - - [1024, 2048, 1, 50304, 1024, 1024, 1024, 50304] + - [11, 20362.0] + - - [1024, 1024, 32, 64, 1024, 1024, 1024, 64] + - [16, 19396.0] + - - [1536, 8192, 1, 6144, 1536, 1536, 1536, 6144] + - [11, 22643.0] + - - [1024, 1024, 256, 64, 1024, 1024, 1024, 64] + - [15, 17231.0] + - - [512, 512, 40, 64, 512, 512, 512, 64] + - [17, 17605.0] + - - [1536, 4096, 1, 50304, 1536, 1536, 1536, 50304] + - [28, 21884.0] + - - [1024, 1024, 128, 96, 1024, 1024, 1024, 96] + - [11, 21094.0] + - - [1024, 8192, 1, 3072, 1024, 1024, 1024, 3072] + - [11, 22445.0] + - - [1024, 1024, 128, 64, 1024, 1024, 1024, 64] + - [15, 17261.0] + - - [1024, 4096, 1, 50304, 1024, 1024, 1024, 50304] + - [26, 22258.0] + - - [6144, 4096, 1, 1536, 6144, 6144, 6144, 1536] + - [16, 22836.0] + - - [1024, 16384, 1, 1024, 1024, 1024, 1024, 1024] + - [11, 22526.0] + - - [2560, 2048, 1, 1920, 2560, 2560, 2560, 1920] + - [14, 22510.0] + - - [2048, 1024, 1, 6144, 2048, 2048, 2048, 6144] + - [3, 20025.0] + - - [512, 512, 128, 64, 512, 512, 512, 64] + - [26, 19590.0] + - - [1024, 8192, 1, 30592, 1024, 1024, 1024, 30592] + - [18, 22354.0] + - - [1536, 4096, 1, 1536, 1536, 1536, 1536, 1536] + - [11, 21610.0] + - - [128, 128, 1024, 64, 128, 128, 128, 64] + - [2, 15192.0] + - - [1024, 8192, 1, 30528, 1024, 1024, 1024, 30528] + - [28, 22372.0] + - - [1024, 3456, 1, 1024, 1024, 1024, 1024, 1024] + - [16, 21264.0] + - - [1024, 3456, 1, 512, 1024, 1024, 1024, 512] + - [3, 21069.0] + - - [1024, 4096, 1, 512, 1024, 1024, 1024, 512] + - [3, 20437.0] + - - [1024, 6912, 1, 1024, 1024, 1024, 1024, 1024] + - [3, 22034.0] + - - [1024, 6912, 1, 512, 1024, 1024, 1024, 512] + - [3, 21705.0] + - - [256, 55296, 1, 128, 256, 256, 256, 128] + - [3, 20675.0] + - - [256, 6912, 1, 128, 256, 256, 256, 128] + - [14, 15861.0] + - - [480, 3456, 1, 1024, 480, 480, 480, 1024] + - [21, 17629.0] + - - [480, 4096, 1, 1024, 480, 480, 480, 1024] + - [21, 17595.0] + - - [480, 6912, 1, 1024, 480, 480, 480, 1024] + - [21, 19431.0] + - - [512, 3456, 1, 256, 512, 512, 512, 256] + - [31, 15620.0] + - - [512, 4096, 1, 256, 512, 512, 512, 256] + - [11, 16408.0] + - - [512, 55296, 1, 256, 512, 512, 512, 256] + - [3, 22078.0] + - - [512, 6912, 1, 256, 512, 512, 512, 256] + - [3, 20026.0] + - - [1024, 1280, 1, 30528, 1024, 1024, 1024, 30528] + - [33, 21763.0] + - - [1024, 1600, 1, 30528, 1024, 1024, 1024, 30528] + - [5, 18431.0] + - - [1024, 10240, 1, 1024, 1024, 1024, 1024, 1024] + - [16, 22434.0] + - - [1024, 10240, 1, 4096, 1024, 1024, 1024, 4096] + - [11, 22685.0] + - - [4096, 10240, 1, 1024, 4096, 4096, 4096, 1024] + - [3, 22824.0] + - - [128, 128, 1280, 64, 128, 128, 128, 64] + - [2, 16578.0] + - - [1024, 1640, 1, 30528, 1024, 1024, 1024, 30528] + - [11, 18950.0] + - - [1024, 10496, 1, 1024, 1024, 1024, 1024, 1024] + - [11, 22460.0] + - - [1024, 10496, 1, 4096, 1024, 1024, 1024, 4096] + - [11, 22634.0] + - - [4096, 10496, 1, 1024, 4096, 4096, 4096, 1024] + - [11, 22832.0] + - - [128, 128, 1312, 64, 128, 128, 128, 64] + - [2, 16583.0] + - - [1024, 6144, 1, 4096, 1024, 1024, 1024, 4096] + - [3, 21745.0] + - - [4096, 6144, 1, 1024, 4096, 4096, 4096, 1024] + - [3, 22753.0] + - - [1024, 6144, 1, 1024, 1024, 1024, 1024, 1024] + - [11, 21596.0] + - - [512, 512, 192, 64, 512, 512, 512, 64] + - [10, 17824.0] + - - [256, 6912, 1, 1, 256, 256, 256, 1] + - [2, 324.0] + - - [1024, 10224, 1, 1024, 1024, 1024, 1024, 1024] + - [16, 22362.0] + - - [1024, 10192, 1, 1024, 1024, 1024, 1024, 1024] + - [21, 22348.0] + - - [1024, 10208, 1, 1024, 1024, 1024, 1024, 1024] + - [21, 22374.0] + - - [1024, 10224, 1, 4096, 1024, 1024, 1024, 4096] + - [21, 22640.0] + - - [1024, 10224, 1, 3072, 1024, 1024, 1024, 3072] + - [11, 22642.0] + - - [4096, 10224, 1, 1024, 4096, 4096, 4096, 1024] + - [3, 22751.0] + - - [1024, 10240, 1, 3072, 1024, 1024, 1024, 3072] + - [21, 22656.0] + - - [1024, 10192, 1, 3072, 1024, 1024, 1024, 3072] + - [16, 22571.0] + - - [4096, 10192, 1, 1024, 4096, 4096, 4096, 1024] + - [3, 22688.0] + - - [1024, 10192, 1, 4096, 1024, 1024, 1024, 4096] + - [11, 22400.0] + - - [1024, 10200, 1, 3072, 1024, 1024, 1024, 3072] + - [11, 22574.0] + - - [1024, 10184, 1, 1024, 1024, 1024, 1024, 1024] + - [21, 22297.0] + - - [4096, 10208, 1, 1024, 4096, 4096, 4096, 1024] + - [31, 22693.0] + - - [1024, 10208, 1, 3072, 1024, 1024, 1024, 3072] + - [3, 22682.0] + - - [1024, 10208, 1, 4096, 1024, 1024, 1024, 4096] + - [21, 22481.0] + - - [1024, 10224, 1, 2048, 1024, 1024, 1024, 2048] + - [11, 22634.0] + - - [1024, 10240, 1, 2048, 1024, 1024, 1024, 2048] + - [11, 22684.0] + - - [1024, 10120, 1, 1024, 1024, 1024, 1024, 1024] + - [16, 22247.0] + - - [1024, 10192, 1, 2048, 1024, 1024, 1024, 2048] + - [11, 22536.0] + - - [1024, 10152, 1, 1024, 1024, 1024, 1024, 1024] + - [16, 22277.0] + - - [1024, 10080, 1, 3072, 1024, 1024, 1024, 3072] + - [11, 22515.0] + - - [100352, 512, 1, 256, 100352, 100352, 100352, 256] + - [3, 22349.0] + - - [12544, 2048, 1, 1024, 12544, 12544, 12544, 1024] + - [16, 22814.0] + - - [200704, 512, 1, 256, 200704, 200704, 200704, 256] + - [26, 22467.0] + - - [25088, 1024, 1, 512, 25088, 25088, 25088, 512] + - [3, 22601.0] + - - [50176, 1024, 1, 512, 50176, 50176, 50176, 512] + - [3, 22803.0] + - - [6272, 2048, 1, 1024, 6272, 6272, 6272, 1024] + - [3, 22448.0] + - - [3136, 128, 128, 256, 3136, 3136, 3136, 256] + - [16, 21055.0] + - - [3136, 128, 256, 256, 3136, 3136, 3136, 256] + - [3, 21324.0] + - - [784, 256, 128, 512, 784, 784, 784, 512] + - [3, 19532.0] + - - [784, 256, 256, 512, 784, 784, 784, 512] + - [3, 19702.0] + - - [128, 128, 2048, 64, 128, 128, 128, 64] + - [0, 15172.0] + - - [1024, 2560, 1, 30528, 1024, 1024, 1024, 30528] + - [18, 22450.0] + - - [128, 128, 1536, 64, 128, 128, 128, 64] + - [11, 18332.0] + - - [1024, 12288, 1, 4096, 1024, 1024, 1024, 4096] + - [21, 22680.0] + - - [1024, 12288, 1, 1024, 1024, 1024, 1024, 1024] + - [16, 22448.0] + - - [4096, 12288, 1, 1024, 4096, 4096, 4096, 1024] + - [3, 22915.0] + - - [1024, 1920, 1, 30528, 1024, 1024, 1024, 30528] + - [21, 21955.0] + - - [128, 128, 192, 64, 128, 128, 128, 64] + - [23, 13808.0] + - - [768, 2048, 1, 2, 768, 768, 768, 2] + - [0, 612.0] + - - [3072, 2048, 1, 768, 3072, 3072, 3072, 768] + - [16, 21542.0] + - - [768, 2048, 1, 3072, 768, 768, 768, 3072] + - [11, 20370.0] + - - [768, 2048, 1, 768, 768, 768, 768, 768] + - [16, 19452.0] + - - [384, 384, 144, 64, 384, 384, 384, 64] + - [16, 17843.0] + - - [768, 4608, 1, 2, 768, 768, 768, 2] + - [2, 681.0] + - - [3072, 4608, 1, 768, 3072, 3072, 3072, 768] + - [16, 22395.0] + - - [768, 4608, 1, 3072, 768, 768, 768, 3072] + - [3, 21797.0] + - - [768, 4608, 1, 768, 768, 768, 768, 768] + - [16, 20852.0] + - - [512, 512, 48, 64, 512, 512, 512, 64] + - [31, 17976.0] + - - [128, 128, 256, 64, 128, 128, 128, 64] + - [2, 11621.0] + - - [384, 384, 192, 64, 384, 384, 384, 64] + - [26, 18653.0] + - - [1024, 4608, 1, 2, 1024, 1024, 1024, 2] + - [19, 664.0] + - - [4096, 4608, 1, 1024, 4096, 4096, 4096, 1024] + - [11, 22737.0] + - - [1024, 4608, 1, 4096, 1024, 1024, 1024, 4096] + - [11, 21755.0] + - - [1024, 4608, 1, 1024, 1024, 1024, 1024, 1024] + - [3, 20890.0] + - - [8192, 1024, 1, 1024, 8192, 8192, 8192, 1024] + - [3, 22271.0] + - - [8192, 4096, 1, 1024, 8192, 8192, 8192, 1024] + - [3, 22864.0] + - - [196, 1024, 128, 256, 196, 196, 196, 256] + - [16, 15805.0] + - - [196, 1024, 256, 256, 196, 196, 196, 256] + - [16, 16062.0] + - - [196, 256, 128, 1024, 196, 196, 196, 1024] + - [21, 16086.0] + - - [196, 256, 256, 1024, 196, 196, 196, 1024] + - [31, 16456.0] + - - [196, 512, 128, 1024, 196, 196, 196, 1024] + - [31, 16691.0] + - - [196, 512, 256, 1024, 196, 196, 196, 1024] + - [21, 17020.0] + - - [3072, 256, 2, 1024, 3072, 3072, 3072, 1024] + - [3, 19680.0] + - - [768, 2048, 2, 512, 768, 768, 768, 512] + - [16, 20549.0] + - - [2904, 256, 2, 1024, 2904, 2904, 2904, 1024] + - [3, 18608.0] + - - [864, 2048, 2, 512, 864, 864, 864, 512] + - [3, 18538.0] + - - [2992, 256, 2, 1024, 2992, 2992, 2992, 1024] + - [11, 18551.0] + - - [3400, 256, 2, 1024, 3400, 3400, 3400, 1024] + - [3, 17959.0] + - - [4032, 256, 2, 1024, 4032, 4032, 4032, 1024] + - [3, 18569.0] + - - [15200, 128, 2, 512, 15200, 15200, 15200, 512] + - [3, 20548.0] + - - [12288, 128, 2, 512, 12288, 12288, 12288, 512] + - [3, 20638.0] + - - [888, 2048, 2, 512, 888, 888, 888, 512] + - [3, 19073.0] + - - [13600, 128, 2, 512, 13600, 13600, 13600, 512] + - [3, 19267.0] + - - [12880, 128, 2, 512, 12880, 12880, 12880, 512] + - [16, 18617.0] + - - [3456, 256, 2, 1024, 3456, 3456, 3456, 1024] + - [26, 18622.0] + - - [2944, 256, 2, 1024, 2944, 2944, 2944, 1024] + - [16, 18574.0] + - - [2688, 256, 2, 1024, 2688, 2688, 2688, 1024] + - [16, 17099.0] + - - [13824, 128, 2, 512, 13824, 13824, 13824, 512] + - [3, 20409.0] + - - [3168, 256, 2, 1024, 3168, 3168, 3168, 1024] + - [11, 19531.0] + - - [3360, 256, 2, 1024, 3360, 3360, 3360, 1024] + - [3, 17715.0] + - - [3552, 256, 2, 1024, 3552, 3552, 3552, 1024] + - [11, 18563.0] + - - [11616, 128, 2, 512, 11616, 11616, 11616, 512] + - [26, 18269.0] + - - [4200, 256, 2, 1024, 4200, 4200, 4200, 1024] + - [26, 18934.0] + - - [840, 2048, 2, 512, 840, 840, 840, 512] + - [3, 18120.0] + - - [14208, 128, 2, 512, 14208, 14208, 14208, 512] + - [26, 19431.0] + - - [11968, 128, 2, 512, 11968, 11968, 11968, 512] + - [3, 18503.0] + - - [3264, 256, 2, 1024, 3264, 3264, 3264, 1024] + - [3, 17045.0] + - - [13600, 256, 2, 512, 13600, 13600, 13600, 512] + - [16, 20508.0] + - - [12880, 256, 2, 512, 12880, 12880, 12880, 512] + - [31, 20773.0] + - - [12288, 256, 2, 512, 12288, 12288, 12288, 512] + - [31, 21307.0] + - - [2816, 256, 2, 1024, 2816, 2816, 2816, 1024] + - [31, 18566.0] + - - [672, 2048, 2, 512, 672, 672, 672, 512] + - [3, 17612.0] + - - [13440, 128, 2, 512, 13440, 13440, 13440, 512] + - [16, 19964.0] + - - [13824, 256, 2, 512, 13824, 13824, 13824, 512] + - [11, 21453.0] + - - [15200, 256, 2, 512, 15200, 15200, 15200, 512] + - [26, 21038.0] + - - [3600, 256, 2, 1024, 3600, 3600, 3600, 1024] + - [3, 18647.0] + - - [4032, 1024, 2, 256, 4032, 4032, 4032, 256] + - [11, 20696.0] + - - [16128, 128, 2, 512, 16128, 16128, 16128, 512] + - [3, 21089.0] + - - [15200, 128, 1, 512, 15200, 15200, 15200, 512] + - [11, 19395.0] + - - [13600, 128, 1, 512, 13600, 13600, 13600, 512] + - [11, 17300.0] + - - [2904, 1024, 2, 256, 2904, 2904, 2904, 256] + - [3, 19156.0] + - - [2992, 1024, 2, 256, 2992, 2992, 2992, 256] + - [11, 18886.0] + - - [1536, 2048, 1, 1024, 1536, 1536, 1536, 1024] + - [3, 20447.0] + - - [24576, 128, 1, 256, 24576, 24576, 24576, 256] + - [3, 18105.0] + - - [24576, 512, 1, 256, 24576, 24576, 24576, 256] + - [3, 21581.0] + - - [25760, 128, 1, 256, 25760, 25760, 25760, 256] + - [11, 17771.0] + - - [25760, 512, 1, 256, 25760, 25760, 25760, 256] + - [16, 21124.0] + - - [6144, 256, 1, 512, 6144, 6144, 6144, 512] + - [3, 19331.0] + - - [6440, 256, 1, 512, 6440, 6440, 6440, 512] + - [3, 16158.0] + - - [13600, 512, 1, 128, 13600, 13600, 13600, 128] + - [3, 18131.0] + - - [9408, 512, 2, 128, 9408, 9408, 9408, 128] + - [12, 18460.0] + - - [56000, 256, 2, 64, 56000, 56000, 56000, 64] + - [27, 16688.0] + - - [2816, 1024, 2, 256, 2816, 2816, 2816, 256] + - [21, 19962.0] + - - [60800, 256, 1, 64, 60800, 60800, 60800, 64] + - [24, 19105.0] + - - [2944, 1024, 2, 256, 2944, 2944, 2944, 256] + - [3, 19994.0] + - - [11776, 512, 2, 128, 11776, 11776, 11776, 128] + - [3, 20203.0] + - - [11616, 512, 2, 128, 11616, 11616, 11616, 128] + - [16, 19480.0] + - - [4200, 1024, 2, 256, 4200, 4200, 4200, 256] + - [3, 20048.0] + - - [54400, 256, 1, 64, 54400, 54400, 54400, 64] + - [16, 18438.0] + - - [15200, 512, 1, 128, 15200, 15200, 15200, 128] + - [3, 18461.0] + - - [2688, 1024, 2, 256, 2688, 2688, 2688, 256] + - [16, 20116.0] + - - [12672, 512, 2, 128, 12672, 12672, 12672, 128] + - [16, 20430.0] + - - [11968, 512, 2, 128, 11968, 11968, 11968, 128] + - [3, 20257.0] + - - [46464, 256, 2, 64, 46464, 46464, 46464, 64] + - [14, 19892.0] + - - [2400, 256, 2, 1024, 2400, 2400, 2400, 1024] + - [21, 18741.0] + - - [2520, 256, 2, 1024, 2520, 2520, 2520, 1024] + - [11, 20134.0] + - - [2400, 1024, 2, 256, 2400, 2400, 2400, 256] + - [31, 18510.0] + - - [10752, 128, 2, 512, 10752, 10752, 10752, 512] + - [3, 19096.0] + - - [45632, 256, 2, 64, 45632, 45632, 45632, 64] + - [31, 17788.0] + - - [2520, 1024, 2, 256, 2520, 2520, 2520, 256] + - [3, 18896.0] + - - [53760, 256, 2, 64, 53760, 53760, 53760, 64] + - [27, 18697.0] + - - [2352, 256, 2, 1024, 2352, 2352, 2352, 1024] + - [21, 17717.0] + - - [47872, 256, 2, 64, 47872, 47872, 47872, 64] + - [31, 18841.0] + - - [47104, 256, 2, 64, 47104, 47104, 47104, 64] + - [32, 18574.0] + - - [50688, 256, 2, 64, 50688, 50688, 50688, 64] + - [3, 18844.0] + - - [45056, 256, 2, 64, 45056, 45056, 45056, 64] + - [26, 18987.0] + - - [13440, 512, 2, 128, 13440, 13440, 13440, 128] + - [16, 20470.0] + - - [2352, 1024, 2, 256, 2352, 2352, 2352, 256] + - [3, 18193.0] + - - [11264, 512, 2, 128, 11264, 11264, 11264, 128] + - [16, 19770.0] + - - [10560, 128, 2, 512, 10560, 10560, 10560, 512] + - [11, 17929.0] + - - [16128, 512, 2, 128, 16128, 16128, 16128, 128] + - [16, 20225.0] + - - [37632, 256, 2, 64, 37632, 37632, 37632, 64] + - [31, 19001.0] + - - [51520, 256, 2, 64, 51520, 51520, 51520, 64] + - [16, 18287.0] + - - [14000, 512, 2, 128, 14000, 14000, 14000, 128] + - [21, 19292.0] + - - [10560, 512, 2, 128, 10560, 10560, 10560, 128] + - [20, 18914.0] + - - [64512, 256, 2, 64, 64512, 64512, 64512, 64] + - [31, 19422.0] + - - [54400, 256, 2, 64, 54400, 54400, 54400, 64] + - [24, 19935.0] + - - [3264, 1024, 2, 256, 3264, 3264, 3264, 256] + - [16, 20583.0] + - - [10752, 512, 2, 128, 10752, 10752, 10752, 128] + - [26, 20792.0] + - - [3168, 1024, 2, 256, 3168, 3168, 3168, 256] + - [16, 20798.0] + - - [55296, 256, 2, 256, 55296, 55296, 55296, 256] + - [11, 22003.0] + - - [51520, 256, 2, 256, 51520, 51520, 51520, 256] + - [11, 21854.0] + - - [11408, 128, 2, 512, 11408, 11408, 11408, 512] + - [26, 19737.0] + - - [60800, 256, 2, 256, 60800, 60800, 60800, 256] + - [3, 22106.0] + - - [54400, 256, 2, 256, 54400, 54400, 54400, 256] + - [16, 22074.0] + - - [60800, 256, 2, 64, 60800, 60800, 60800, 64] + - [31, 19525.0] + - - [3800, 1024, 1, 256, 3800, 3800, 3800, 256] + - [3, 20011.0] + - - [3400, 1024, 1, 256, 3400, 3400, 3400, 256] + - [3, 19554.0] + - - [3072, 1024, 2, 256, 3072, 3072, 3072, 256] + - [3, 20707.0] + - - [3600, 1024, 2, 256, 3600, 3600, 3600, 256] + - [21, 19964.0] + - - [12288, 512, 2, 128, 12288, 12288, 12288, 128] + - [3, 20481.0] + - - [49152, 256, 2, 256, 49152, 49152, 49152, 256] + - [3, 22047.0] + - - [12880, 512, 2, 128, 12880, 12880, 12880, 128] + - [11, 20330.0] + - - [11408, 512, 2, 128, 11408, 11408, 11408, 128] + - [3, 20179.0] + - - [42240, 256, 2, 64, 42240, 42240, 42240, 64] + - [9, 19751.0] + - - [1008, 2048, 2, 512, 1008, 1008, 1008, 512] + - [3, 20757.0] + - - [3360, 1024, 2, 256, 3360, 3360, 3360, 256] + - [11, 20068.0] + - - [14208, 512, 2, 128, 14208, 14208, 14208, 128] + - [21, 20887.0] + - - [56832, 256, 2, 64, 56832, 56832, 56832, 64] + - [11, 19636.0] + - - [43008, 256, 2, 64, 43008, 43008, 43008, 64] + - [3, 19226.0] + - - [13600, 512, 2, 128, 13600, 13600, 13600, 128] + - [3, 20651.0] + - - [2640, 1024, 2, 256, 2640, 2640, 2640, 256] + - [3, 20189.0] + - - [13824, 512, 2, 128, 13824, 13824, 13824, 128] + - [26, 20933.0] + - - [3800, 256, 2, 1024, 3800, 3800, 3800, 1024] + - [26, 20413.0] + - - [55296, 256, 2, 64, 55296, 55296, 55296, 64] + - [11, 19429.0] + - - [2640, 256, 2, 1024, 2640, 2640, 2640, 1024] + - [11, 17088.0] + - - [15200, 512, 2, 128, 15200, 15200, 15200, 128] + - [30, 20620.0] + - - [3552, 1024, 2, 256, 3552, 3552, 3552, 256] + - [11, 20614.0] + - - [3456, 1024, 2, 256, 3456, 3456, 3456, 256] + - [3, 21322.0] + - - [49152, 256, 2, 64, 49152, 49152, 49152, 64] + - [11, 19294.0] + - - [3400, 1024, 2, 256, 3400, 3400, 3400, 256] + - [21, 20504.0] + - - [3800, 1024, 2, 256, 3800, 3800, 3800, 256] + - [3, 20954.0] + - - [6912, 256, 1, 512, 6912, 6912, 6912, 512] + - [11, 18765.0] + - - [6800, 256, 1, 512, 6800, 6800, 6800, 512] + - [21, 17016.0] + - - [27648, 128, 1, 256, 27648, 27648, 27648, 256] + - [3, 20414.0] + - - [27200, 128, 1, 256, 27200, 27200, 27200, 256] + - [11, 18302.0] + - - [30400, 128, 1, 256, 30400, 30400, 30400, 256] + - [3, 19010.0] + - - [7600, 256, 1, 512, 7600, 7600, 7600, 512] + - [3, 20116.0] + - - [6144, 1024, 1, 512, 6144, 6144, 6144, 512] + - [3, 21276.0] + - - [6912, 1024, 1, 512, 6912, 6912, 6912, 512] + - [11, 21773.0] + - - [6440, 1024, 1, 512, 6440, 6440, 6440, 512] + - [3, 20758.0] + - - [27648, 512, 1, 256, 27648, 27648, 27648, 256] + - [3, 21648.0] + - - [1728, 2048, 1, 1024, 1728, 1728, 1728, 1024] + - [11, 19850.0] + - - [27200, 512, 1, 256, 27200, 27200, 27200, 256] + - [26, 21269.0] + - - [6800, 1024, 1, 512, 6800, 6800, 6800, 512] + - [11, 21163.0] + - - [7600, 1024, 1, 512, 7600, 7600, 7600, 512] + - [3, 21608.0] + - - [30400, 512, 1, 256, 30400, 30400, 30400, 256] + - [16, 21439.0] + - - [12544, 1024, 1, 1024, 12544, 12544, 12544, 1024] + - [3, 22393.0] + - - [173280, 128, 1, 64, 173280, 173280, 173280, 64] + - [30, 18590.0] + - - [231040, 128, 1, 64, 231040, 231040, 231040, 64] + - [3, 19528.0] + - - [25992, 128, 1, 64, 25992, 25992, 25992, 64] + - [0, 14890.0] + - - [2852, 256, 2, 1024, 2852, 2852, 2852, 1024] + - [11, 18253.0] + - - [3220, 256, 2, 1024, 3220, 3220, 3220, 1024] + - [3, 17290.0] + - - [850, 2048, 2, 512, 850, 850, 850, 512] + - [3, 18823.0] + - - [805, 2048, 2, 512, 805, 805, 805, 512] + - [3, 17960.0] + - - [3036, 256, 2, 1024, 3036, 3036, 3036, 1024] + - [11, 18949.0] + - - [713, 2048, 2, 512, 713, 713, 713, 512] + - [3, 17955.0] + - - [850, 2048, 1, 512, 850, 850, 850, 512] + - [3, 16951.0] + - - [660, 2048, 2, 512, 660, 660, 660, 512] + - [26, 16470.0] + - - [726, 2048, 2, 512, 726, 726, 726, 512] + - [26, 18586.0] + - - [3500, 256, 2, 1024, 3500, 3500, 3500, 1024] + - [11, 18354.0] + - - [3700, 256, 2, 1024, 3700, 3700, 3700, 1024] + - [3, 18966.0] + - - [748, 2048, 2, 512, 748, 748, 748, 512] + - [3, 18868.0] + - - [3036, 1024, 2, 256, 3036, 3036, 3036, 256] + - [11, 19086.0] + - - [2852, 1024, 2, 256, 2852, 2852, 2852, 256] + - [3, 18980.0] + - - [950, 2048, 1, 512, 950, 950, 950, 512] + - [11, 16422.0] + - - [3700, 1024, 2, 256, 3700, 3700, 3700, 256] + - [26, 19670.0] + - - [3500, 1024, 2, 256, 3500, 3500, 3500, 256] + - [3, 19476.0] + - - [3220, 1024, 2, 256, 3220, 3220, 3220, 256] + - [3, 20007.0] + - - [950, 2048, 2, 512, 950, 950, 950, 512] + - [3, 18820.0] + - - [1610, 2048, 1, 1024, 1610, 1610, 1610, 1024] + - [3, 19497.0] + - - [1700, 2048, 1, 1024, 1700, 1700, 1700, 1024] + - [3, 18921.0] + - - [1900, 2048, 1, 1024, 1900, 1900, 1900, 1024] + - [3, 21047.0] + - - [1444, 256, 120, 128, 1444, 1444, 1444, 128] + - [3, 19467.0] + - - [1444, 256, 139, 128, 1444, 1444, 1444, 128] + - [31, 19512.0] + - - [1444, 256, 160, 128, 1444, 1444, 1444, 128] + - [3, 19583.0] + - - [1444, 256, 18, 128, 1444, 1444, 1444, 128] + - [3, 17885.0] + - - [1444, 256, 19, 128, 1444, 1444, 1444, 128] + - [11, 18355.0] + - - [1444, 256, 120, 256, 1444, 1444, 1444, 256] + - [3, 20538.0] + - - [1444, 256, 139, 256, 1444, 1444, 1444, 256] + - [3, 20628.0] + - - [1444, 256, 160, 256, 1444, 1444, 1444, 256] + - [3, 20641.0] + - - [1444, 256, 18, 256, 1444, 1444, 1444, 256] + - [3, 19248.0] + - - [1444, 256, 19, 256, 1444, 1444, 1444, 256] + - [3, 19629.0] + - - [361, 256, 120, 512, 361, 361, 361, 512] + - [3, 20006.0] + - - [361, 256, 139, 512, 361, 361, 361, 512] + - [3, 19648.0] + - - [361, 256, 160, 512, 361, 361, 361, 512] + - [3, 19372.0] + - - [361, 256, 18, 512, 361, 361, 361, 512] + - [31, 16886.0] + - - [361, 256, 19, 512, 361, 361, 361, 512] + - [31, 17973.0] + - - [200716, 128, 1, 64, 200716, 200716, 200716, 64] + - [17, 18253.0] + - - [27436, 128, 1, 64, 27436, 27436, 27436, 64] + - [23, 13394.0] + - - [1024, 1024, 160, 96, 1024, 1024, 1024, 96] + - [3, 21149.0] + - - [1920, 16384, 1, 25216, 1920, 1920, 1920, 25216] + - [16, 22660.0] + - - [3840, 16384, 1, 1920, 3840, 3840, 3840, 1920] + - [14, 23362.0] + - - [1920, 16384, 1, 3840, 1920, 1920, 1920, 3840] + - [3, 22997.0] + - - [960, 16384, 1, 1920, 960, 960, 960, 1920] + - [20, 21277.0] + - - [1920, 16384, 1, 2880, 1920, 1920, 1920, 2880] + - [14, 23575.0] + - - [1024, 1024, 40, 96, 1024, 1024, 1024, 96] + - [3, 20256.0] + - - [1920, 4096, 1, 25216, 1920, 1920, 1920, 25216] + - [11, 22498.0] + - - [3840, 4096, 1, 1920, 3840, 3840, 3840, 1920] + - [14, 23330.0] + - - [1920, 4096, 1, 3840, 1920, 1920, 1920, 3840] + - [3, 22754.0] + - - [960, 4096, 1, 1920, 960, 960, 960, 1920] + - [11, 20173.0] + - - [1920, 4096, 1, 2880, 1920, 1920, 1920, 2880] + - [14, 23061.0] + - - [1024, 1024, 80, 96, 1024, 1024, 1024, 96] + - [11, 20869.0] + - - [1920, 8192, 1, 25216, 1920, 1920, 1920, 25216] + - [11, 22657.0] + - - [3840, 8192, 1, 1920, 3840, 3840, 3840, 1920] + - [24, 23373.0] + - - [1920, 8192, 1, 3840, 1920, 1920, 1920, 3840] + - [16, 22814.0] + - - [960, 8192, 1, 1920, 960, 960, 960, 1920] + - [14, 21175.0] + - - [1920, 8192, 1, 2880, 1920, 1920, 1920, 2880] + - [14, 23536.0] + - - [1024, 1024, 96, 96, 1024, 1024, 1024, 96] + - [3, 20980.0] + - - [2304, 16384, 1, 12672, 2304, 2304, 2304, 12672] + - [3, 22783.0] + - - [2304, 16384, 1, 2304, 2304, 2304, 2304, 2304] + - [16, 22962.0] + - - [576, 16384, 1, 2304, 576, 576, 576, 2304] + - [3, 20398.0] + - - [2304, 16384, 1, 1728, 2304, 2304, 2304, 1728] + - [14, 23469.0] + - - [1024, 1024, 24, 96, 1024, 1024, 1024, 96] + - [11, 19897.0] + - - [2304, 4096, 1, 12672, 2304, 2304, 2304, 12672] + - [1, 22821.0] + - - [2304, 4096, 1, 2304, 2304, 2304, 2304, 2304] + - [3, 22664.0] + - - [576, 4096, 1, 2304, 576, 576, 576, 2304] + - [5, 19651.0] + - - [2304, 4096, 1, 1728, 2304, 2304, 2304, 1728] + - [14, 23165.0] + - - [1024, 1024, 48, 96, 1024, 1024, 1024, 96] + - [11, 20856.0] + - - [2304, 8192, 1, 12672, 2304, 2304, 2304, 12672] + - [1, 23035.0] + - - [2304, 8192, 1, 2304, 2304, 2304, 2304, 2304] + - [26, 22904.0] + - - [576, 8192, 1, 2304, 576, 576, 576, 2304] + - [5, 20191.0] + - - [2304, 8192, 1, 1728, 2304, 2304, 2304, 1728] + - [14, 23425.0] + - - [1024, 1024, 16, 96, 1024, 1024, 1024, 96] + - [3, 19233.0] + - - [3072, 4096, 1, 6400, 3072, 3072, 3072, 6400] + - [16, 22701.0] + - - [1536, 4096, 1, 3072, 1536, 1536, 1536, 3072] + - [3, 21850.0] + - - [3072, 4096, 1, 1536, 3072, 3072, 3072, 1536] + - [11, 22580.0] + - - [384, 4096, 1, 3072, 384, 384, 384, 3072] + - [11, 20341.0] + - - [3072, 4096, 1, 1152, 3072, 3072, 3072, 1152] + - [14, 22882.0] + - - [1024, 1024, 32, 96, 1024, 1024, 1024, 96] + - [11, 20439.0] + - - [3072, 8192, 1, 6400, 3072, 3072, 3072, 6400] + - [3, 22986.0] + - - [1536, 8192, 1, 3072, 1536, 1536, 1536, 3072] + - [21, 22642.0] + - - [3072, 8192, 1, 1536, 3072, 3072, 3072, 1536] + - [16, 22852.0] + - - [384, 8192, 1, 3072, 384, 384, 384, 3072] + - [11, 20952.0] + - - [3072, 8192, 1, 1152, 3072, 3072, 3072, 1152] + - [14, 23363.0] + - - [2048, 4096, 1, 2048, 2048, 2048, 2048, 2048] + - [3, 22464.0] + - - [2048, 4096, 1, 4096, 2048, 2048, 2048, 4096] + - [3, 22487.0] + - - [4096, 4096, 1, 2048, 4096, 4096, 4096, 2048] + - [3, 22702.0] + - - [1024, 2283, 1, 29000, 1024, 1024, 1024, 29000] + - [28, 20167.0] + - - [1024, 2296, 1, 29000, 1024, 1024, 1024, 29000] + - [18, 20298.0] + - - [1024, 2306, 1, 29000, 1024, 1024, 1024, 29000] + - [18, 20239.0] + - - [1024, 2309, 1, 29000, 1024, 1024, 1024, 29000] + - [18, 20259.0] + - - [1024, 2318, 1, 29000, 1024, 1024, 1024, 29000] + - [5, 20345.0] + - - [1024, 2320, 1, 29000, 1024, 1024, 1024, 29000] + - [5, 20359.0] + - - [1024, 2324, 1, 29000, 1024, 1024, 1024, 29000] + - [18, 20401.0] + - - [1024, 2325, 1, 29000, 1024, 1024, 1024, 29000] + - [28, 20404.0] + - - [1024, 2329, 1, 29000, 1024, 1024, 1024, 29000] + - [5, 20435.0] + - - [1024, 2338, 1, 29000, 1024, 1024, 1024, 29000] + - [5, 20508.0] + - - [1024, 2345, 1, 29000, 1024, 1024, 1024, 29000] + - [18, 20582.0] + - - [1024, 2350, 1, 29000, 1024, 1024, 1024, 29000] + - [28, 20621.0] + - - [1024, 2362, 1, 29000, 1024, 1024, 1024, 29000] + - [18, 20721.0] + - - [1024, 2366, 1, 29000, 1024, 1024, 1024, 29000] + - [28, 20768.0] + - - [1024, 2368, 1, 29000, 1024, 1024, 1024, 29000] + - [18, 20807.0] + - - [1024, 2374, 1, 29000, 1024, 1024, 1024, 29000] + - [28, 20852.0] + - - [1024, 2390, 1, 29000, 1024, 1024, 1024, 29000] + - [28, 20970.0] + - - [512, 512, 320, 64, 512, 512, 512, 64] + - [0, 17504.0] + - - [512, 512, 80, 64, 512, 512, 512, 64] + - [26, 18915.0] + - - [2560, 1024, 1, 2560, 2560, 2560, 2560, 2560] + - [5, 22072.0] + - - [2560, 1024, 1, 4096, 2560, 2560, 2560, 4096] + - [11, 22061.0] + - - [4096, 1024, 1, 2560, 4096, 4096, 4096, 2560] + - [3, 21836.0] + - - [1024, 1024, 512, 64, 1024, 1024, 1024, 64] + - [6, 17267.0] + - - [1024, 32768, 1, 3072, 1024, 1024, 1024, 3072] + - [21, 23038.0] + - - [1024, 32768, 1, 4096, 1024, 1024, 1024, 4096] + - [21, 23036.0] + - - [1024, 32768, 1, 50304, 1024, 1024, 1024, 50304] + - [11, 22864.0] + - - [4096, 32768, 1, 1024, 4096, 4096, 4096, 1024] + - [3, 23003.0] + - - [1024, 1024, 24, 128, 1024, 1024, 1024, 128] + - [11, 20852.0] + - - [128, 1024, 24, 1024, 128, 128, 128, 1024] + - [11, 20702.0] + - - [1024, 780, 1, 30522, 1024, 1024, 1024, 30522] + - [35, 17858.0] + - - [1024, 308, 1, 30522, 1024, 1024, 1024, 30522] + - [34, 16875.0] + - - [1024, 800, 1, 30522, 1024, 1024, 1024, 30522] + - [36, 18280.0] + - - [1024, 820, 1, 30522, 1024, 1024, 1024, 30522] + - [35, 18755.0] + - - [1024, 385, 1, 30522, 1024, 1024, 1024, 30522] + - [35, 14784.0] + - - [1024, 462, 1, 30522, 1024, 1024, 1024, 30522] + - [35, 17615.0] + - - [1024, 640, 1, 30528, 1024, 1024, 1024, 30528] + - [35, 21355.0] + - - [2048, 199, 1, 29000, 2048, 2048, 2048, 29000] + - [36, 15217.0] + - - [2048, 221, 1, 29000, 2048, 2048, 2048, 29000] + - [35, 16835.0] + - - [2048, 224, 1, 29000, 2048, 2048, 2048, 29000] + - [35, 17051.0] + - - [2048, 229, 1, 29000, 2048, 2048, 2048, 29000] + - [42, 17539.0] + - - [2048, 234, 1, 29000, 2048, 2048, 2048, 29000] + - [36, 17952.0] + - - [2048, 242, 1, 29000, 2048, 2048, 2048, 29000] + - [42, 18360.0] + - - [2048, 246, 1, 29000, 2048, 2048, 2048, 29000] + - [42, 18608.0] + - - [2048, 247, 1, 29000, 2048, 2048, 2048, 29000] + - [36, 18729.0] + - - [2048, 256, 1, 29000, 2048, 2048, 2048, 29000] + - [35, 19460.0] + - - [2048, 262, 1, 29000, 2048, 2048, 2048, 29000] + - [34, 14717.0] + - - [2048, 264, 1, 29000, 2048, 2048, 2048, 29000] + - [39, 14822.0] + - - [2048, 265, 1, 29000, 2048, 2048, 2048, 29000] + - [41, 14874.0] + - - [2048, 274, 1, 29000, 2048, 2048, 2048, 29000] + - [34, 15378.0] + - - [2048, 277, 1, 29000, 2048, 2048, 2048, 29000] + - [34, 15520.0] + - - [2048, 279, 1, 29000, 2048, 2048, 2048, 29000] + - [34, 15635.0] + - - [2048, 288, 1, 29000, 2048, 2048, 2048, 29000] + - [34, 16100.0] + - - [2048, 296, 1, 29000, 2048, 2048, 2048, 29000] + - [41, 16579.0] + - - [2048, 315, 1, 29000, 2048, 2048, 2048, 29000] + - [34, 17573.0] + - - [2048, 335, 1, 29000, 2048, 2048, 2048, 29000] + - [42, 18166.0] + - - [1024, 561, 1, 29000, 1024, 1024, 1024, 29000] + - [35, 18838.0] + - - [1024, 574, 1, 29000, 1024, 1024, 1024, 29000] + - [35, 19167.0] + - - [1024, 600, 1, 29000, 1024, 1024, 1024, 29000] + - [35, 20025.0] + - - [1024, 608, 1, 29000, 1024, 1024, 1024, 29000] + - [35, 20218.0] + - - [1024, 615, 1, 29000, 1024, 1024, 1024, 29000] + - [35, 20387.0] + - - [1024, 622, 1, 29000, 1024, 1024, 1024, 29000] + - [35, 20684.0] + - - [1024, 625, 1, 29000, 1024, 1024, 1024, 29000] + - [35, 20685.0] + - - [1024, 626, 1, 29000, 1024, 1024, 1024, 29000] + - [35, 20720.0] + - - [1024, 628, 1, 29000, 1024, 1024, 1024, 29000] + - [35, 20794.0] + - - [1024, 636, 1, 29000, 1024, 1024, 1024, 29000] + - [35, 21104.0] + - - [1024, 651, 1, 29000, 1024, 1024, 1024, 29000] + - [40, 17715.0] + - - [1024, 658, 1, 29000, 1024, 1024, 1024, 29000] + - [35, 17855.0] + - - [1024, 669, 1, 29000, 1024, 1024, 1024, 29000] + - [35, 18148.0] + - - [1024, 670, 1, 29000, 1024, 1024, 1024, 29000] + - [40, 18137.0] + - - [1024, 672, 1, 29000, 1024, 1024, 1024, 29000] + - [35, 18200.0] + - - [1024, 684, 1, 29000, 1024, 1024, 1024, 29000] + - [35, 18541.0] + - - [1024, 716, 1, 29000, 1024, 1024, 1024, 29000] + - [37, 19456.0] + - - [1024, 730, 1, 29000, 1024, 1024, 1024, 29000] + - [38, 19866.0] + - - [1024, 1024, 1, 3328, 1024, 1024, 1024, 3328] + - [58, 16293.0] + - - [128, 6784, 1, 3328, 128, 128, 128, 3328] + - [65, 16772.0] + - - [256, 4288, 1, 3328, 256, 256, 256, 3328] + - [58, 16820.0] + - - [704, 1856, 1, 3328, 704, 704, 704, 3328] + - [46, 15670.0] + - - [448, 1024, 1, 1280, 448, 448, 448, 1280] + - [57, 12352.0] + - - [1024, 704, 1, 256, 1024, 1024, 1024, 256] + - [46, 11740.0] + - - [256, 1856, 1, 1280, 256, 256, 256, 1280] + - [64, 15413.0] + - - [256, 2944, 1, 3328, 256, 256, 256, 3328] + - [46, 16159.0] + - - [128, 3584, 1, 1280, 128, 128, 128, 1280] + - [70, 14246.0] + - - [4288, 256, 1, 256, 4288, 4288, 4288, 256] + - [46, 13107.0] + - - [5888, 64, 1, 3328, 5888, 5888, 5888, 3328] + - [58, 13053.0] + - - [2944, 256, 1, 3328, 2944, 2944, 2944, 3328] + - [58, 16264.0] + - - [1408, 448, 1, 1280, 1408, 1408, 1408, 1280] + - [74, 16941.0] + - - [1408, 256, 1, 1280, 1408, 1408, 1408, 1280] + - [45, 12470.0] + - - [6784, 64, 1, 256, 6784, 6784, 6784, 256] + - [52, 9325.0] + - - [2368, 128, 1, 3328, 2368, 2368, 2368, 3328] + - [71, 14378.0] + - - [2944, 128, 1, 256, 2944, 2944, 2944, 256] + - [58, 8162.0] + - - [448, 1408, 1, 256, 448, 448, 448, 256] + - [48, 10751.0] + - - [64, 5056, 1, 3328, 64, 64, 64, 3328] + - [74, 12903.0] + - - [256, 3584, 1, 3328, 256, 256, 256, 3328] + - [59, 17488.0] + - - [256, 1408, 1, 256, 256, 256, 256, 256] + - [68, 7914.0] + - - [5056, 64, 1, 1280, 5056, 5056, 5056, 1280] + - [58, 12831.0] + - - [2368, 128, 1, 256, 2368, 2368, 2368, 256] + - [57, 7225.0] + - - [4288, 128, 1, 1280, 4288, 4288, 4288, 1280] + - [47, 13851.0] + - - [5888, 64, 1, 256, 5888, 5888, 5888, 256] + - [70, 8134.0] + - - [1856, 256, 1, 1280, 1856, 1856, 1856, 1280] + - [44, 14669.0] + - - [64, 5888, 1, 3328, 64, 64, 64, 3328] + - [45, 11443.0] + - - [1024, 704, 1, 1280, 1024, 1024, 1024, 1280] + - [58, 14577.0] + - - [256, 1408, 1, 3328, 256, 256, 256, 3328] + - [46, 12174.0] + - - [6784, 128, 1, 3328, 6784, 6784, 6784, 3328] + - [59, 16599.0] + - - [704, 704, 1, 3328, 704, 704, 704, 3328] + - [57, 14049.0] + - - [3584, 256, 1, 3328, 3584, 3584, 3584, 3328] + - [71, 17504.0] + - - [128, 3584, 1, 3328, 128, 128, 128, 3328] + - [52, 15295.0] + - - [128, 2944, 1, 1280, 128, 128, 128, 1280] + - [52, 11759.0] + - - [448, 1856, 1, 1280, 448, 448, 448, 1280] + - [47, 14890.0] + - - [3584, 128, 1, 256, 3584, 3584, 3584, 256] + - [44, 9787.0] + - - [448, 1408, 1, 3328, 448, 448, 448, 3328] + - [69, 13614.0] + - - [704, 1024, 1, 256, 704, 704, 704, 256] + - [47, 11392.0] + - - [256, 3584, 1, 256, 256, 256, 256, 256] + - [71, 13240.0] + - - [1408, 704, 1, 256, 1408, 1408, 1408, 256] + - [46, 12562.0] + - - [448, 2944, 1, 3328, 448, 448, 448, 3328] + - [47, 15389.0] + - - [64, 5888, 1, 256, 64, 64, 64, 256] + - [51, 7572.0] + - - [448, 2368, 1, 1280, 448, 448, 448, 1280] + - [59, 14709.0] + - - [704, 704, 1, 256, 704, 704, 704, 256] + - [45, 9568.0] + - - [128, 4288, 1, 3328, 128, 128, 128, 3328] + - [51, 15412.0] + - - [256, 2368, 1, 256, 256, 256, 256, 256] + - [46, 11213.0] + - - [1024, 448, 1, 3328, 1024, 1024, 1024, 3328] + - [68, 15347.0] + - - [1856, 704, 1, 1280, 1856, 1856, 1856, 1280] + - [46, 15155.0] + - - [1024, 1024, 1, 1280, 1024, 1024, 1024, 1280] + - [64, 15990.0] + - - [256, 2944, 1, 256, 256, 256, 256, 256] + - [46, 12089.0] + - - [128, 6784, 1, 1280, 128, 128, 128, 1280] + - [53, 15761.0] + - - [1408, 704, 1, 3328, 1408, 1408, 1408, 3328] + - [70, 15365.0] + - - [128, 5888, 1, 1280, 128, 128, 128, 1280] + - [52, 15154.0] + - - [704, 1408, 1, 3328, 704, 704, 704, 3328] + - [46, 15203.0] + - - [448, 704, 1, 1280, 448, 448, 448, 1280] + - [57, 11881.0] + - - [6784, 128, 1, 256, 6784, 6784, 6784, 256] + - [71, 12732.0] + - - [704, 448, 1, 256, 704, 704, 704, 256] + - [69, 7947.0] + - - [256, 1856, 1, 3328, 256, 256, 256, 3328] + - [58, 15832.0] + - - [1024, 704, 1, 3328, 1024, 1024, 1024, 3328] + - [51, 15435.0] + - - [128, 4288, 1, 256, 128, 128, 128, 256] + - [49, 10408.0] + - - [64, 6784, 1, 3328, 64, 64, 64, 3328] + - [69, 13143.0] + - - [2944, 256, 1, 1280, 2944, 2944, 2944, 1280] + - [52, 15283.0] + - - [1856, 704, 1, 256, 1856, 1856, 1856, 256] + - [46, 13097.0] + - - [704, 1856, 1, 256, 704, 704, 704, 256] + - [58, 12905.0] + - - [2944, 448, 1, 256, 2944, 2944, 2944, 256] + - [52, 13593.0] + - - [2368, 128, 1, 1280, 2368, 2368, 2368, 1280] + - [59, 12301.0] + - - [64, 6784, 1, 256, 64, 64, 64, 256] + - [51, 8524.0] + - - [64, 5056, 1, 1280, 64, 64, 64, 1280] + - [51, 11760.0] + - - [704, 448, 1, 3328, 704, 704, 704, 3328] + - [57, 13084.0] + - - [2368, 256, 1, 1280, 2368, 2368, 2368, 1280] + - [47, 15263.0] + - - [2368, 448, 1, 1280, 2368, 2368, 2368, 1280] + - [46, 15587.0] + - - [128, 3584, 1, 256, 128, 128, 128, 256] + - [64, 12058.0] + - - [1856, 448, 1, 3328, 1856, 1856, 1856, 3328] + - [71, 15891.0] + - - [128, 5056, 1, 256, 128, 128, 128, 256] + - [44, 12863.0] + - - [4288, 256, 1, 1280, 4288, 4288, 4288, 1280] + - [51, 16233.0] + - - [4288, 128, 1, 3328, 4288, 4288, 4288, 3328] + - [71, 15141.0] + - - [448, 2368, 1, 3328, 448, 448, 448, 3328] + - [47, 15328.0] + - - [256, 1408, 1, 1280, 256, 256, 256, 1280] + - [46, 11330.0] + - - [128, 2368, 1, 256, 128, 128, 128, 256] + - [64, 7653.0] + - - [6784, 64, 1, 3328, 6784, 6784, 6784, 3328] + - [52, 14499.0] + - - [128, 2944, 1, 3328, 128, 128, 128, 3328] + - [64, 12722.0] + - - [2944, 448, 1, 3328, 2944, 2944, 2944, 3328] + - [70, 15994.0] + - - [5888, 128, 1, 256, 5888, 5888, 5888, 256] + - [44, 12029.0] + - - [5056, 64, 1, 256, 5056, 5056, 5056, 256] + - [45, 7656.0] + - - [128, 5056, 1, 3328, 128, 128, 128, 3328] + - [76, 17602.0] + - - [256, 4288, 1, 1280, 256, 256, 256, 1280] + - [56, 16365.0] + - - [4288, 128, 1, 256, 4288, 4288, 4288, 256] + - [44, 9558.0] + - - [3584, 256, 1, 256, 3584, 3584, 3584, 256] + - [71, 13561.0] + - - [128, 2944, 1, 256, 128, 128, 128, 256] + - [74, 8360.0] + - - [3584, 128, 1, 3328, 3584, 3584, 3584, 3328] + - [58, 15310.0] + - - [5888, 128, 1, 3328, 5888, 5888, 5888, 3328] + - [64, 16043.0] + - - [1408, 704, 1, 1280, 1408, 1408, 1408, 1280] + - [70, 14867.0] + - - [448, 1408, 1, 1280, 448, 448, 448, 1280] + - [45, 13065.0] + - - [704, 1408, 1, 1280, 704, 704, 704, 1280] + - [58, 14658.0] + - - [448, 2944, 1, 256, 448, 448, 448, 256] + - [69, 12636.0] + - - [448, 2368, 1, 256, 448, 448, 448, 256] + - [47, 11922.0] + - - [64, 6784, 1, 1280, 64, 64, 64, 1280] + - [74, 12124.0] + - - [128, 2368, 1, 3328, 128, 128, 128, 3328] + - [76, 14191.0] + - - [5056, 64, 1, 3328, 5056, 5056, 5056, 3328] + - [64, 15318.0] + - - [5056, 128, 1, 3328, 5056, 5056, 5056, 3328] + - [71, 17859.0] + - - [448, 704, 1, 256, 448, 448, 448, 256] + - [45, 7646.0] + - - [1856, 256, 1, 3328, 1856, 1856, 1856, 3328] + - [44, 15947.0] + - - [2944, 128, 1, 3328, 2944, 2944, 2944, 3328] + - [64, 12737.0] + - - [1024, 1024, 1, 256, 1024, 1024, 1024, 256] + - [46, 13146.0] + - - [704, 1024, 1, 1280, 704, 704, 704, 1280] + - [46, 14472.0] + - - [256, 4288, 1, 256, 256, 256, 256, 256] + - [58, 13655.0] + - - [2368, 256, 1, 256, 2368, 2368, 2368, 256] + - [47, 10674.0] + - - [256, 2368, 1, 3328, 256, 256, 256, 3328] + - [59, 16673.0] + - - [704, 448, 1, 1280, 704, 704, 704, 1280] + - [69, 12138.0] + - - [256, 1856, 1, 256, 256, 256, 256, 256] + - [69, 11242.0] + - - [64, 5056, 1, 256, 64, 64, 64, 256] + - [57, 9043.0] + - - [1408, 256, 1, 3328, 1408, 1408, 1408, 3328] + - [70, 12156.0] + - - [2368, 448, 1, 256, 2368, 2368, 2368, 256] + - [70, 13222.0] + - - [4288, 256, 1, 3328, 4288, 4288, 4288, 3328] + - [70, 16732.0] + - - [2944, 256, 1, 256, 2944, 2944, 2944, 256] + - [46, 12059.0] + - - [6784, 64, 1, 1280, 6784, 6784, 6784, 1280] + - [58, 13450.0] + - - [704, 1856, 1, 1280, 704, 704, 704, 1280] + - [70, 15193.0] + - - [448, 1024, 1, 3328, 448, 448, 448, 3328] + - [45, 13024.0] + - - [2944, 448, 1, 1280, 2944, 2944, 2944, 1280] + - [58, 15594.0] + - - [448, 1024, 1, 256, 448, 448, 448, 256] + - [45, 9218.0] + - - [1024, 448, 1, 1280, 1024, 1024, 1024, 1280] + - [56, 14378.0] + - - [256, 2368, 1, 1280, 256, 256, 256, 1280] + - [47, 15305.0] + - - [128, 5056, 1, 1280, 128, 128, 128, 1280] + - [53, 16275.0] + - - [1408, 256, 1, 256, 1408, 1408, 1408, 256] + - [44, 7781.0] + - - [128, 5888, 1, 3328, 128, 128, 128, 3328] + - [64, 15841.0] + - - [2368, 448, 1, 3328, 2368, 2368, 2368, 3328] + - [46, 16191.0] + - - [3584, 128, 1, 1280, 3584, 3584, 3584, 1280] + - [46, 14163.0] + - - [1408, 448, 1, 256, 1408, 1408, 1408, 256] + - [46, 11584.0] + - - [2368, 256, 1, 3328, 2368, 2368, 2368, 3328] + - [71, 16709.0] + - - [5888, 128, 1, 1280, 5888, 5888, 5888, 1280] + - [58, 15206.0] + - - [256, 3584, 1, 1280, 256, 256, 256, 1280] + - [59, 16753.0] + - - [128, 5888, 1, 256, 128, 128, 128, 256] + - [64, 11779.0] + - - [1408, 448, 1, 3328, 1408, 1408, 1408, 3328] + - [70, 16321.0] + - - [64, 5888, 1, 1280, 64, 64, 64, 1280] + - [51, 10643.0] + - - [704, 704, 1, 1280, 704, 704, 704, 1280] + - [57, 13222.0] + - - [128, 2368, 1, 1280, 128, 128, 128, 1280] + - [46, 12662.0] + - - [3584, 256, 1, 1280, 3584, 3584, 3584, 1280] + - [71, 17215.0] + - - [5888, 64, 1, 1280, 5888, 5888, 5888, 1280] + - [45, 12754.0] + - - [5056, 128, 1, 1280, 5056, 5056, 5056, 1280] + - [71, 16326.0] + - - [448, 1856, 1, 3328, 448, 448, 448, 3328] + - [47, 15694.0] + - - [1024, 448, 1, 256, 1024, 1024, 1024, 256] + - [56, 9919.0] + - - [2944, 128, 1, 1280, 2944, 2944, 2944, 1280] + - [44, 12047.0] + - - [256, 2944, 1, 1280, 256, 256, 256, 1280] + - [46, 15259.0] + - - [704, 1024, 1, 3328, 704, 704, 704, 3328] + - [46, 15165.0] + - - [1856, 448, 1, 1280, 1856, 1856, 1856, 1280] + - [59, 15003.0] + - - [128, 6784, 1, 256, 128, 128, 128, 256] + - [65, 12461.0] + - - [704, 1408, 1, 256, 704, 704, 704, 256] + - [46, 12049.0] + - - [448, 2944, 1, 1280, 448, 448, 448, 1280] + - [59, 14914.0] + - - [1856, 256, 1, 256, 1856, 1856, 1856, 256] + - [50, 9458.0] + - - [5056, 128, 1, 256, 5056, 5056, 5056, 256] + - [52, 11240.0] + - - [6784, 128, 1, 1280, 6784, 6784, 6784, 1280] + - [71, 15961.0] + - - [1856, 448, 1, 256, 1856, 1856, 1856, 256] + - [59, 11543.0] + - - [128, 4288, 1, 1280, 128, 128, 128, 1280] + - [76, 13846.0] + - - [448, 704, 1, 3328, 448, 448, 448, 3328] + - [69, 13055.0] + - - [448, 1856, 1, 256, 448, 448, 448, 256] + - [45, 11506.0] + - - [1856, 704, 1, 3328, 1856, 1856, 1856, 3328] + - [70, 15688.0] + - - [64, 193600, 1, 64, 64, 64, 64, 64] + - [63, 13758.0] + - - [1024, 700, 1, 512, 1024, 1024, 1024, 512] + - [74, 14787.0] + - - [2560, 128, 1, 2560, 2560, 2560, 2560, 2560] + - [53, 16169.0] + - - [64, 193600, 1, 256, 64, 64, 64, 256] + - [66, 12379.0] + - - [4096, 128, 1, 4096, 4096, 4096, 4096, 4096] + - [59, 14971.0] + - - [512, 1500, 1, 2816, 512, 512, 512, 2816] + - [70, 16446.0] + - - [3072, 128, 1, 1024, 3072, 3072, 3072, 1024] + - [46, 12947.0] + - - [7680, 64, 1, 2560, 7680, 7680, 7680, 2560] + - [46, 16688.0] + - - [7680, 128, 1, 2560, 7680, 7680, 7680, 2560] + - [71, 18512.0] + - - [512, 1500, 1, 2560, 512, 512, 512, 2560] + - [70, 15917.0] + - - [1024, 1024, 1, 1024, 1024, 1024, 1024, 1024] + - [46, 15414.0] + - - [512, 1500, 1, 2048, 512, 512, 512, 2048] + - [73, 15697.0] + - - [512, 1500, 1, 1536, 512, 512, 512, 1536] + - [70, 16006.0] + - - [3136, 64, 128, 64, 3136, 3136, 3136, 64] + - [58, 16954.0] + - - [3136, 64, 64, 256, 3136, 3136, 3136, 256] + - [58, 17184.0] + - - [3136, 64, 128, 256, 3136, 3136, 3136, 256] + - [58, 16332.0] + - - [3136, 64, 256, 64, 3136, 3136, 3136, 64] + - [43, 12023.0] + - - [3136, 64, 64, 64, 3136, 3136, 3136, 64] + - [56, 16743.0] + - - [3136, 64, 256, 256, 3136, 3136, 3136, 256] + - [58, 16239.0] + - - [64, 128, 512, 128, 64, 64, 64, 128] + - [69, 15410.0] + - - [64, 512, 64, 512, 64, 64, 64, 512] + - [77, 11844.0] + - - [512, 1600, 1, 32, 512, 512, 512, 32] + - [55, 7009.0] + - - [512, 1600, 1, 512, 512, 512, 512, 512] + - [46, 16397.0] + - - [560, 1600, 1, 1024, 560, 560, 560, 1024] + - [46, 13649.0] + - - [1024, 512, 1, 1, 1024, 1024, 1024, 1] + - [55, 85.0] + - - [1024, 512, 1, 64, 1024, 1024, 1024, 64] + - [56, 5131.0] + - - [1024, 512, 1, 1024, 1024, 1024, 1024, 1024] + - [59, 13422.0] + - - [1024, 960, 1, 64, 1024, 1024, 1024, 64] + - [72, 7884.0] + - - [1024, 960, 1, 1024, 1024, 1024, 1024, 1024] + - [52, 16448.0] + - - [1600, 512, 1, 1024, 1600, 1600, 1600, 1024] + - [53, 14599.0] + - - [2048, 512, 1, 1, 2048, 2048, 2048, 1] + - [61, 187.0] + - - [2048, 512, 1, 2048, 2048, 2048, 2048, 2048] + - [46, 15891.0] + - - [64, 192, 64, 1280, 64, 64, 64, 1280] + - [63, 12653.0] + - - [64, 320, 64, 1280, 64, 64, 64, 1280] + - [60, 11385.0] + - - [64, 384, 64, 1280, 64, 64, 64, 1280] + - [54, 12038.0] + - - [64, 448, 64, 1280, 64, 64, 64, 1280] + - [66, 8699.0] + - - [64, 192, 64, 2048, 64, 64, 64, 2048] + - [51, 11569.0] + - - [64, 320, 64, 2048, 64, 64, 64, 2048] + - [77, 10737.0] + - - [64, 384, 64, 2048, 64, 64, 64, 2048] + - [77, 11938.0] + - - [64, 448, 64, 2048, 64, 64, 64, 2048] + - [54, 11617.0] + - - [1225, 64, 64, 192, 1225, 1225, 1225, 192] + - [67, 17567.0] + - - [1225, 64, 64, 256, 1225, 1225, 1225, 256] + - [70, 16778.0] + - - [1225, 64, 64, 288, 1225, 1225, 1225, 288] + - [58, 16921.0] + - - [5329, 80, 64, 64, 5329, 5329, 5329, 64] + - [43, 9952.0] + - - [64, 192, 32, 1280, 64, 64, 64, 1280] + - [63, 11518.0] + - - [64, 320, 32, 1280, 64, 64, 64, 1280] + - [63, 13898.0] + - - [64, 384, 32, 1280, 64, 64, 64, 1280] + - [63, 13138.0] + - - [64, 448, 32, 1280, 64, 64, 64, 1280] + - [63, 11603.0] + - - [64, 192, 32, 2048, 64, 64, 64, 2048] + - [74, 11320.0] + - - [64, 320, 32, 2048, 64, 64, 64, 2048] + - [63, 12988.0] + - - [64, 384, 32, 2048, 64, 64, 64, 2048] + - [51, 12656.0] + - - [64, 448, 32, 2048, 64, 64, 64, 2048] + - [74, 11634.0] + - - [1225, 64, 32, 192, 1225, 1225, 1225, 192] + - [70, 13677.0] + - - [1225, 64, 32, 256, 1225, 1225, 1225, 256] + - [58, 14472.0] + - - [1225, 64, 32, 288, 1225, 1225, 1225, 288] + - [70, 16369.0] + - - [5329, 80, 32, 64, 5329, 5329, 5329, 64] + - [75, 9378.0] + - - [289, 128, 32, 768, 289, 289, 289, 768] + - [47, 12808.0] + - - [289, 160, 32, 768, 289, 289, 289, 768] + - [45, 10548.0] + - - [289, 192, 32, 768, 289, 289, 289, 768] + - [45, 12566.0] + - - [3136, 64, 32, 64, 3136, 3136, 3136, 64] + - [56, 13981.0] + - - [3136, 64, 32, 256, 3136, 3136, 3136, 256] + - [58, 17333.0] + - - [196, 256, 32, 1024, 196, 196, 196, 1024] + - [53, 13322.0] + - - [960, 1024, 1, 1024, 960, 960, 960, 1024] + - [46, 14405.0] + - - [64, 512, 16, 512, 64, 64, 64, 512] + - [51, 10061.0] + - - [64, 512, 128, 512, 64, 64, 64, 512] + - [77, 12065.0] + - - [1024, 512, 1, 2, 1024, 1024, 1024, 2] + - [68, 352.0] + - - [1024, 512, 1, 4096, 1024, 1024, 1024, 4096] + - [47, 14988.0] + - - [1024, 616, 1, 1024, 1024, 1024, 1024, 1024] + - [76, 15198.0] + - - [64, 128, 128, 128, 64, 64, 64, 128] + - [45, 9093.0] + - - [64, 128, 160, 128, 64, 64, 64, 128] + - [45, 9743.0] + - - [1024, 1024, 1, 2, 1024, 1024, 1024, 2] + - [55, 357.0] + - - [1024, 1024, 1, 4096, 1024, 1024, 1024, 4096] + - [58, 16183.0] + - - [64, 128, 624, 128, 64, 64, 64, 128] + - [57, 11832.0] + - - [1024, 780, 1, 1024, 1024, 1024, 1024, 1024] + - [59, 14195.0] + - - [64, 128, 640, 128, 64, 64, 64, 128] + - [69, 12210.0] + - - [1024, 800, 1, 1024, 1024, 1024, 1024, 1024] + - [76, 14458.0] + - - [64, 128, 656, 128, 64, 64, 64, 128] + - [57, 12013.0] + - - [1024, 820, 1, 1024, 1024, 1024, 1024, 1024] + - [71, 14799.0] + - - [64, 512, 80, 512, 64, 64, 64, 512] + - [66, 11419.0] + - - [1024, 385, 1, 1024, 1024, 1024, 1024, 1024] + - [62, 11736.0] + - - [64, 512, 96, 512, 64, 64, 64, 512] + - [54, 11467.0] + - - [1024, 462, 1, 1024, 1024, 1024, 1024, 1024] + - [69, 12221.0] + - - [64, 128, 144, 128, 64, 64, 64, 128] + - [51, 9545.0] + - - [64, 1024, 32, 1024, 64, 64, 64, 1024] + - [77, 8790.0] + - - [96, 1024, 64, 1024, 96, 96, 96, 1024] + - [65, 14182.0] + - - [64, 1024, 256, 1024, 64, 64, 64, 1024] + - [54, 12780.0] + - - [64, 512, 256, 512, 64, 64, 64, 512] + - [77, 12350.0] + - - [64, 1024, 64, 1024, 64, 64, 64, 1024] + - [77, 12622.0] + - - [64, 1024, 128, 1024, 64, 64, 64, 1024] + - [54, 12564.0] + - - [96, 1024, 128, 1024, 96, 96, 96, 1024] + - [76, 14414.0] + - - [64, 512, 40, 512, 64, 64, 64, 512] + - [54, 10598.0] + - - [64, 128, 1024, 128, 64, 64, 64, 128] + - [57, 16632.0] + - - [1024, 864, 1, 1024, 1024, 1024, 1024, 1024] + - [59, 16242.0] + - - [1024, 864, 1, 512, 1024, 1024, 1024, 512] + - [71, 15099.0] + - - [256, 3456, 1, 128, 256, 256, 256, 128] + - [47, 13418.0] + - - [256, 4096, 1, 128, 256, 256, 256, 128] + - [44, 13752.0] + - - [480, 864, 1, 1024, 480, 480, 480, 1024] + - [46, 13083.0] + - - [512, 864, 1, 256, 512, 512, 512, 256] + - [44, 10166.0] + - - [64, 128, 1280, 128, 64, 64, 64, 128] + - [45, 15252.0] + - - [64, 128, 1312, 128, 64, 64, 64, 128] + - [45, 14259.0] + - - [64, 512, 192, 512, 64, 64, 64, 512] + - [66, 12252.0] + - - [256, 4096, 1, 1, 256, 256, 256, 1] + - [50, 286.0] + - - [12544, 64, 1, 147, 12544, 12544, 12544, 147] + - [56, 12555.0] + - - [64, 128, 2048, 128, 64, 64, 64, 128] + - [57, 11788.0] + - - [64, 128, 1536, 128, 64, 64, 64, 128] + - [45, 13925.0] + - - [64, 128, 192, 128, 64, 64, 64, 128] + - [45, 12489.0] + - - [64, 384, 144, 384, 64, 64, 64, 384] + - [57, 17325.0] + - - [64, 512, 48, 512, 64, 64, 64, 512] + - [54, 11197.0] + - - [64, 128, 256, 128, 64, 64, 64, 128] + - [57, 13172.0] + - - [64, 384, 192, 384, 64, 64, 64, 384] + - [74, 16024.0] + - - [3400, 256, 1, 1024, 3400, 3400, 3400, 1024] + - [47, 16294.0] + - - [3800, 256, 1, 1024, 3800, 3800, 3800, 1024] + - [47, 17955.0] + - - [864, 512, 2, 2048, 864, 864, 864, 2048] + - [53, 16222.0] + - - [888, 512, 2, 2048, 888, 888, 888, 2048] + - [53, 16619.0] + - - [51520, 64, 2, 256, 51520, 51520, 51520, 256] + - [69, 17538.0] + - - [46464, 64, 2, 256, 46464, 46464, 46464, 256] + - [75, 17679.0] + - - [49152, 64, 2, 256, 49152, 49152, 49152, 256] + - [64, 17781.0] + - - [1536, 512, 1, 1024, 1536, 1536, 1536, 1024] + - [46, 16468.0] + - - [1728, 512, 1, 1024, 1728, 1728, 1728, 1024] + - [47, 16472.0] + - - [1024, 1024, 1, 320, 1024, 1024, 1024, 320] + - [46, 13607.0] + - - [51520, 64, 2, 64, 51520, 51520, 51520, 64] + - [67, 14564.0] + - - [55296, 64, 2, 64, 55296, 55296, 55296, 64] + - [58, 14774.0] + - - [49152, 64, 2, 64, 49152, 49152, 49152, 64] + - [46, 14360.0] + - - [54400, 64, 2, 64, 54400, 54400, 54400, 64] + - [73, 15703.0] + - - [42240, 64, 2, 256, 42240, 42240, 42240, 256] + - [52, 17105.0] + - - [672, 512, 2, 2048, 672, 672, 672, 2048] + - [52, 14099.0] + - - [54400, 64, 2, 256, 54400, 54400, 54400, 256] + - [64, 17230.0] + - - [56832, 64, 2, 256, 56832, 56832, 56832, 256] + - [70, 17227.0] + - - [55296, 64, 2, 256, 55296, 55296, 55296, 256] + - [46, 17319.0] + - - [60800, 64, 2, 64, 60800, 60800, 60800, 64] + - [52, 16482.0] + - - [768, 512, 2, 2048, 768, 768, 768, 2048] + - [64, 16213.0] + - - [43008, 64, 2, 256, 43008, 43008, 43008, 256] + - [46, 17020.0] + - - [864, 256, 2, 2048, 864, 864, 864, 2048] + - [52, 14165.0] + - - [768, 256, 2, 2048, 768, 768, 768, 2048] + - [75, 12840.0] + - - [45632, 64, 2, 256, 45632, 45632, 45632, 256] + - [46, 16607.0] + - - [60800, 64, 2, 256, 60800, 60800, 60800, 256] + - [75, 17868.0] + - - [1024, 1024, 1, 81, 1024, 1024, 1024, 81] + - [72, 11447.0] + - - [950, 512, 2, 2048, 950, 950, 950, 2048] + - [52, 14995.0] + - - [850, 512, 2, 2048, 850, 850, 850, 2048] + - [53, 15927.0] + - - [805, 512, 2, 2048, 805, 805, 805, 2048] + - [53, 15057.0] + - - [950, 256, 2, 2048, 950, 950, 950, 2048] + - [74, 13257.0] + - - [1900, 512, 1, 1024, 1900, 1900, 1900, 1024] + - [53, 17046.0] + - - [1700, 512, 1, 1024, 1700, 1700, 1700, 1024] + - [47, 15320.0] + - - [1610, 512, 1, 1024, 1610, 1610, 1610, 1024] + - [53, 14731.0] + - - [660, 512, 2, 2048, 660, 660, 660, 2048] + - [52, 13808.0] + - - [726, 512, 2, 2048, 726, 726, 726, 2048] + - [52, 15207.0] + - - [713, 512, 2, 2048, 713, 713, 713, 2048] + - [52, 14893.0] + - - [805, 256, 2, 2048, 805, 805, 805, 2048] + - [46, 13132.0] + - - [850, 256, 2, 2048, 850, 850, 850, 2048] + - [75, 14413.0] + - - [100, 128, 120, 512, 100, 100, 100, 512] + - [76, 12859.0] + - - [100, 128, 139, 512, 100, 100, 100, 512] + - [53, 13141.0] + - - [100, 128, 160, 512, 100, 100, 100, 512] + - [76, 13320.0] + - - [22500, 64, 1, 147, 22500, 22500, 22500, 147] + - [70, 11567.0] + - - [96, 1024, 160, 1024, 96, 96, 96, 1024] + - [53, 14498.0] + - - [96, 1024, 40, 1024, 96, 96, 96, 1024] + - [76, 14425.0] + - - [96, 1024, 80, 1024, 96, 96, 96, 1024] + - [76, 14331.0] + - - [96, 1024, 96, 1024, 96, 96, 96, 1024] + - [53, 14318.0] + - - [96, 1024, 24, 1024, 96, 96, 96, 1024] + - [53, 13791.0] + - - [96, 1024, 48, 1024, 96, 96, 96, 1024] + - [53, 13950.0] + - - [96, 1024, 16, 1024, 96, 96, 96, 1024] + - [52, 13071.0] + - - [96, 1024, 32, 1024, 96, 96, 96, 1024] + - [53, 14123.0] + - - [64, 512, 320, 512, 64, 64, 64, 512] + - [77, 12448.0] + - - [64, 1024, 512, 1024, 64, 64, 64, 1024] + - [54, 12802.0] + - - [1024, 77, 1, 30522, 1024, 1024, 1024, 30522] + - [78, 8903.0] + - - [1024, 200, 1, 30522, 1024, 1024, 1024, 30522] + - [84, 11111.0] + - - [1024, 160, 1, 30522, 1024, 1024, 1024, 30522] + - [79, 11800.0] + - - [1024, 180, 1, 30522, 1024, 1024, 1024, 30522] + - [84, 12894.0] + - - [1024, 160, 1, 30528, 1024, 1024, 1024, 30528] + - [82, 11885.0] + - - [1024, 240, 1, 30528, 1024, 1024, 1024, 30528] + - [80, 13068.0] + - - [2560, 109, 1, 29000, 2560, 2560, 2560, 29000] + - [85, 15723.0] + - - [2560, 121, 1, 29000, 2560, 2560, 2560, 29000] + - [83, 17338.0] + - - [2560, 65, 1, 29000, 2560, 2560, 2560, 29000] + - [83, 9596.0] + - - [2560, 66, 1, 29000, 2560, 2560, 2560, 29000] + - [85, 9741.0] + - - [2560, 67, 1, 29000, 2560, 2560, 2560, 29000] + - [80, 9896.0] + - - [2560, 69, 1, 29000, 2560, 2560, 2560, 29000] + - [80, 10167.0] + - - [2560, 70, 1, 29000, 2560, 2560, 2560, 29000] + - [85, 10346.0] + - - [2560, 71, 1, 29000, 2560, 2560, 2560, 29000] + - [80, 10460.0] + - - [2560, 73, 1, 29000, 2560, 2560, 2560, 29000] + - [80, 10749.0] + - - [2560, 74, 1, 29000, 2560, 2560, 2560, 29000] + - [81, 10887.0] + - - [2560, 75, 1, 29000, 2560, 2560, 2560, 29000] + - [85, 11048.0] + - - [2560, 77, 1, 29000, 2560, 2560, 2560, 29000] + - [85, 11322.0] + - - [2560, 78, 1, 29000, 2560, 2560, 2560, 29000] + - [80, 11456.0] + - - [2560, 80, 1, 29000, 2560, 2560, 2560, 29000] + - [80, 11732.0] + - - [2560, 81, 1, 29000, 2560, 2560, 2560, 29000] + - [85, 11860.0] + - - [2560, 82, 1, 29000, 2560, 2560, 2560, 29000] + - [83, 11999.0] + - - [2560, 83, 1, 29000, 2560, 2560, 2560, 29000] + - [80, 12160.0] + - - [2560, 84, 1, 29000, 2560, 2560, 2560, 29000] + - [85, 12273.0] + - - [2560, 88, 1, 29000, 2560, 2560, 2560, 29000] + - [80, 12824.0] + - - [2560, 89, 1, 29000, 2560, 2560, 2560, 29000] + - [80, 13012.0] + - - [2560, 90, 1, 29000, 2560, 2560, 2560, 29000] + - [80, 13097.0] + - - [2560, 92, 1, 29000, 2560, 2560, 2560, 29000] + - [80, 13373.0] + - - [2560, 95, 1, 29000, 2560, 2560, 2560, 29000] + - [83, 13794.0] + - - [2560, 98, 1, 29000, 2560, 2560, 2560, 29000] + - [85, 14248.0] + - - [2368, 64, 1, 3328, 2368, 2368, 2368, 3328] + - [115, 7537.0] + - - [256, 704, 1, 1280, 256, 256, 256, 1280] + - [111, 7684.0] + - - [1408, 64, 1, 1280, 1408, 1408, 1408, 1280] + - [94, 5158.0] + - - [1024, 256, 1, 3328, 1024, 1024, 1024, 3328] + - [111, 8244.0] + - - [704, 128, 1, 1280, 704, 704, 704, 1280] + - [104, 5310.0] + - - [64, 3584, 1, 3328, 64, 64, 64, 3328] + - [100, 8546.0] + - - [1024, 256, 1, 256, 1024, 1024, 1024, 256] + - [111, 5992.0] + - - [448, 448, 1, 256, 448, 448, 448, 256] + - [111, 5525.0] + - - [128, 1024, 1, 3328, 128, 128, 128, 3328] + - [136, 6621.0] + - - [64, 1856, 1, 1280, 64, 64, 64, 1280] + - [106, 6788.0] + - - [448, 256, 1, 256, 448, 448, 448, 256] + - [106, 3745.0] + - - [256, 1024, 1, 256, 256, 256, 256, 256] + - [126, 5907.0] + - - [1024, 128, 1, 1280, 1024, 1024, 1024, 1280] + - [106, 6013.0] + - - [448, 256, 1, 3328, 448, 448, 448, 3328] + - [94, 7414.0] + - - [128, 704, 1, 1280, 128, 128, 128, 1280] + - [113, 5315.0] + - - [1856, 128, 1, 3328, 1856, 1856, 1856, 3328] + - [89, 9023.0] + - - [256, 448, 1, 256, 256, 256, 256, 256] + - [128, 3754.0] + - - [448, 448, 1, 3328, 448, 448, 448, 3328] + - [126, 8939.0] + - - [1408, 128, 1, 1280, 1408, 1408, 1408, 1280] + - [126, 7563.0] + - - [128, 1856, 1, 1280, 128, 128, 128, 1280] + - [111, 8359.0] + - - [64, 1408, 1, 3328, 64, 64, 64, 3328] + - [106, 5860.0] + - - [256, 704, 1, 256, 256, 256, 256, 256] + - [126, 4982.0] + - - [128, 1408, 1, 256, 128, 128, 128, 256] + - [92, 4993.0] + - - [256, 448, 1, 3328, 256, 256, 256, 3328] + - [94, 7466.0] + - - [64, 2368, 1, 1280, 64, 64, 64, 1280] + - [119, 6874.0] + - - [2368, 64, 1, 256, 2368, 2368, 2368, 256] + - [117, 4778.0] + - - [704, 128, 1, 3328, 704, 704, 704, 3328] + - [94, 5855.0] + - - [4288, 64, 1, 1280, 4288, 4288, 4288, 1280] + - [111, 8258.0] + - - [128, 1024, 1, 1280, 128, 128, 128, 1280] + - [131, 6418.0] + - - [128, 1024, 1, 256, 128, 128, 128, 256] + - [131, 3911.0] + - - [1856, 64, 1, 256, 1856, 1856, 1856, 256] + - [94, 3949.0] + - - [704, 128, 1, 256, 704, 704, 704, 256] + - [86, 2950.0] + - - [448, 256, 1, 1280, 448, 448, 448, 1280] + - [94, 6536.0] + - - [1856, 128, 1, 1280, 1856, 1856, 1856, 1280] + - [89, 8629.0] + - - [64, 3584, 1, 256, 64, 64, 64, 256] + - [100, 5519.0] + - - [64, 1856, 1, 256, 64, 64, 64, 256] + - [106, 3909.0] + - - [256, 1024, 1, 1280, 256, 256, 256, 1280] + - [104, 8427.0] + - - [3584, 64, 1, 1280, 3584, 3584, 3584, 1280] + - [89, 8247.0] + - - [1408, 128, 1, 3328, 1408, 1408, 1408, 3328] + - [126, 8180.0] + - - [64, 2944, 1, 3328, 64, 64, 64, 3328] + - [131, 7978.0] + - - [64, 4288, 1, 3328, 64, 64, 64, 3328] + - [119, 8646.0] + - - [64, 2944, 1, 256, 64, 64, 64, 256] + - [106, 5045.0] + - - [64, 1408, 1, 1280, 64, 64, 64, 1280] + - [113, 5325.0] + - - [64, 2944, 1, 1280, 64, 64, 64, 1280] + - [131, 7476.0] + - - [704, 256, 1, 256, 704, 704, 704, 256] + - [111, 4972.0] + - - [256, 448, 1, 1280, 256, 256, 256, 1280] + - [124, 6583.0] + - - [704, 256, 1, 1280, 704, 704, 704, 1280] + - [89, 7549.0] + - - [64, 2368, 1, 3328, 64, 64, 64, 3328] + - [94, 7834.0] + - - [256, 704, 1, 3328, 256, 256, 256, 3328] + - [111, 8319.0] + - - [2944, 64, 1, 1280, 2944, 2944, 2944, 1280] + - [111, 7725.0] + - - [128, 1408, 1, 3328, 128, 128, 128, 3328] + - [111, 8053.0] + - - [1408, 64, 1, 256, 1408, 1408, 1408, 256] + - [106, 2980.0] + - - [64, 2368, 1, 256, 64, 64, 64, 256] + - [119, 4409.0] + - - [1024, 128, 1, 3328, 1024, 1024, 1024, 3328] + - [128, 6556.0] + - - [2368, 64, 1, 1280, 2368, 2368, 2368, 1280] + - [94, 6963.0] + - - [4288, 64, 1, 256, 4288, 4288, 4288, 256] + - [89, 6184.0] + - - [64, 4288, 1, 1280, 64, 64, 64, 1280] + - [100, 8215.0] + - - [1408, 64, 1, 3328, 1408, 1408, 1408, 3328] + - [94, 5850.0] + - - [2944, 64, 1, 256, 2944, 2944, 2944, 256] + - [111, 5243.0] + - - [448, 448, 1, 1280, 448, 448, 448, 1280] + - [126, 8287.0] + - - [1024, 256, 1, 1280, 1024, 1024, 1024, 1280] + - [111, 8164.0] + - - [3584, 64, 1, 3328, 3584, 3584, 3584, 3328] + - [111, 8708.0] + - - [256, 1024, 1, 3328, 256, 256, 256, 3328] + - [89, 8580.0] + - - [1856, 64, 1, 3328, 1856, 1856, 1856, 3328] + - [94, 7567.0] + - - [1856, 64, 1, 1280, 1856, 1856, 1856, 1280] + - [94, 6728.0] + - - [1024, 128, 1, 256, 1024, 1024, 1024, 256] + - [94, 3848.0] + - - [64, 3584, 1, 1280, 64, 64, 64, 1280] + - [131, 7821.0] + - - [3584, 64, 1, 256, 3584, 3584, 3584, 256] + - [89, 5757.0] + - - [64, 1856, 1, 3328, 64, 64, 64, 3328] + - [115, 7664.0] + - - [1408, 128, 1, 256, 1408, 1408, 1408, 256] + - [111, 5037.0] + - - [128, 704, 1, 256, 128, 128, 128, 256] + - [128, 2913.0] + - - [128, 704, 1, 3328, 128, 128, 128, 3328] + - [115, 5887.0] + - - [128, 1856, 1, 256, 128, 128, 128, 256] + - [89, 5737.0] + - - [64, 4288, 1, 256, 64, 64, 64, 256] + - [100, 6098.0] + - - [704, 256, 1, 3328, 704, 704, 704, 3328] + - [89, 8271.0] + - - [1856, 128, 1, 256, 1856, 1856, 1856, 256] + - [111, 5928.0] + - - [4288, 64, 1, 3328, 4288, 4288, 4288, 3328] + - [131, 8730.0] + - - [64, 1408, 1, 256, 64, 64, 64, 256] + - [128, 2898.0] + - - [2944, 64, 1, 3328, 2944, 2944, 2944, 3328] + - [89, 8424.0] + - - [128, 1408, 1, 1280, 128, 128, 128, 1280] + - [126, 7446.0] + - - [128, 1856, 1, 3328, 128, 128, 128, 3328] + - [111, 8860.0] + - - [1760, 64, 1, 1760, 1760, 1760, 1760, 1760] + - [106, 6107.0] + - - [2560, 32, 1, 2560, 2560, 2560, 2560, 2560] + - [124, 6332.0] + - - [2048, 128, 1, 2048, 2048, 2048, 2048, 2048] + - [111, 8524.0] + - - [4608, 32, 1, 1536, 4608, 4608, 4608, 1536] + - [106, 6781.0] + - - [3072, 64, 1, 1024, 3072, 3072, 3072, 1024] + - [89, 7822.0] + - - [128, 1500, 1, 1280, 128, 128, 128, 1280] + - [111, 7772.0] + - - [4096, 32, 1, 4096, 4096, 4096, 4096, 4096] + - [94, 6555.0] + - - [1760, 128, 1, 1760, 1760, 1760, 1760, 1760] + - [126, 8158.0] + - - [4096, 64, 1, 4096, 4096, 4096, 4096, 4096] + - [126, 8662.0] + - - [7680, 32, 1, 2560, 7680, 7680, 7680, 2560] + - [89, 9467.0] + - - [2560, 64, 1, 2560, 2560, 2560, 2560, 2560] + - [115, 7866.0] + - - [3072, 32, 1, 1024, 3072, 3072, 3072, 1024] + - [128, 5200.0] + - - [6144, 32, 1, 2560, 6144, 6144, 6144, 2560] + - [111, 8397.0] + - - [176, 1500, 1, 1408, 176, 176, 176, 1408] + - [126, 7434.0] + - - [2048, 64, 1, 2048, 2048, 2048, 2048, 2048] + - [94, 6272.0] + - - [8448, 32, 1, 2816, 8448, 8448, 8448, 2816] + - [111, 8619.0] + - - [512, 512, 1, 64, 512, 512, 512, 64] + - [119, 3095.0] + - - [32, 33, 1600, 33, 32, 32, 32, 33] + - [87, 3212.0] + - - [256, 1024, 1, 1, 256, 256, 256, 1] + - [111, 58.0] + - - [257, 1024, 1, 4096, 257, 257, 257, 4096] + - [119, 7432.0] + - - [512, 200, 1, 1, 512, 512, 512, 1] + - [90, 24.0] + - - [512, 200, 1, 32, 512, 512, 512, 32] + - [90, 731.0] + - - [512, 215, 1, 2048, 512, 512, 512, 2048] + - [124, 6718.0] + - - [512, 256, 1, 2048, 512, 512, 512, 2048] + - [119, 6714.0] + - - [560, 200, 1, 1024, 560, 560, 560, 1024] + - [131, 5828.0] + - - [768, 215, 1, 2048, 768, 768, 768, 2048] + - [111, 7186.0] + - - [768, 256, 1, 2048, 768, 768, 768, 2048] + - [111, 8459.0] + - - [1024, 200, 1, 1, 1024, 1024, 1024, 1] + - [111, 94.0] + - - [64, 32, 4608, 32, 64, 64, 64, 32] + - [126, 9710.0] + - - [64, 34, 4736, 34, 64, 64, 64, 34] + - [111, 5244.0] + - - [64, 35, 4608, 32, 64, 64, 64, 32] + - [126, 5617.0] + - - [64, 35, 4608, 35, 64, 64, 64, 35] + - [111, 5341.0] + - - [64, 33, 1920, 27, 64, 64, 64, 27] + - [89, 4651.0] + - - [64, 33, 1920, 33, 64, 64, 64, 33] + - [126, 4752.0] + - - [1225, 32, 64, 192, 1225, 1225, 1225, 192] + - [89, 9798.0] + - - [1225, 48, 64, 192, 1225, 1225, 1225, 192] + - [131, 7555.0] + - - [1225, 48, 64, 256, 1225, 1225, 1225, 256] + - [119, 7594.0] + - - [1225, 48, 64, 288, 1225, 1225, 1225, 288] + - [126, 7592.0] + - - [1225, 32, 32, 192, 1225, 1225, 1225, 192] + - [111, 9350.0] + - - [1225, 48, 32, 192, 1225, 1225, 1225, 192] + - [119, 7382.0] + - - [1225, 48, 32, 256, 1225, 1225, 1225, 256] + - [131, 7354.0] + - - [1225, 48, 32, 288, 1225, 1225, 1225, 288] + - [89, 7421.0] + - - [49, 2048, 64, 512, 49, 49, 49, 512] + - [106, 7076.0] + - - [49, 512, 64, 2048, 49, 49, 49, 2048] + - [124, 6950.0] + - - [49, 2048, 32, 512, 49, 49, 49, 512] + - [102, 7054.0] + - - [49, 512, 32, 2048, 49, 49, 49, 2048] + - [136, 6579.0] + - - [49, 2048, 64, 1024, 49, 49, 49, 1024] + - [119, 7788.0] + - - [49, 1024, 64, 2048, 49, 49, 49, 2048] + - [124, 7072.0] + - - [49, 2048, 32, 1024, 49, 49, 49, 1024] + - [131, 7491.0] + - - [49, 1024, 32, 2048, 49, 49, 49, 2048] + - [106, 6946.0] + - - [480, 512, 1, 512, 480, 480, 480, 512] + - [126, 7472.0] + - - [512, 480, 1, 512, 512, 512, 512, 512] + - [111, 8812.0] + - - [512, 512, 1, 512, 512, 512, 512, 512] + - [126, 8234.0] + - - [1024, 160, 1, 1024, 1024, 1024, 1024, 1024] + - [94, 8097.0] + - - [1024, 200, 1, 1024, 1024, 1024, 1024, 1024] + - [111, 7217.0] + - - [1024, 308, 1, 1024, 1024, 1024, 1024, 1024] + - [111, 8402.0] + - - [1024, 180, 1, 1024, 1024, 1024, 1024, 1024] + - [111, 7379.0] + - - [256, 864, 1, 128, 256, 256, 256, 128] + - [111, 4056.0] + - - [3136, 64, 1, 576, 3136, 3136, 3136, 576] + - [111, 7189.0] + - - [784, 128, 1, 1152, 784, 784, 784, 1152] + - [134, 5723.0] + - - [1024, 128, 1, 1024, 1024, 1024, 1024, 1024] + - [94, 5795.0] + - - [1024, 128, 1, 2, 1024, 1024, 1024, 2] + - [86, 137.0] + - - [1024, 96, 1, 1024, 1024, 1024, 1024, 1024] + - [94, 5430.0] + - - [1024, 96, 1, 2, 1024, 1024, 1024, 2] + - [86, 106.0] + - - [49, 2048, 128, 512, 49, 49, 49, 512] + - [100, 7178.0] + - - [49, 2048, 256, 512, 49, 49, 49, 512] + - [100, 7699.0] + - - [49, 512, 128, 2048, 49, 49, 49, 2048] + - [124, 7078.0] + - - [49, 512, 256, 2048, 49, 49, 49, 2048] + - [106, 7139.0] + - - [100, 128, 18, 512, 100, 100, 100, 512] + - [94, 5771.0] + - - [100, 128, 19, 512, 100, 100, 100, 512] + - [106, 6062.0] + - - [1444, 128, 1, 576, 1444, 1444, 1444, 576] + - [89, 7681.0] + - - [361, 512, 1, 2304, 361, 361, 361, 2304] + - [111, 8158.0] + - - [2560, 35, 1, 29000, 2560, 2560, 2560, 29000] + - [106, 4700.0] + - - [2560, 36, 1, 29000, 2560, 2560, 2560, 29000] + - [94, 4809.0] + - - [2560, 39, 1, 29000, 2560, 2560, 2560, 29000] + - [94, 5222.0] + - - [2560, 40, 1, 29000, 2560, 2560, 2560, 29000] + - [94, 5392.0] + - - [2560, 42, 1, 29000, 2560, 2560, 2560, 29000] + - [106, 5659.0] + - - [2560, 43, 1, 29000, 2560, 2560, 2560, 29000] + - [94, 5810.0] + - - [2560, 44, 1, 29000, 2560, 2560, 2560, 29000] + - [94, 5930.0] + - - [2560, 46, 1, 29000, 2560, 2560, 2560, 29000] + - [94, 6189.0] + - - [2560, 48, 1, 29000, 2560, 2560, 2560, 29000] + - [94, 6463.0] + - - [2560, 49, 1, 29000, 2560, 2560, 2560, 29000] + - [94, 6605.0] + - - [2560, 50, 1, 29000, 2560, 2560, 2560, 29000] + - [106, 6747.0] + - - [2560, 51, 1, 29000, 2560, 2560, 2560, 29000] + - [94, 6851.0] + - - [2560, 53, 1, 29000, 2560, 2560, 2560, 29000] + - [94, 7158.0] + - - [2560, 54, 1, 29000, 2560, 2560, 2560, 29000] + - [94, 7299.0] + - - [2560, 55, 1, 29000, 2560, 2560, 2560, 29000] + - [94, 7446.0] + - - [2560, 56, 1, 29000, 2560, 2560, 2560, 29000] + - [94, 7559.0] + - - [2560, 57, 1, 29000, 2560, 2560, 2560, 29000] + - [94, 7703.0] + - - [2560, 58, 1, 29000, 2560, 2560, 2560, 29000] + - [106, 7829.0] + - - [2560, 59, 1, 29000, 2560, 2560, 2560, 29000] + - [94, 7990.0] + - - [2560, 61, 1, 29000, 2560, 2560, 2560, 29000] + - [94, 8216.0] + - - [2560, 63, 1, 29000, 2560, 2560, 2560, 29000] + - [106, 8495.0] + - - [1909283, 40, 1, 40, 1909283, 1909283, 1909283, 40] + - [126, 6086.0] + - - [3818566, 40, 1, 40, 3818566, 3818566, 3818566, 40] + - [119, 6141.0] + - - [1760, 32, 1, 1760, 1760, 1760, 1760, 1760] + - [144, 4766.0] + - - [7680, 4, 1, 2560, 7680, 7680, 7680, 2560] + - [146, 2968.0] + - - [3072, 16, 1, 1024, 3072, 3072, 3072, 1024] + - [163, 4384.0] + - - [2048, 16, 1, 2048, 2048, 2048, 2048, 2048] + - [149, 3852.0] + - - [3072, 1, 1, 128, 3072, 3072, 3072, 128] + - [161, 75.0] + - - [8448, 16, 1, 2816, 8448, 8448, 8448, 2816] + - [151, 6137.0] + - - [7680, 2, 1, 2560, 7680, 7680, 7680, 2560] + - [164, 1477.0] + - - [4224, 1, 1, 128, 4224, 4224, 4224, 128] + - [154, 100.0] + - - [7680, 1, 1, 2560, 7680, 7680, 7680, 2560] + - [162, 676.0] + - - [6144, 2, 1, 2560, 6144, 6144, 6144, 2560] + - [149, 1117.0] + - - [1760, 16, 1, 1760, 1760, 1760, 1760, 1760] + - [137, 3432.0] + - - [6144, 4, 1, 2560, 6144, 6144, 6144, 2560] + - [164, 2401.0] + - - [3072, 4, 1, 1024, 3072, 3072, 3072, 1024] + - [163, 1045.0] + - - [2048, 32, 1, 2048, 2048, 2048, 2048, 2048] + - [156, 4898.0] + - - [4608, 16, 1, 1536, 4608, 4608, 4608, 1536] + - [141, 5143.0] + - - [3072, 2, 1, 1024, 3072, 3072, 3072, 1024] + - [157, 519.0] + - - [8448, 1, 1, 2816, 8448, 8448, 8448, 2816] + - [155, 663.0] + - - [6144, 1, 1, 2560, 6144, 6144, 6144, 2560] + - [152, 572.0] + - - [4608, 1, 1, 1536, 4608, 4608, 4608, 1536] + - [145, 406.0] + - - [8448, 4, 1, 2816, 8448, 8448, 8448, 2816] + - [165, 2632.0] + - - [4608, 2, 1, 1536, 4608, 4608, 4608, 1536] + - [157, 817.0] + - - [2560, 16, 1, 2560, 2560, 2560, 2560, 2560] + - [164, 4329.0] + - - [6144, 16, 1, 2560, 6144, 6144, 6144, 2560] + - [156, 6211.0] + - - [4096, 16, 1, 4096, 4096, 4096, 4096, 4096] + - [141, 5335.0] + - - [7680, 16, 1, 2560, 7680, 7680, 7680, 2560] + - [160, 6383.0] + - - [3072, 1, 1, 1024, 3072, 3072, 3072, 1024] + - [159, 279.0] + - - [8448, 2, 1, 2816, 8448, 8448, 8448, 2816] + - [162, 1330.0] + - - [4608, 4, 1, 1536, 4608, 4608, 4608, 1536] + - [157, 1631.0] + - - [2048, 2, 1, 2048, 2048, 2048, 2048, 2048] + - [149, 483.0] + - - [2048, 2, 1, 2, 2048, 2048, 2048, 2] + - [138, 2.0] + - - [2560, 4, 1, 2, 2560, 2560, 2560, 2] + - [138, 5.0] + - - [2560, 4, 1, 2560, 2560, 2560, 2560, 2560] + - [149, 1227.0] + - - [2048, 1, 1, 512, 2048, 2048, 2048, 512] + - [145, 133.0] + - - [12288, 12, 2, 256, 12288, 12288, 12288, 256] + - [147, 4108.0] + - - [12288, 3, 2, 256, 12288, 12288, 12288, 256] + - [139, 1434.0] + - - [51520, 12, 2, 256, 51520, 51520, 51520, 256] + - [147, 5415.0] + - - [51520, 3, 2, 256, 51520, 51520, 51520, 256] + - [165, 2501.0] + - - [15200, 12, 2, 256, 15200, 15200, 15200, 256] + - [153, 4430.0] + - - [15200, 3, 2, 256, 15200, 15200, 15200, 256] + - [158, 1604.0] + - - [3456, 3, 2, 256, 3456, 3456, 3456, 256] + - [142, 750.0] + - - [13600, 12, 2, 256, 13600, 13600, 13600, 256] + - [143, 4255.0] + - - [12880, 3, 2, 256, 12880, 12880, 12880, 256] + - [158, 1411.0] + - - [3400, 3, 2, 256, 3400, 3400, 3400, 256] + - [154, 740.0] + - - [12880, 12, 2, 256, 12880, 12880, 12880, 256] + - [141, 4205.0] + - - [13824, 12, 2, 256, 13824, 13824, 13824, 256] + - [141, 4290.0] + - - [13824, 3, 2, 256, 13824, 13824, 13824, 256] + - [139, 1552.0] + - - [13600, 3, 2, 256, 13600, 13600, 13600, 256] + - [150, 1503.0] + - - [3456, 12, 2, 256, 3456, 3456, 3456, 256] + - [151, 2407.0] + - - [3800, 3, 2, 256, 3800, 3800, 3800, 256] + - [161, 817.0] + - - [3400, 12, 2, 256, 3400, 3400, 3400, 256] + - [110, 2352.0] + - - [3800, 12, 2, 256, 3800, 3800, 3800, 256] + - [141, 2617.0] + - - [55296, 3, 2, 256, 55296, 55296, 55296, 256] + - [143, 2380.0] + - - [3072, 3, 2, 256, 3072, 3072, 3072, 256] + - [149, 678.0] + - - [3072, 12, 2, 256, 3072, 3072, 3072, 256] + - [139, 2868.0] + - - [54400, 3, 2, 256, 54400, 54400, 54400, 256] + - [148, 2523.0] + - - [60800, 12, 2, 256, 60800, 60800, 60800, 256] + - [151, 5476.0] + - - [60800, 3, 2, 256, 60800, 60800, 60800, 256] + - [155, 2348.0] + - - [3220, 3, 2, 256, 3220, 3220, 3220, 256] + - [140, 981.0] + - - [3220, 12, 2, 256, 3220, 3220, 3220, 256] + - [141, 2264.0] + - - [2048, 8, 1, 2, 2048, 2048, 2048, 2] + - [138, 8.0] + - - [2048, 8, 1, 2048, 2048, 2048, 2048, 2048] + - [157, 1955.0] + - - [2560, 2, 1, 2, 2560, 2560, 2560, 2] + - [116, 5.0] + - - [2560, 2, 1, 2560, 2560, 2560, 2560, 2560] + - [149, 605.0] + - - [2560, 27, 1, 29000, 2560, 2560, 2560, 29000] + - [164, 5531.0] + - - [1909283, 11, 1, 11, 1909283, 1909283, 1909283, 11] + - [150, 2368.0] + - - [3818566, 11, 1, 11, 3818566, 3818566, 3818566, 11] + - [158, 2165.0] + - - [512, 16, 1, 500000, 512, 512, 512, 500000] + - [170, 3363.0] + - - [512, 2, 1, 500000, 512, 512, 512, 500000] + - [167, 427.0] + - - [1024, 16, 1, 500000, 1024, 1024, 1024, 500000] + - [171, 4482.0] + - - [1024, 4, 1, 500000, 1024, 1024, 1024, 500000] + - [169, 1136.0] + - - [512, 8, 1, 500000, 512, 512, 512, 500000] + - [174, 1702.0] + - - [512, 1, 1, 500000, 512, 512, 512, 500000] + - [167, 216.0] + - - [512, 4, 1, 500000, 512, 512, 512, 500000] + - [167, 860.0] + - - [1024, 1, 1, 500000, 1024, 1024, 1024, 500000] + - [172, 285.0] + - - [1024, 2, 1, 500000, 1024, 1024, 1024, 500000] + - [166, 571.0] + - - [1024, 8, 1, 500000, 1024, 1024, 1024, 500000] + - [173, 2264.0] + - - [49, 512, 1, 4608, 49, 49, 49, 4608] + - [168, 3911.0] + - - [448, 64, 1, 1280, 448, 448, 448, 1280] + - [116, 3175.0] + - - [64, 1024, 1, 1280, 64, 64, 64, 1280] + - [106, 5691.0] + - - [64, 704, 1, 1280, 64, 64, 64, 1280] + - [105, 4149.0] + - - [256, 128, 1, 256, 256, 256, 256, 256] + - [86, 1559.0] + - - [64, 1024, 1, 3328, 64, 64, 64, 3328] + - [94, 5396.0] + - - [1024, 64, 1, 1280, 1024, 1024, 1024, 1280] + - [136, 4911.0] + - - [256, 256, 1, 3328, 256, 256, 256, 3328] + - [115, 5333.0] + - - [64, 448, 1, 1280, 64, 64, 64, 1280] + - [97, 3475.0] + - - [64, 64, 1, 3328, 64, 64, 64, 3328] + - [109, 621.0] + - - [704, 64, 1, 3328, 704, 704, 704, 3328] + - [114, 4181.0] + - - [64, 128, 1, 256, 64, 64, 64, 256] + - [103, 402.0] + - - [704, 64, 1, 1280, 704, 704, 704, 1280] + - [127, 3591.0] + - - [128, 448, 1, 256, 128, 128, 128, 256] + - [105, 2238.0] + - - [448, 64, 1, 3328, 448, 448, 448, 3328] + - [97, 3516.0] + - - [64, 128, 1, 3328, 64, 64, 64, 3328] + - [109, 1243.0] + - - [128, 128, 1, 3328, 128, 128, 128, 3328] + - [116, 2406.0] + - - [256, 256, 1, 256, 256, 256, 256, 256] + - [93, 2376.0] + - - [128, 64, 1, 1280, 128, 128, 128, 1280] + - [109, 952.0] + - - [64, 1024, 1, 256, 64, 64, 64, 256] + - [106, 2330.0] + - - [64, 704, 1, 256, 64, 64, 64, 256] + - [105, 1814.0] + - - [1, 1, 1, 1280, 1, 1, 1, 1280] + - [90, 0.08] + - - [256, 64, 1, 3328, 256, 256, 256, 3328] + - [97, 2365.0] + - - [448, 128, 1, 256, 448, 448, 448, 256] + - [114, 2191.0] + - - [64, 704, 1, 3328, 64, 64, 64, 3328] + - [105, 4101.0] + - - [64, 448, 1, 3328, 64, 64, 64, 3328] + - [130, 3506.0] + - - [448, 128, 1, 3328, 448, 448, 448, 3328] + - [115, 4650.0] + - - [128, 256, 1, 1280, 128, 128, 128, 1280] + - [135, 2937.0] + - - [64, 448, 1, 256, 64, 64, 64, 256] + - [105, 1292.0] + - - [64, 256, 1, 1280, 64, 64, 64, 1280] + - [97, 1836.0] + - - [64, 128, 1, 1280, 64, 64, 64, 1280] + - [109, 958.0] + - - [64, 64, 1, 256, 64, 64, 64, 256] + - [123, 191.0] + - - [256, 128, 1, 1280, 256, 256, 256, 1280] + - [135, 2893.0] + - - [128, 256, 1, 3328, 128, 128, 128, 3328] + - [135, 3489.0] + - - [256, 64, 1, 256, 256, 256, 256, 256] + - [103, 744.0] + - - [128, 128, 1, 1280, 128, 128, 128, 1280] + - [97, 1808.0] + - - [128, 256, 1, 256, 128, 128, 128, 256] + - [123, 1451.0] + - - [256, 64, 1, 1280, 256, 256, 256, 1280] + - [97, 1799.0] + - - [704, 64, 1, 256, 704, 704, 704, 256] + - [135, 1819.0] + - - [128, 448, 1, 1280, 128, 128, 128, 1280] + - [124, 4333.0] + - - [64, 64, 1, 1280, 64, 64, 64, 1280] + - [108, 479.0] + - - [128, 64, 1, 3328, 128, 128, 128, 3328] + - [116, 1238.0] + - - [448, 64, 1, 256, 448, 448, 448, 256] + - [135, 1288.0] + - - [1024, 64, 1, 256, 1024, 1024, 1024, 256] + - [105, 2317.0] + - - [1, 1, 1, 1, 1, 1, 1, 1] + - [132, 0.00023696682450230245] + - - [448, 128, 1, 1280, 448, 448, 448, 1280] + - [106, 4166.0] + - - [1024, 64, 1, 3328, 1024, 1024, 1024, 3328] + - [128, 5030.0] + - - [128, 64, 1, 256, 128, 128, 128, 256] + - [103, 374.0] + - - [64, 256, 1, 3328, 64, 64, 64, 3328] + - [116, 2342.0] + - - [256, 256, 1, 1280, 256, 256, 256, 1280] + - [106, 4718.0] + - - [256, 128, 1, 3328, 256, 256, 256, 3328] + - [135, 3464.0] + - - [64, 256, 1, 256, 64, 64, 64, 256] + - [105, 768.0] + - - [1, 1, 1, 256, 1, 1, 1, 256] + - [112, 0.06] + - - [128, 448, 1, 3328, 128, 128, 128, 3328] + - [94, 4766.0] + - - [128, 128, 1, 256, 128, 128, 128, 256] + - [97, 738.0] + - - [1024, 16, 1, 512, 1024, 1024, 1024, 512] + - [97, 1140.0] + - - [512, 16, 1, 512, 512, 512, 512, 512] + - [137, 892.0] + - - [128, 1, 1, 1408, 128, 128, 128, 1408] + - [96, 13.0] + - - [64, 1, 1, 1216, 64, 64, 64, 1216] + - [96, 7.0] + - - [1024, 2, 1, 512, 1024, 1024, 1024, 512] + - [116, 139.0] + - - [512, 1, 1, 512, 512, 512, 512, 512] + - [109, 35.0] + - - [1024, 4, 1, 512, 1024, 1024, 1024, 512] + - [97, 277.0] + - - [512, 4, 1, 512, 512, 512, 512, 512] + - [116, 145.0] + - - [1024, 32, 1, 512, 1024, 1024, 1024, 512] + - [98, 1978.0] + - - [512, 2, 1, 512, 512, 512, 512, 512] + - [109, 70.0] + - - [1024, 1, 1, 512, 1024, 1024, 1024, 512] + - [97, 70.0] + - - [512, 32, 1, 512, 512, 512, 512, 512] + - [137, 1165.0] + - - [128, 1, 1, 1024, 128, 128, 128, 1024] + - [96, 13.0] + - - [64, 14, 1, 14, 64, 64, 64, 14] + - [86, 3.0] + - - [64, 14, 1, 15, 64, 64, 64, 15] + - [103, 4.0] + - - [64, 15, 1, 15, 64, 64, 64, 15] + - [98, 4.0] + - - [64, 15, 1, 17, 64, 64, 64, 17] + - [86, 4.0] + - - [64, 17, 1, 17, 64, 64, 64, 17] + - [99, 5.0] + - - [64, 17, 1, 21, 64, 64, 64, 21] + - [86, 5.0] + - - [64, 21, 1, 21, 64, 64, 64, 21] + - [91, 7.0] + - - [64, 24, 1, 24, 64, 64, 64, 24] + - [86, 9.0] + - - [64, 24, 1, 34, 64, 64, 64, 34] + - [86, 12.0] + - - [64, 30, 1, 30, 64, 64, 64, 30] + - [86, 13.0] + - - [64, 31, 1, 30, 64, 64, 64, 30] + - [86, 14.0] + - - [64, 31, 1, 31, 64, 64, 64, 31] + - [125, 15.0] + - - [64, 32, 1, 32, 64, 64, 64, 32] + - [95, 17.0] + - - [64, 34, 1, 34, 64, 64, 64, 34] + - [112, 18.0] + - - [64, 35, 1, 32, 64, 64, 64, 32] + - [88, 17.0] + - - [64, 35, 1, 35, 64, 64, 64, 35] + - [114, 19.0] + - - [64, 512, 1, 512, 64, 64, 64, 512] + - [123, 2118.0] + - - [1024, 4, 1, 2, 1024, 1024, 1024, 2] + - [86, 2.0] + - - [1024, 4, 1, 1024, 1024, 1024, 1024, 1024] + - [109, 414.0] + - - [1024, 32, 1, 2, 1024, 1024, 1024, 2] + - [87, 17.0] + - - [1024, 32, 1, 1024, 1024, 1024, 1024, 1024] + - [97, 2593.0] + - - [32, 200, 1, 1, 32, 32, 32, 1] + - [86, 2.0] + - - [64, 3, 512, 3, 64, 64, 64, 3] + - [131, 82.0] + - - [64, 5, 512, 5, 64, 64, 64, 5] + - [86, 138.0] + - - [64, 5, 960, 5, 64, 64, 64, 5] + - [109, 224.0] + - - [64, 9, 512, 9, 64, 64, 64, 9] + - [125, 542.0] + - - [64, 512, 1, 1, 64, 64, 64, 1] + - [88, 8.0] + - - [67, 512, 1, 2048, 67, 67, 67, 2048] + - [135, 2894.0] + - - [74, 512, 1, 2048, 74, 74, 74, 2048] + - [135, 3228.0] + - - [74, 960, 1, 2048, 74, 74, 74, 2048] + - [106, 4333.0] + - - [100, 512, 1, 2048, 100, 100, 100, 2048] + - [106, 4161.0] + - - [128, 27, 32768, 27, 128, 128, 128, 27] + - [131, 4019.0] + - - [64, 14, 10880, 14, 64, 64, 64, 14] + - [111, 2858.0] + - - [64, 14, 10880, 15, 64, 64, 64, 15] + - [111, 2902.0] + - - [64, 15, 7680, 15, 64, 64, 64, 15] + - [111, 3055.0] + - - [64, 15, 10880, 15, 64, 64, 64, 15] + - [126, 3154.0] + - - [64, 15, 7680, 17, 64, 64, 64, 17] + - [119, 3360.0] + - - [64, 17, 6144, 17, 64, 64, 64, 17] + - [87, 3336.0] + - - [64, 17, 7680, 17, 64, 64, 64, 17] + - [87, 3385.0] + - - [64, 17, 6144, 21, 64, 64, 64, 21] + - [126, 3968.0] + - - [64, 21, 6144, 21, 64, 64, 64, 21] + - [100, 4924.0] + - - [64, 24, 4736, 24, 64, 64, 64, 24] + - [111, 6024.0] + - - [64, 24, 4736, 34, 64, 64, 64, 34] + - [111, 7148.0] + - - [64, 30, 2048, 30, 64, 64, 64, 30] + - [128, 6568.0] + - - [64, 31, 2048, 30, 64, 64, 64, 30] + - [128, 6787.0] + - - [64, 31, 2048, 31, 64, 64, 64, 31] + - [128, 6861.0] + - - [64, 27, 1920, 27, 64, 64, 64, 27] + - [128, 5627.0] + - - [1024, 8, 1, 1024, 1024, 1024, 1024, 1024] + - [97, 966.0] + - - [1024, 77, 1, 1024, 1024, 1024, 1024, 1024] + - [94, 4923.0] + - - [1024, 10, 1, 2, 1024, 1024, 1024, 2] + - [91, 12.0] + - - [1024, 10, 1, 1024, 1024, 1024, 1024, 1024] + - [109, 986.0] + - - [1024, 39, 1, 2, 1024, 1024, 1024, 2] + - [86, 44.0] + - - [1024, 39, 1, 1024, 1024, 1024, 1024, 1024] + - [127, 2904.0] + - - [1024, 40, 1, 2, 1024, 1024, 1024, 2] + - [86, 46.0] + - - [1024, 40, 1, 1024, 1024, 1024, 1024, 1024] + - [114, 2992.0] + - - [1024, 41, 1, 2, 1024, 1024, 1024, 2] + - [90, 21.0] + - - [1024, 41, 1, 1024, 1024, 1024, 1024, 1024] + - [93, 3023.0] + - - [1024, 5, 1, 2, 1024, 1024, 1024, 2] + - [86, 6.0] + - - [1024, 5, 1, 1024, 1024, 1024, 1024, 1024] + - [97, 485.0] + - - [1024, 6, 1, 2, 1024, 1024, 1024, 2] + - [86, 3.0] + - - [1024, 6, 1, 1024, 1024, 1024, 1024, 1024] + - [109, 595.0] + - - [1024, 8, 1, 2, 1024, 1024, 1024, 2] + - [86, 4.0] + - - [1024, 9, 1, 2, 1024, 1024, 1024, 2] + - [116, 5.0] + - - [1024, 9, 1, 1024, 1024, 1024, 1024, 1024] + - [125, 892.0] + - - [64, 4, 32768, 4, 64, 64, 64, 4] + - [111, 326.0] + - - [64, 4, 38400, 4, 64, 64, 64, 4] + - [89, 331.0] + - - [128, 128, 1, 64, 128, 128, 128, 64] + - [93, 541.0] + - - [64, 128, 1, 128, 64, 64, 64, 128] + - [105, 444.0] + - - [64, 5, 1, 5, 64, 64, 64, 5] + - [98, 0.41] + - - [32, 33, 1, 33, 32, 32, 32, 33] + - [86, 8.0] + - - [1024, 16, 1, 2, 1024, 1024, 1024, 2] + - [98, 9.0] + - - [1024, 16, 1, 1024, 1024, 1024, 1024, 1024] + - [109, 1592.0] + - - [1024, 1, 1, 2, 1024, 1024, 1024, 2] + - [90, 1.0] + - - [1024, 1, 1, 1024, 1024, 1024, 1024, 1024] + - [137, 99.0] + - - [1024, 1, 1, 200, 1024, 1024, 1024, 200] + - [123, 37.0] + - - [1024, 1, 1, 1600, 1024, 1024, 1024, 1600] + - [97, 116.0] + - - [1024, 64, 1, 2, 1024, 1024, 1024, 2] + - [101, 35.0] + - - [1024, 64, 1, 1024, 1024, 1024, 1024, 1024] + - [115, 4137.0] + - - [1024, 80, 1, 1024, 1024, 1024, 1024, 1024] + - [94, 4420.0] + - - [1024, 80, 1, 2, 1024, 1024, 1024, 2] + - [98, 42.0] + - - [1024, 82, 1, 1024, 1024, 1024, 1024, 1024] + - [115, 4540.0] + - - [1024, 82, 1, 2, 1024, 1024, 1024, 2] + - [98, 43.0] + - - [1024, 12, 1, 1024, 1024, 1024, 1024, 1024] + - [109, 1224.0] + - - [1024, 12, 1, 2, 1024, 1024, 1024, 2] + - [107, 7.0] + - - [64, 24, 6816, 24, 64, 64, 64, 24] + - [100, 5729.0] + - - [64, 26, 6272, 26, 64, 64, 64, 26] + - [131, 7287.0] + - - [196, 256, 1, 2304, 196, 196, 196, 2304] + - [106, 4566.0] + - - [768, 3, 2, 256, 768, 768, 768, 256] + - [105, 322.0] + - - [768, 12, 2, 256, 768, 768, 768, 256] + - [135, 786.0] + - - [864, 12, 2, 256, 864, 864, 864, 256] + - [135, 865.0] + - - [864, 3, 2, 256, 864, 864, 864, 256] + - [133, 214.0] + - - [216, 3, 2, 256, 216, 216, 216, 256] + - [135, 58.0] + - - [176, 12, 2, 256, 176, 176, 176, 256] + - [121, 195.0] + - - [176, 3, 2, 256, 176, 176, 176, 256] + - [121, 46.0] + - - [192, 12, 2, 256, 192, 192, 192, 256] + - [121, 201.0] + - - [192, 3, 2, 256, 192, 192, 192, 256] + - [135, 53.0] + - - [216, 12, 2, 256, 216, 216, 216, 256] + - [103, 230.0] + - - [850, 3, 2, 256, 850, 850, 850, 256] + - [105, 210.0] + - - [850, 12, 2, 256, 850, 850, 850, 256] + - [97, 837.0] + - - [805, 12, 2, 256, 805, 805, 805, 256] + - [123, 808.0] + - - [805, 3, 2, 256, 805, 805, 805, 256] + - [120, 236.0] + - - [247, 3, 2, 256, 247, 247, 247, 256] + - [129, 99.0] + - - [950, 3, 2, 256, 950, 950, 950, 256] + - [103, 372.0] + - - [187, 12, 2, 256, 187, 187, 187, 256] + - [121, 326.0] + - - [247, 12, 2, 256, 247, 247, 247, 256] + - [133, 253.0] + - - [187, 3, 2, 256, 187, 187, 187, 256] + - [133, 48.0] + - - [228, 12, 2, 256, 228, 228, 228, 256] + - [121, 240.0] + - - [221, 12, 2, 256, 221, 221, 221, 256] + - [122, 262.0] + - - [950, 12, 2, 256, 950, 950, 950, 256] + - [135, 915.0] + - - [228, 3, 2, 256, 228, 228, 228, 256] + - [121, 60.0] + - - [221, 3, 2, 256, 221, 221, 221, 256] + - [137, 91.0] + - - [25, 128, 120, 256, 25, 25, 25, 256] + - [122, 4201.0] + - - [25, 128, 139, 256, 25, 25, 25, 256] + - [105, 4004.0] + - - [25, 128, 160, 256, 25, 25, 25, 256] + - [114, 4011.0] + - - [25, 128, 18, 256, 25, 25, 25, 256] + - [118, 2371.0] + - - [25, 128, 19, 256, 25, 25, 25, 256] + - [105, 1871.0] + - - [9, 128, 120, 256, 9, 9, 9, 256] + - [133, 1629.0] + - - [9, 128, 139, 256, 9, 9, 9, 256] + - [121, 1804.0] + - - [9, 128, 160, 256, 9, 9, 9, 256] + - [103, 1819.0] + - - [9, 128, 18, 256, 9, 9, 9, 256] + - [133, 769.0] + - - [9, 128, 19, 256, 9, 9, 9, 256] + - [133, 805.0] + - - [100, 512, 1, 2304, 100, 100, 100, 2304] + - [106, 4216.0] + - - [25, 256, 1, 1152, 25, 25, 25, 1152] + - [109, 674.0] + - - [9, 256, 1, 1152, 9, 9, 9, 1152] + - [137, 250.0] + - - [1024, 20, 1, 1024, 1024, 1024, 1024, 1024] + - [116, 1589.0] + - - [1024, 20, 1, 2, 1024, 1024, 1024, 2] + - [108, 22.0] +- null +- null +- DeviceEfficiency +... diff --git a/library/src/blas3/Tensile/Logic/asm_full/navi22_Cijk_Ailk_Bljk_HBH_GB.yaml b/library/src/blas3/Tensile/Logic/asm_full/navi22_Cijk_Ailk_Bljk_HBH_GB.yaml new file mode 100644 index 000000000..93c85fb48 --- /dev/null +++ b/library/src/blas3/Tensile/Logic/asm_full/navi22_Cijk_Ailk_Bljk_HBH_GB.yaml @@ -0,0 +1,46330 @@ +--- +- {MinimumRequiredVersion: 4.28.0} +- navi22 +- gfx1031 +- [Device 73df] +- AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] +- - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 0 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_GB_MT128x64x8_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 128 + LVCA: 16 + LVCB: 1 + LVPA: 1 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 1 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_GB_MT128x128x8_SN_SU0_SUM0_TT8_16_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 8 + LSPB: 64 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 2 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_GB_MT128x64x16_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 8 + LSPB: 64 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 3 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_GB_MT128x128x16_SN_SU0_SUM0_TT8_16_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 16 + LSPB: 128 + LVCA: 16 + LVCB: 2 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 4 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_GB_MT128x128x16_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 16 + LSPB: 128 + LVCA: 16 + LVCB: 2 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 256 + MacroTileA: 128 + MacroTileB: 256 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 5 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_GB_MT128x256x16_SN_SU0_SUM0_TT8_16_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 8 + LSPB: 32 + LVCA: 16 + LVCB: 4 + LVPA: 1 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 6 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_GB_MT128x128x32_SN_SU0_SUM0_TT8_16_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 7 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_GB_MT128x64x8_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 128 + LVCA: 16 + LVCB: 1 + LVPA: 1 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 8 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_GB_MT128x128x8_SN_SU32_SUM3_TT8_16_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 8 + LSPB: 64 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 9 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_GB_MT128x64x16_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 8 + LSPB: 64 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 10 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_GB_MT128x128x16_SN_SU32_SUM3_TT8_16_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 16 + LSPB: 128 + LVCA: 16 + LVCB: 2 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 256 + MacroTileA: 128 + MacroTileB: 256 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 11 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_GB_MT128x256x16_SN_SU32_SUM3_TT8_16_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 8 + LSPB: 32 + LVCA: 16 + LVCB: 4 + LVPA: 1 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 12 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_GB_MT128x128x32_SN_SU32_SUM3_TT8_16_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 13 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_GB_MT128x64x8_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 128 + LVCA: 16 + LVCB: 1 + LVPA: 1 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 14 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_GB_MT128x128x8_SN_SU0_SUM0_TT8_16_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 8 + LSPB: 64 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 15 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_GB_MT128x64x16_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 8 + LSPB: 64 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 16 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_GB_MT128x128x16_SN_SU0_SUM0_TT8_16_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 16 + LSPB: 128 + LVCA: 16 + LVCB: 2 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 17 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_GB_MT128x128x16_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 16 + LSPB: 128 + LVCA: 16 + LVCB: 2 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 256 + MacroTileA: 128 + MacroTileB: 256 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 18 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_GB_MT128x256x16_SN_SU0_SUM0_TT8_16_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 19 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_GB_MT128x64x8_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 128 + LVCA: 16 + LVCB: 1 + LVPA: 1 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 20 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_GB_MT128x128x8_SN_SU32_SUM3_TT8_16_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 8 + LSPB: 64 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 21 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_GB_MT128x64x16_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 8 + LSPB: 64 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 22 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_GB_MT128x128x16_SN_SU32_SUM3_TT8_16_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 16 + LSPB: 128 + LVCA: 16 + LVCB: 2 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 23 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_GB_MT128x128x16_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 16 + LSPB: 128 + LVCA: 16 + LVCB: 2 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 256 + MacroTileA: 128 + MacroTileB: 256 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 24 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_GB_MT128x256x16_SN_SU32_SUM3_TT8_16_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 25 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_GB_MT128x64x8_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 128 + LVCA: 16 + LVCB: 1 + LVPA: 1 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 26 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_GB_MT128x128x8_SN_SU0_SUM0_TT8_16_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 8 + LSPB: 64 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 27 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_GB_MT128x64x16_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 8 + LSPB: 64 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 28 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_GB_MT128x128x16_SN_SU0_SUM0_TT8_16_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 16 + LSPB: 128 + LVCA: 16 + LVCB: 2 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 29 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_GB_MT128x128x16_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 16 + LSPB: 128 + LVCA: 16 + LVCB: 2 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 256 + MacroTileA: 128 + MacroTileB: 256 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 30 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_GB_MT128x256x16_SN_SU0_SUM0_TT8_16_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 16 + LSPB: 64 + LVCA: 16 + LVCB: 4 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 31 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_GB_MT128x128x32_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 32 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_GB_MT128x64x8_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 8 + LSPB: 64 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 33 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_GB_MT128x128x16_SN_SU32_SUM3_TT8_16_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 16 + LSPB: 128 + LVCA: 16 + LVCB: 2 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 34 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_GB_MT128x128x16_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 16 + LSPB: 128 + LVCA: 16 + LVCB: 2 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 256 + MacroTileA: 128 + MacroTileB: 256 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 35 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_GB_MT128x256x16_SN_SU32_SUM3_TT8_16_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 16 + LSPB: 64 + LVCA: 16 + LVCB: 4 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 36 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_GB_MT128x128x32_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 8 + LSPB: 64 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 37 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_GB_MT128x64x16_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 16 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 8 + LSPB: 64 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 38 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_GB_MT128x128x16_SN_SU0_SUM0_TT8_16_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 16 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 8 + LSPB: 64 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 39 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_GB_MT128x128x16_SN_SU32_SUM3_TT8_16_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 16 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 16 + LSPB: 128 + LVCA: 16 + LVCB: 2 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 256 + MacroTileA: 128 + MacroTileB: 256 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 40 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_GB_MT128x256x16_SN_SU0_SUM0_TT8_16_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 16 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 8 + LSPB: 64 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 41 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_GB_MT128x128x16_SN_SU32_SUM3_TT8_16_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 16 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 8 + LSPB: 64 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 42 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_GB_MT128x64x16_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 16 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 8 + LSPB: 64 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 43 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_GB_MT128x128x16_SN_SU0_SUM0_TT8_16_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 16 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 8 + LSPB: 64 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 44 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_GB_MT128x64x16_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 16 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 8 + LSPB: 64 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 45 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_GB_MT128x128x16_SN_SU32_SUM3_TT8_16_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 16 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 8 + LVCB: 1 + LVPA: 1 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 46 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_GB_MT64x64x8_SN_SU0_SUM0_TT8_8_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 47 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_GB_MT128x64x8_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 8 + LSPB: 32 + LVCA: 8 + LVCB: 2 + LVPA: 1 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 48 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_GB_MT64x64x16_SN_SU0_SUM0_TT8_8_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 8 + LSPB: 64 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 49 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_GB_MT128x64x16_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 16 + LSPB: 128 + LVCA: 16 + LVCB: 2 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 50 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_GB_MT128x128x16_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 8 + LVCB: 1 + LVPA: 1 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 51 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_GB_MT64x64x8_SN_SU32_SUM3_TT8_8_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 52 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_GB_MT128x64x8_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 8 + LSPB: 32 + LVCA: 8 + LVCB: 2 + LVPA: 1 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 53 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_GB_MT64x64x16_SN_SU32_SUM3_TT8_8_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 8 + LSPB: 64 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 54 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_GB_MT128x64x16_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 16 + LSPB: 128 + LVCA: 16 + LVCB: 2 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 55 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_GB_MT128x128x16_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 8 + LSPB: 16 + LVCA: 8 + LVCB: 4 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 56 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_GB_MT64x64x32_SN_SU32_SUM3_TT8_8_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 8 + LVCB: 1 + LVPA: 1 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 57 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_GB_MT64x64x8_SN_SU0_SUM0_TT8_8_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 58 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_GB_MT128x64x8_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 8 + LSPB: 32 + LVCA: 8 + LVCB: 2 + LVPA: 1 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 59 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_GB_MT64x64x16_SN_SU0_SUM0_TT8_8_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 8 + LSPB: 64 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 60 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_GB_MT128x64x16_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 16 + LSPB: 128 + LVCA: 16 + LVCB: 2 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 61 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_GB_MT128x128x16_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 8 + LSPB: 32 + LVCA: 16 + LVCB: 4 + LVPA: 1 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 62 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_GB_MT128x64x32_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 8 + LVCB: 1 + LVPA: 1 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 63 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_GB_MT64x64x8_SN_SU32_SUM3_TT8_8_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 64 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_GB_MT128x64x8_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 8 + LSPB: 32 + LVCA: 8 + LVCB: 2 + LVPA: 1 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 65 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_GB_MT64x64x16_SN_SU32_SUM3_TT8_8_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 8 + LSPB: 64 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 66 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_GB_MT128x64x16_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 16 + LSPB: 128 + LVCA: 16 + LVCB: 2 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 67 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_GB_MT128x128x16_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 8 + LSPB: 16 + LVCA: 8 + LVCB: 4 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 68 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_GB_MT64x64x32_SN_SU32_SUM3_TT8_8_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 8 + LVCB: 1 + LVPA: 1 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 69 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_GB_MT64x64x8_SN_SU0_SUM0_TT8_8_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 70 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_GB_MT128x64x8_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 8 + LSPB: 32 + LVCA: 8 + LVCB: 2 + LVPA: 1 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 71 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_GB_MT64x64x16_SN_SU0_SUM0_TT8_8_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 8 + LSPB: 64 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 72 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_GB_MT128x64x16_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 16 + LSPB: 128 + LVCA: 16 + LVCB: 2 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 73 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_GB_MT128x128x16_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 8 + LSPB: 16 + LVCA: 8 + LVCB: 4 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 74 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_GB_MT64x64x32_SN_SU0_SUM0_TT8_8_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 8 + LVCB: 1 + LVPA: 1 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 75 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_GB_MT64x64x8_SN_SU32_SUM3_TT8_8_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 76 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_GB_MT128x64x8_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 8 + LSPB: 32 + LVCA: 8 + LVCB: 2 + LVPA: 1 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 77 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_GB_MT64x64x16_SN_SU32_SUM3_TT8_8_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 8 + LSPB: 64 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 78 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_GB_MT128x64x16_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 16 + LSPB: 128 + LVCA: 16 + LVCB: 2 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 79 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_GB_MT128x128x16_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 8 + LSPB: 16 + LVCA: 8 + LVCB: 4 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 80 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_GB_MT64x64x32_SN_SU32_SUM3_TT8_8_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 16 + LSPB: 64 + LVCA: 16 + LVCB: 4 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 81 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_GB_MT128x128x32_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 8 + LSPB: 64 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 82 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_GB_MT128x64x16_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 16 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 16 + LSPB: 128 + LVCA: 16 + LVCB: 2 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 83 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_GB_MT128x128x16_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 16 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 16 + LSPB: 128 + LVCA: 16 + LVCB: 2 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 84 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_GB_MT128x128x16_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 16 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 16 + LSPB: 128 + LVCA: 16 + LVCB: 2 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 85 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_GB_MT128x128x16_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 16 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 16 + LSPB: 128 + LVCA: 16 + LVCB: 2 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 86 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_GB_MT128x128x16_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 16 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 8 + LSPB: 64 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 87 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_GB_MT128x64x16_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 16 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 16 + LSPB: 128 + LVCA: 16 + LVCB: 2 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 88 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_GB_MT128x128x16_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 16 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 16 + LSPB: 128 + LVCA: 16 + LVCB: 2 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 89 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_GB_MT128x128x16_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 16 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 8 + LSPA: 8 + LSPB: 16 + LVCA: 8 + LVCB: 4 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 819 + LdsNumElementsAlignedA: 128 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 256 + LdsOffsetB: 128 + LdsOffsetB_Blk: 384 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 90 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_GB_MT16x16x8_SN_SU0_SUM0_TT2_2_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 8 + LSPA: 4 + LSPB: 16 + LVCA: 16 + LVCB: 4 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 91 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_GB_MT32x32x8_SN_SU0_SUM0_TT4_4_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 8 + LSPA: 8 + LSPB: 16 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 896 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 92 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_GB_MT32x16x8_SN_SU0_SUM0_TT2_2_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 4 + LSPB: 32 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 93 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_GB_MT64x32x8_SN_SU0_SUM0_TT4_4_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 8 + LSPA: 8 + LSPB: 32 + LVCA: 32 + LVCB: 8 + LVPA: 8 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 94 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_GB_MT32x32x8_SN_SU0_SUM0_TT2_2_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 8 + LSPB: 16 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 95 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_GB_MT32x16x16_SN_SU0_SUM0_TT2_2_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 4 + LSPB: 16 + LVCA: 32 + LVCB: 8 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 96 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_GB_MT64x32x16_SN_SU0_SUM0_TT4_4_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 32 + LSPA: 8 + LSPB: 4 + LVCA: 8 + LVCB: 16 + LVPA: 4 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 97 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_GB_MT16x16x32_SN_SU0_SUM0_TT2_2_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 98 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_GB_MT32x16x32_SN_SU0_SUM0_TT2_2_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 8 + LSPA: 8 + LSPB: 16 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 896 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 99 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_GB_MT32x16x8_SN_SU32_SUM3_TT2_2_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 4 + LSPB: 32 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 100 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_GB_MT64x32x8_SN_SU32_SUM3_TT4_4_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 8 + LSPA: 8 + LSPB: 32 + LVCA: 32 + LVCB: 8 + LVPA: 8 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 101 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_GB_MT32x32x8_SN_SU32_SUM3_TT2_2_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 102 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_GB_MT16x16x16_SN_SU32_SUM3_TT2_2_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 4 + LSPB: 8 + LVCA: 16 + LVCB: 8 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 103 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_GB_MT32x32x16_SN_SU32_SUM3_TT4_4_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 8 + LSPB: 16 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 104 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_GB_MT32x16x16_SN_SU32_SUM3_TT2_2_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 4 + LSPB: 16 + LVCA: 32 + LVCB: 8 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 105 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_GB_MT64x32x16_SN_SU32_SUM3_TT4_4_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 106 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_GB_MT32x16x32_SN_SU32_SUM3_TT2_2_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 8 + LSPA: 8 + LSPB: 16 + LVCA: 8 + LVCB: 4 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 819 + LdsNumElementsAlignedA: 128 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 256 + LdsOffsetB: 128 + LdsOffsetB_Blk: 384 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 107 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_GB_MT16x16x8_SN_SU0_SUM0_TT2_2_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 8 + LSPA: 4 + LSPB: 16 + LVCA: 16 + LVCB: 4 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 108 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_GB_MT32x32x8_SN_SU0_SUM0_TT4_4_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 8 + LSPA: 8 + LSPB: 16 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 896 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 109 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_GB_MT32x16x8_SN_SU0_SUM0_TT2_2_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 4 + LSPB: 32 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 110 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_GB_MT64x32x8_SN_SU0_SUM0_TT4_4_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 4 + LSPB: 8 + LVCA: 16 + LVCB: 8 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 111 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_GB_MT32x32x16_SN_SU0_SUM0_TT4_4_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 8 + LSPB: 16 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 112 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_GB_MT32x16x16_SN_SU0_SUM0_TT2_2_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 4 + LSPB: 16 + LVCA: 32 + LVCB: 8 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 113 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_GB_MT64x32x16_SN_SU0_SUM0_TT4_4_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 32 + LSPA: 8 + LSPB: 4 + LVCA: 8 + LVCB: 16 + LVPA: 4 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 114 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_GB_MT16x16x32_SN_SU0_SUM0_TT2_2_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 115 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_GB_MT32x16x32_SN_SU0_SUM0_TT2_2_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 4 + LSPB: 32 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 116 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_GB_MT64x32x8_SN_SU32_SUM3_TT4_4_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 4 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 117 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_GB_MT64x64x8_SN_SU32_SUM3_TT4_4_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 118 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_GB_MT16x16x16_SN_SU32_SUM3_TT2_2_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 4 + LSPB: 8 + LVCA: 16 + LVCB: 8 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 119 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_GB_MT32x32x16_SN_SU32_SUM3_TT4_4_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 8 + LSPB: 16 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 120 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_GB_MT32x16x16_SN_SU32_SUM3_TT2_2_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 4 + LSPB: 16 + LVCA: 32 + LVCB: 8 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 121 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_GB_MT64x32x16_SN_SU32_SUM3_TT4_4_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 122 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_GB_MT32x16x32_SN_SU32_SUM3_TT2_2_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 8 + LSPA: 4 + LSPB: 16 + LVCA: 16 + LVCB: 4 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 123 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_GB_MT32x32x8_SN_SU0_SUM0_TT4_4_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 8 + LSPA: 8 + LSPB: 16 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 896 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 124 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_GB_MT32x16x8_SN_SU0_SUM0_TT2_2_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 4 + LSPB: 32 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 125 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_GB_MT64x32x8_SN_SU0_SUM0_TT4_4_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 8 + LSPA: 8 + LSPB: 32 + LVCA: 32 + LVCB: 8 + LVPA: 8 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 126 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_GB_MT32x32x8_SN_SU0_SUM0_TT2_2_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 4 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 127 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_GB_MT64x64x8_SN_SU0_SUM0_TT4_4_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 4 + LSPB: 8 + LVCA: 16 + LVCB: 8 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 128 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_GB_MT32x32x16_SN_SU0_SUM0_TT4_4_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 8 + LSPB: 16 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 129 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_GB_MT32x16x16_SN_SU0_SUM0_TT2_2_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 4 + LSPB: 16 + LVCA: 32 + LVCB: 8 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 130 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_GB_MT64x32x16_SN_SU0_SUM0_TT4_4_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 131 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_GB_MT32x16x32_SN_SU0_SUM0_TT2_2_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 8 + LSPA: 4 + LSPB: 16 + LVCA: 16 + LVCB: 4 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 132 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_GB_MT32x32x8_SN_SU32_SUM3_TT4_4_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 8 + LSPA: 8 + LSPB: 16 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 896 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 133 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_GB_MT32x16x8_SN_SU32_SUM3_TT2_2_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 4 + LSPB: 32 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 134 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_GB_MT64x32x8_SN_SU32_SUM3_TT4_4_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 8 + LSPA: 8 + LSPB: 32 + LVCA: 32 + LVCB: 8 + LVPA: 8 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 135 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_GB_MT32x32x8_SN_SU32_SUM3_TT2_2_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 136 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_GB_MT16x16x16_SN_SU32_SUM3_TT2_2_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 4 + LSPB: 8 + LVCA: 16 + LVCB: 8 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 137 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_GB_MT32x32x16_SN_SU32_SUM3_TT4_4_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 8 + LSPB: 16 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 138 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_GB_MT32x16x16_SN_SU32_SUM3_TT2_2_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 4 + LSPB: 16 + LVCA: 32 + LVCB: 8 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 139 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_GB_MT64x32x16_SN_SU32_SUM3_TT4_4_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 8 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 140 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_GB_MT32x32x16_SN_SU32_SUM3_TT2_2_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 141 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_GB_MT32x16x32_SN_SU32_SUM3_TT2_2_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 8 + LSPB: 16 + LVCA: 32 + LVCB: 16 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 142 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_GB_MT64x64x32_SN_SU32_SUM3_TT4_4_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 8 + LSPA: 4 + LSPB: 8 + LVCA: 16 + LVCB: 8 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 832 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 143 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_GB_MT32x8x8_SN_SU0_SUM0_TT2_2_WG16_4_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 2 + LSPB: 8 + LVCA: 32 + LVCB: 8 + LVPA: 1 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1600 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 8 + MacroTileA: 64 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 144 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_GB_MT64x8x8_SN_SU0_SUM0_TT4_2_WG16_4_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 4 + LSPB: 16 + LVCA: 32 + LVCB: 8 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1664 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 145 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_GB_MT64x16x8_SN_SU0_SUM0_TT4_2_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 2 + LSPB: 8 + LVCA: 32 + LVCB: 8 + LVPA: 1 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3200 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 8 + MacroTileA: 64 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 146 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_GB_MT64x8x16_SN_SU0_SUM0_TT4_2_WG16_4_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 4 + LSPB: 16 + LVCA: 32 + LVCB: 8 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3328 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 147 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_GB_MT64x16x16_SN_SU0_SUM0_TT4_2_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3200 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 8 + MacroTileA: 64 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 148 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_GB_MT64x8x16_SN_SU0_SUM0_TT2_2_WG32_4_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 32 + SubGroup1: 4 + SubGroupA: 32 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 2 + LSPB: 8 + LVCA: 64 + LVCB: 16 + LVPA: 1 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 6272 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 8 + MacroTileA: 128 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 149 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_GB_MT128x8x16_SN_SU0_SUM0_TT4_2_WG32_4_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 32 + SubGroup1: 4 + SubGroupA: 32 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 6400 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 8 + MacroTileA: 64 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 8 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 150 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_GB_MT64x8x32_SN_SU0_SUM0_TT2_2_WG32_4_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 32 + SubGroup1: 4 + SubGroupA: 32 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 2 + LSPB: 8 + LVCA: 64 + LVCB: 16 + LVPA: 1 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 12544 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 8 + MacroTileA: 128 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 16 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 151 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_GB_MT128x8x32_SN_SU0_SUM0_TT4_2_WG32_4_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 32 + SubGroup1: 4 + SubGroupA: 32 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 4 + LSPB: 16 + LVCA: 32 + LVCB: 8 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1664 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 152 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_GB_MT64x16x8_SN_SU32_SUM3_TT4_2_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 4 + LSPB: 16 + LVCA: 32 + LVCB: 8 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3328 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 153 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_GB_MT64x16x16_SN_SU32_SUM3_TT4_2_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 2 + LSPB: 8 + LVCA: 64 + LVCB: 16 + LVPA: 1 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 6272 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 8 + MacroTileA: 128 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 154 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_GB_MT128x8x16_SN_SU32_SUM3_TT4_2_WG32_4_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 32 + SubGroup1: 4 + SubGroupA: 32 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 2 + LSPB: 4 + LVCA: 32 + LVCB: 16 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 6400 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 8 + MacroTileA: 64 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 16 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 155 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_GB_MT64x8x32_SN_SU32_SUM3_TT4_2_WG16_4_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 2 + LSPB: 8 + LVCA: 64 + LVCB: 16 + LVPA: 1 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 12544 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 8 + MacroTileA: 128 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 16 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 156 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_GB_MT128x8x32_SN_SU32_SUM3_TT4_2_WG32_4_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 32 + SubGroup1: 4 + SubGroupA: 32 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 2 + LSPB: 8 + LVCA: 32 + LVCB: 8 + LVPA: 1 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1600 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 8 + MacroTileA: 64 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 157 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_GB_MT64x8x8_SN_SU0_SUM0_TT4_2_WG16_4_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 4 + LSPB: 16 + LVCA: 32 + LVCB: 8 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1664 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 158 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_GB_MT64x16x8_SN_SU0_SUM0_TT4_2_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 4 + LSPB: 8 + LVCA: 16 + LVCB: 8 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1664 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 159 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_GB_MT32x8x16_SN_SU0_SUM0_TT2_2_WG16_4_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 4 + LSPB: 16 + LVCA: 32 + LVCB: 8 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3328 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 160 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_GB_MT64x16x16_SN_SU0_SUM0_TT4_2_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3200 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 8 + MacroTileA: 64 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 161 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_GB_MT64x8x16_SN_SU0_SUM0_TT2_2_WG32_4_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 32 + SubGroup1: 4 + SubGroupA: 32 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 2 + LSPB: 4 + LVCA: 32 + LVCB: 16 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 6400 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 8 + MacroTileA: 64 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 16 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 162 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_GB_MT64x8x32_SN_SU0_SUM0_TT4_2_WG16_4_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 6400 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 8 + MacroTileA: 64 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 8 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 163 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_GB_MT64x8x32_SN_SU0_SUM0_TT2_2_WG32_4_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 32 + SubGroup1: 4 + SubGroupA: 32 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 2 + LSPB: 8 + LVCA: 64 + LVCB: 16 + LVPA: 1 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 12544 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 8 + MacroTileA: 128 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 16 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 164 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_GB_MT128x8x32_SN_SU0_SUM0_TT4_2_WG32_4_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 32 + SubGroup1: 4 + SubGroupA: 32 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 2 + LSPB: 8 + LVCA: 32 + LVCB: 8 + LVPA: 1 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1600 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 8 + MacroTileA: 64 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 165 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_GB_MT64x8x8_SN_SU32_SUM3_TT4_2_WG16_4_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 4 + LSPB: 16 + LVCA: 32 + LVCB: 8 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3328 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 166 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_GB_MT64x16x16_SN_SU32_SUM3_TT4_2_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 2 + LSPB: 8 + LVCA: 64 + LVCB: 16 + LVPA: 1 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 6272 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 8 + MacroTileA: 128 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 167 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_GB_MT128x8x16_SN_SU32_SUM3_TT4_2_WG32_4_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 32 + SubGroup1: 4 + SubGroupA: 32 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 6400 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 8 + MacroTileA: 64 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 8 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 168 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_GB_MT64x8x32_SN_SU32_SUM3_TT2_2_WG32_4_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 32 + SubGroup1: 4 + SubGroupA: 32 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 2 + LSPB: 8 + LVCA: 32 + LVCB: 8 + LVPA: 1 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1600 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 8 + MacroTileA: 64 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 169 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_GB_MT64x8x8_SN_SU0_SUM0_TT4_2_WG16_4_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 4 + LSPB: 16 + LVCA: 32 + LVCB: 8 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3328 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 170 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_GB_MT64x16x16_SN_SU0_SUM0_TT4_2_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3200 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 8 + MacroTileA: 64 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 171 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_GB_MT64x8x16_SN_SU0_SUM0_TT2_2_WG32_4_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 32 + SubGroup1: 4 + SubGroupA: 32 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 2 + LSPB: 8 + LVCA: 64 + LVCB: 16 + LVPA: 1 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 6272 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 8 + MacroTileA: 128 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 172 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_GB_MT128x8x16_SN_SU0_SUM0_TT4_2_WG32_4_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 32 + SubGroup1: 4 + SubGroupA: 32 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 2 + LSPB: 4 + LVCA: 32 + LVCB: 16 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 6400 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 8 + MacroTileA: 64 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 16 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 173 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_GB_MT64x8x32_SN_SU0_SUM0_TT4_2_WG16_4_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 2 + LSPB: 8 + LVCA: 64 + LVCB: 16 + LVPA: 1 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 12544 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 8 + MacroTileA: 128 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 16 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 174 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_GB_MT128x8x32_SN_SU0_SUM0_TT4_2_WG32_4_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 32 + SubGroup1: 4 + SubGroupA: 32 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 2 + LSPB: 8 + LVCA: 32 + LVCB: 8 + LVPA: 1 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1600 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 8 + MacroTileA: 64 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 175 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_GB_MT64x8x8_SN_SU32_SUM3_TT4_2_WG16_4_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 4 + LSPB: 16 + LVCA: 32 + LVCB: 8 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3328 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 176 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_GB_MT64x16x16_SN_SU32_SUM3_TT4_2_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 2 + LSPB: 8 + LVCA: 64 + LVCB: 16 + LVPA: 1 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 6272 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 8 + MacroTileA: 128 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 177 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_GB_MT128x8x16_SN_SU32_SUM3_TT4_2_WG32_4_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 32 + SubGroup1: 4 + SubGroupA: 32 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 2 + LSPB: 4 + LVCA: 32 + LVCB: 16 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 6400 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 8 + MacroTileA: 64 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 16 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 178 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_GB_MT64x8x32_SN_SU32_SUM3_TT4_2_WG16_4_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 6400 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 8 + MacroTileA: 64 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 8 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 179 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_GB_MT64x8x32_SN_SU32_SUM3_TT2_2_WG32_4_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 32 + SubGroup1: 4 + SubGroupA: 32 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 8 + LSPB: 16 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 180 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_GB_MT32x16x16_SN_SU0_SUM0_TT2_2_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 16 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 4 + LSPB: 32 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 181 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_GB_MT64x32x8_SN_SU32_SUM3_TT4_4_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 16 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 8 + LSPB: 16 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 182 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_GB_MT32x16x16_SN_SU32_SUM3_TT2_2_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 16 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 183 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_GB_MT32x16x32_SN_SU32_SUM3_TT2_2_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 16 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 8 + LSPB: 16 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 184 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_GB_MT32x16x16_SN_SU0_SUM0_TT2_2_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 16 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 185 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_GB_MT32x16x32_SN_SU0_SUM0_TT2_2_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 16 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 8 + LSPB: 16 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 186 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_GB_MT32x16x16_SN_SU32_SUM3_TT2_2_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 16 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 187 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_GB_MT32x16x32_SN_SU32_SUM3_TT2_2_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 16 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 8 + LSPB: 16 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 188 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_GB_MT32x16x16_SN_SU0_SUM0_TT2_2_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 16 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 189 + SolutionNameMin: Cijk_Ailk_Bljk_HBH_GB_MT32x16x32_SN_SU0_SUM0_TT2_2_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 16 + _staggerStrideShift: 1 +- [2, 3, 0, 1] +- - - [2944, 4288, 1, 1280, 2944, 2944, 2944, 1280] + - [10, 21955.0] + - - [2368, 5888, 1, 256, 2368, 2368, 2368, 256] + - [3, 20910.0] + - - [5888, 1856, 1, 3328, 5888, 5888, 5888, 3328] + - [3, 21650.0] + - - [5888, 2944, 1, 3328, 5888, 5888, 5888, 3328] + - [10, 21828.0] + - - [1856, 4288, 1, 256, 1856, 1856, 1856, 256] + - [16, 19134.0] + - - [5056, 5056, 1, 3328, 5056, 5056, 5056, 3328] + - [5, 22304.0] + - - [1408, 5888, 1, 1280, 1408, 1408, 1408, 1280] + - [3, 21916.0] + - - [1024, 3584, 1, 3328, 1024, 1024, 1024, 3328] + - [5, 21034.0] + - - [448, 3584, 1, 3328, 448, 448, 448, 3328] + - [30, 17388.0] + - - [5888, 1408, 1, 1280, 5888, 5888, 5888, 1280] + - [3, 21993.0] + - - [1024, 2368, 1, 256, 1024, 1024, 1024, 256] + - [16, 17388.0] + - - [5056, 6784, 1, 1280, 5056, 5056, 5056, 1280] + - [3, 22449.0] + - - [5056, 5056, 1, 1280, 5056, 5056, 5056, 1280] + - [5, 22143.0] + - - [4288, 6784, 1, 256, 4288, 4288, 4288, 256] + - [16, 21594.0] + - - [6784, 448, 1, 256, 6784, 6784, 6784, 256] + - [2, 17070.0] + - - [5056, 256, 1, 1280, 5056, 5056, 5056, 1280] + - [3, 20043.0] + - - [5888, 704, 1, 1280, 5888, 5888, 5888, 1280] + - [33, 19853.0] + - - [3584, 1024, 1, 256, 3584, 3584, 3584, 256] + - [3, 19573.0] + - - [6784, 4288, 1, 3328, 6784, 6784, 6784, 3328] + - [10, 22581.0] + - - [1856, 2368, 1, 3328, 1856, 1856, 1856, 3328] + - [3, 20208.0] + - - [5888, 2944, 1, 1280, 5888, 5888, 5888, 1280] + - [28, 22859.0] + - - [5888, 1024, 1, 256, 5888, 5888, 5888, 256] + - [3, 20780.0] + - - [1408, 2944, 1, 256, 1408, 1408, 1408, 256] + - [3, 20415.0] + - - [6784, 5056, 1, 3328, 6784, 6784, 6784, 3328] + - [16, 22784.0] + - - [5056, 5056, 1, 256, 5056, 5056, 5056, 256] + - [16, 21199.0] + - - [1024, 3584, 1, 1280, 1024, 1024, 1024, 1280] + - [16, 20792.0] + - - [2368, 2944, 1, 1280, 2368, 2368, 2368, 1280] + - [10, 21756.0] + - - [6784, 6784, 1, 1280, 6784, 6784, 6784, 1280] + - [3, 22990.0] + - - [1408, 4288, 1, 1280, 1408, 1408, 1408, 1280] + - [28, 21593.0] + - - [3584, 4288, 1, 1280, 3584, 3584, 3584, 1280] + - [16, 22359.0] + - - [2368, 704, 1, 1280, 2368, 2368, 2368, 1280] + - [22, 17887.0] + - - [5056, 4288, 1, 3328, 5056, 5056, 5056, 3328] + - [28, 22397.0] + - - [3584, 2368, 1, 3328, 3584, 3584, 3584, 3328] + - [28, 21665.0] + - - [5888, 6784, 1, 1280, 5888, 5888, 5888, 1280] + - [22, 22960.0] + - - [6784, 448, 1, 1280, 6784, 6784, 6784, 1280] + - [10, 18432.0] + - - [2944, 5888, 1, 256, 2944, 2944, 2944, 256] + - [3, 22080.0] + - - [4288, 2944, 1, 256, 4288, 4288, 4288, 256] + - [10, 20972.0] + - - [5056, 2368, 1, 1280, 5056, 5056, 5056, 1280] + - [16, 21899.0] + - - [448, 3584, 1, 1280, 448, 448, 448, 1280] + - [16, 17274.0] + - - [6784, 5888, 1, 256, 6784, 6784, 6784, 256] + - [10, 22399.0] + - - [1024, 1408, 1, 256, 1024, 1024, 1024, 256] + - [16, 16792.0] + - - [2368, 2368, 1, 3328, 2368, 2368, 2368, 3328] + - [3, 20590.0] + - - [5056, 704, 1, 3328, 5056, 5056, 5056, 3328] + - [3, 20217.0] + - - [1408, 1856, 1, 256, 1408, 1408, 1408, 256] + - [15, 16167.0] + - - [5888, 1856, 1, 256, 5888, 5888, 5888, 256] + - [28, 20610.0] + - - [704, 5888, 1, 256, 704, 704, 704, 256] + - [3, 18378.0] + - - [3584, 704, 1, 3328, 3584, 3584, 3584, 3328] + - [3, 18960.0] + - - [1408, 1408, 1, 256, 1408, 1408, 1408, 256] + - [28, 15950.0] + - - [448, 4288, 1, 256, 448, 448, 448, 256] + - [16, 14447.0] + - - [704, 2368, 1, 1280, 704, 704, 704, 1280] + - [16, 17360.0] + - - [1856, 2368, 1, 1280, 1856, 1856, 1856, 1280] + - [16, 19980.0] + - - [1408, 1408, 1, 3328, 1408, 1408, 1408, 3328] + - [10, 19203.0] + - - [1408, 1024, 1, 1280, 1408, 1408, 1408, 1280] + - [10, 18097.0] + - - [704, 6784, 1, 256, 704, 704, 704, 256] + - [33, 17196.0] + - - [6784, 704, 1, 256, 6784, 6784, 6784, 256] + - [3, 17694.0] + - - [5056, 704, 1, 256, 5056, 5056, 5056, 256] + - [10, 16549.0] + - - [1408, 3584, 1, 256, 1408, 1408, 1408, 256] + - [22, 19591.0] + - - [3584, 4288, 1, 3328, 3584, 3584, 3584, 3328] + - [28, 21824.0] + - - [5888, 1856, 1, 1280, 5888, 5888, 5888, 1280] + - [33, 21714.0] + - - [2368, 3584, 1, 1280, 2368, 2368, 2368, 1280] + - [16, 21749.0] + - - [2944, 3584, 1, 3328, 2944, 2944, 2944, 3328] + - [33, 22444.0] + - - [6784, 2944, 1, 256, 6784, 6784, 6784, 256] + - [28, 22094.0] + - - [1856, 2368, 1, 256, 1856, 1856, 1856, 256] + - [28, 18554.0] + - - [3584, 6784, 1, 3328, 3584, 3584, 3584, 3328] + - [16, 22890.0] + - - [5056, 4288, 1, 1280, 5056, 5056, 5056, 1280] + - [10, 22276.0] + - - [6784, 1856, 1, 3328, 6784, 6784, 6784, 3328] + - [3, 22072.0] + - - [1408, 5056, 1, 1280, 1408, 1408, 1408, 1280] + - [33, 22194.0] + - - [6784, 5888, 1, 3328, 6784, 6784, 6784, 3328] + - [16, 23079.0] + - - [2368, 5056, 1, 1280, 2368, 2368, 2368, 1280] + - [16, 21798.0] + - - [1024, 5056, 1, 1280, 1024, 1024, 1024, 1280] + - [16, 22006.0] + - - [4288, 1024, 1, 256, 4288, 4288, 4288, 256] + - [10, 18797.0] + - - [2368, 1408, 1, 256, 2368, 2368, 2368, 256] + - [28, 18246.0] + - - [5888, 448, 1, 1280, 5888, 5888, 5888, 1280] + - [10, 17536.0] + - - [704, 5888, 1, 3328, 704, 704, 704, 3328] + - [5, 20424.0] + - - [1024, 6784, 1, 1280, 1024, 1024, 1024, 1280] + - [22, 21841.0] + - - [3584, 2944, 1, 1280, 3584, 3584, 3584, 1280] + - [16, 22293.0] + - - [1408, 5056, 1, 3328, 1408, 1408, 1408, 3328] + - [3, 22461.0] + - - [1856, 1856, 1, 3328, 1856, 1856, 1856, 3328] + - [16, 19519.0] + - - [2368, 2368, 1, 256, 2368, 2368, 2368, 256] + - [3, 18399.0] + - - [4288, 4288, 1, 1280, 4288, 4288, 4288, 1280] + - [16, 22161.0] + - - [5888, 1024, 1, 1280, 5888, 5888, 5888, 1280] + - [3, 21905.0] + - - [704, 6784, 1, 3328, 704, 704, 704, 3328] + - [3, 20548.0] + - - [5888, 5888, 1, 3328, 5888, 5888, 5888, 3328] + - [3, 23065.0] + - - [5056, 1024, 1, 1280, 5056, 5056, 5056, 1280] + - [22, 21835.0] + - - [448, 5888, 1, 3328, 448, 448, 448, 3328] + - [3, 18030.0] + - - [1024, 2944, 1, 1280, 1024, 1024, 1024, 1280] + - [3, 19965.0] + - - [5056, 5888, 1, 1280, 5056, 5056, 5056, 1280] + - [3, 22684.0] + - - [448, 6784, 1, 256, 448, 448, 448, 256] + - [16, 16456.0] + - - [3584, 5888, 1, 256, 3584, 3584, 3584, 256] + - [16, 22029.0] + - - [2944, 3584, 1, 256, 2944, 2944, 2944, 256] + - [3, 21431.0] + - - [6784, 1024, 1, 3328, 6784, 6784, 6784, 3328] + - [5, 22088.0] + - - [6784, 2944, 1, 3328, 6784, 6784, 6784, 3328] + - [30, 21694.0] + - - [6784, 2368, 1, 1280, 6784, 6784, 6784, 1280] + - [3, 22017.0] + - - [4288, 3584, 1, 256, 4288, 4288, 4288, 256] + - [3, 21068.0] + - - [4288, 5888, 1, 1280, 4288, 4288, 4288, 1280] + - [3, 22423.0] + - - [4288, 1856, 1, 1280, 4288, 4288, 4288, 1280] + - [3, 21114.0] + - - [1856, 2944, 1, 3328, 1856, 1856, 1856, 3328] + - [3, 21078.0] + - - [256, 6784, 1, 3328, 256, 256, 256, 3328] + - [28, 19333.0] + - - [5056, 1024, 1, 256, 5056, 5056, 5056, 256] + - [5, 18522.0] + - - [5056, 1856, 1, 3328, 5056, 5056, 5056, 3328] + - [3, 21830.0] + - - [5056, 256, 1, 3328, 5056, 5056, 5056, 3328] + - [30, 20779.0] + - - [1024, 5888, 1, 1280, 1024, 1024, 1024, 1280] + - [16, 21854.0] + - - [5056, 3584, 1, 256, 5056, 5056, 5056, 256] + - [33, 21574.0] + - - [1856, 1024, 1, 1280, 1856, 1856, 1856, 1280] + - [10, 20470.0] + - - [1856, 1856, 1, 1280, 1856, 1856, 1856, 1280] + - [16, 19014.0] + - - [1856, 1024, 1, 3328, 1856, 1856, 1856, 3328] + - [30, 20681.0] + - - [6784, 1024, 1, 256, 6784, 6784, 6784, 256] + - [33, 20136.0] + - - [5056, 5888, 1, 3328, 5056, 5056, 5056, 3328] + - [16, 22760.0] + - - [1856, 1024, 1, 256, 1856, 1856, 1856, 256] + - [10, 16685.0] + - - [5056, 1408, 1, 3328, 5056, 5056, 5056, 3328] + - [28, 22273.0] + - - [448, 5888, 1, 256, 448, 448, 448, 256] + - [16, 15080.0] + - - [1408, 6784, 1, 3328, 1408, 1408, 1408, 3328] + - [3, 22331.0] + - - [2944, 1408, 1, 3328, 2944, 2944, 2944, 3328] + - [16, 21646.0] + - - [2944, 4288, 1, 3328, 2944, 2944, 2944, 3328] + - [16, 21982.0] + - - [5056, 2944, 1, 256, 5056, 5056, 5056, 256] + - [28, 21451.0] + - - [2368, 1856, 1, 256, 2368, 2368, 2368, 256] + - [22, 18524.0] + - - [1408, 3584, 1, 3328, 1408, 1408, 1408, 3328] + - [3, 21851.0] + - - [2368, 6784, 1, 256, 2368, 2368, 2368, 256] + - [16, 21116.0] + - - [4288, 2368, 1, 3328, 4288, 4288, 4288, 3328] + - [3, 21553.0] + - - [704, 3584, 1, 1280, 704, 704, 704, 1280] + - [3, 18181.0] + - - [1408, 5888, 1, 3328, 1408, 1408, 1408, 3328] + - [22, 22290.0] + - - [1856, 5056, 1, 256, 1856, 1856, 1856, 256] + - [16, 20310.0] + - - [6784, 6784, 1, 256, 6784, 6784, 6784, 256] + - [28, 22401.0] + - - [2368, 4288, 1, 1280, 2368, 2368, 2368, 1280] + - [3, 21417.0] + - - [3584, 1856, 1, 1280, 3584, 3584, 3584, 1280] + - [16, 21718.0] + - - [3584, 448, 1, 256, 3584, 3584, 3584, 256] + - [2, 16534.0] + - - [3584, 3584, 1, 1280, 3584, 3584, 3584, 1280] + - [16, 22500.0] + - - [256, 6784, 1, 256, 256, 256, 256, 256] + - [16, 17192.0] + - - [1856, 3584, 1, 3328, 1856, 1856, 1856, 3328] + - [33, 21966.0] + - - [3584, 3584, 1, 256, 3584, 3584, 3584, 256] + - [28, 21611.0] + - - [6784, 4288, 1, 1280, 6784, 6784, 6784, 1280] + - [3, 22519.0] + - - [3584, 5056, 1, 256, 3584, 3584, 3584, 256] + - [28, 21602.0] + - - [2944, 2368, 1, 1280, 2944, 2944, 2944, 1280] + - [22, 21801.0] + - - [6784, 3584, 1, 256, 6784, 6784, 6784, 256] + - [33, 22111.0] + - - [1856, 1408, 1, 256, 1856, 1856, 1856, 256] + - [3, 17332.0] + - - [2944, 2944, 1, 3328, 2944, 2944, 2944, 3328] + - [3, 22429.0] + - - [5056, 6784, 1, 256, 5056, 5056, 5056, 256] + - [22, 21926.0] + - - [1408, 4288, 1, 3328, 1408, 1408, 1408, 3328] + - [3, 22044.0] + - - [6784, 256, 1, 1280, 6784, 6784, 6784, 1280] + - [28, 18430.0] + - - [2368, 704, 1, 3328, 2368, 2368, 2368, 3328] + - [22, 18079.0] + - - [3584, 6784, 1, 256, 3584, 3584, 3584, 256] + - [16, 22111.0] + - - [5056, 1856, 1, 256, 5056, 5056, 5056, 256] + - [10, 20245.0] + - - [704, 4288, 1, 256, 704, 704, 704, 256] + - [16, 16800.0] + - - [1408, 6784, 1, 1280, 1408, 1408, 1408, 1280] + - [28, 22145.0] + - - [5056, 2368, 1, 3328, 5056, 5056, 5056, 3328] + - [3, 22027.0] + - - [2944, 4288, 1, 256, 2944, 2944, 2944, 256] + - [22, 21087.0] + - - [1408, 3584, 1, 1280, 1408, 1408, 1408, 1280] + - [16, 21643.0] + - - [2368, 6784, 1, 3328, 2368, 2368, 2368, 3328] + - [3, 22169.0] + - - [5056, 704, 1, 1280, 5056, 5056, 5056, 1280] + - [28, 19167.0] + - - [1856, 4288, 1, 3328, 1856, 1856, 1856, 3328] + - [5, 21216.0] + - - [1408, 5888, 1, 256, 1408, 1408, 1408, 256] + - [3, 20569.0] + - - [704, 2944, 1, 1280, 704, 704, 704, 1280] + - [16, 18836.0] + - - [3584, 704, 1, 1280, 3584, 3584, 3584, 1280] + - [16, 18354.0] + - - [5888, 5056, 1, 256, 5888, 5888, 5888, 256] + - [28, 21791.0] + - - [3584, 448, 1, 3328, 3584, 3584, 3584, 3328] + - [27, 17713.0] + - - [704, 2368, 1, 3328, 704, 704, 704, 3328] + - [5, 18364.0] + - - [448, 5056, 1, 3328, 448, 448, 448, 3328] + - [5, 19014.0] + - - [4288, 448, 1, 256, 4288, 4288, 4288, 256] + - [9, 15012.0] + - - [5888, 2368, 1, 256, 5888, 5888, 5888, 256] + - [22, 21113.0] + - - [6784, 704, 1, 3328, 6784, 6784, 6784, 3328] + - [3, 20554.0] + - - [1408, 2944, 1, 3328, 1408, 1408, 1408, 3328] + - [3, 21953.0] + - - [4288, 4288, 1, 256, 4288, 4288, 4288, 256] + - [28, 21213.0] + - - [2368, 704, 1, 256, 2368, 2368, 2368, 256] + - [9, 15771.0] + - - [3584, 2368, 1, 256, 3584, 3584, 3584, 256] + - [16, 20574.0] + - - [5888, 5056, 1, 1280, 5888, 5888, 5888, 1280] + - [3, 22667.0] + - - [3584, 3584, 1, 3328, 3584, 3584, 3584, 3328] + - [33, 22594.0] + - - [5888, 6784, 1, 256, 5888, 5888, 5888, 256] + - [3, 22397.0] + - - [4288, 2944, 1, 3328, 4288, 4288, 4288, 3328] + - [3, 22198.0] + - - [256, 5056, 1, 1280, 256, 256, 256, 1280] + - [33, 19052.0] + - - [6784, 5888, 1, 1280, 6784, 6784, 6784, 1280] + - [3, 22994.0] + - - [5888, 4288, 1, 1280, 5888, 5888, 5888, 1280] + - [3, 22464.0] + - - [1408, 1856, 1, 1280, 1408, 1408, 1408, 1280] + - [16, 19326.0] + - - [5888, 448, 1, 3328, 5888, 5888, 5888, 3328] + - [10, 18062.0] + - - [704, 5888, 1, 1280, 704, 704, 704, 1280] + - [10, 19631.0] + - - [5056, 2944, 1, 3328, 5056, 5056, 5056, 3328] + - [16, 22128.0] + - - [448, 4288, 1, 1280, 448, 448, 448, 1280] + - [16, 17541.0] + - - [3584, 704, 1, 256, 3584, 3584, 3584, 256] + - [15, 15731.0] + - - [3584, 1408, 1, 3328, 3584, 3584, 3584, 3328] + - [28, 21652.0] + - - [2368, 1024, 1, 1280, 2368, 2368, 2368, 1280] + - [10, 19614.0] + - - [2944, 6784, 1, 1280, 2944, 2944, 2944, 1280] + - [28, 22888.0] + - - [1856, 6784, 1, 256, 1856, 1856, 1856, 256] + - [3, 20855.0] + - - [4288, 448, 1, 3328, 4288, 4288, 4288, 3328] + - [10, 18252.0] + - - [6784, 704, 1, 1280, 6784, 6784, 6784, 1280] + - [16, 20001.0] + - - [5888, 1024, 1, 3328, 5888, 5888, 5888, 3328] + - [10, 21853.0] + - - [704, 6784, 1, 1280, 704, 704, 704, 1280] + - [22, 20306.0] + - - [5056, 1024, 1, 3328, 5056, 5056, 5056, 3328] + - [3, 22064.0] + - - [704, 5056, 1, 1280, 704, 704, 704, 1280] + - [16, 19572.0] + - - [2944, 1856, 1, 256, 2944, 2944, 2944, 256] + - [28, 18706.0] + - - [5888, 5056, 1, 3328, 5888, 5888, 5888, 3328] + - [3, 22768.0] + - - [3584, 6784, 1, 1280, 3584, 3584, 3584, 1280] + - [16, 22794.0] + - - [1856, 5888, 1, 256, 1856, 1856, 1856, 256] + - [10, 20595.0] + - - [4288, 4288, 1, 3328, 4288, 4288, 4288, 3328] + - [22, 22283.0] + - - [4288, 1408, 1, 1280, 4288, 4288, 4288, 1280] + - [16, 21404.0] + - - [4288, 2368, 1, 256, 4288, 4288, 4288, 256] + - [3, 20176.0] + - - [2944, 5056, 1, 1280, 2944, 2944, 2944, 1280] + - [16, 22333.0] + - - [6784, 2368, 1, 3328, 6784, 6784, 6784, 3328] + - [22, 21474.0] + - - [4288, 1856, 1, 3328, 4288, 4288, 4288, 3328] + - [10, 20992.0] + - - [1856, 2944, 1, 1280, 1856, 1856, 1856, 1280] + - [22, 20716.0] + - - [4288, 6784, 1, 3328, 4288, 4288, 4288, 3328] + - [33, 22506.0] + - - [3584, 1024, 1, 1280, 3584, 3584, 3584, 1280] + - [10, 20573.0] + - - [1024, 4288, 1, 256, 1024, 1024, 1024, 256] + - [27, 16843.0] + - - [5888, 3584, 1, 3328, 5888, 5888, 5888, 3328] + - [16, 22759.0] + - - [5056, 3584, 1, 3328, 5056, 5056, 5056, 3328] + - [28, 21656.0] + - - [2368, 1408, 1, 1280, 2368, 2368, 2368, 1280] + - [3, 19898.0] + - - [5056, 2944, 1, 1280, 5056, 5056, 5056, 1280] + - [10, 22332.0] + - - [1024, 6784, 1, 256, 1024, 1024, 1024, 256] + - [3, 20785.0] + - - [2944, 5056, 1, 3328, 2944, 2944, 2944, 3328] + - [33, 21996.0] + - - [3584, 2944, 1, 256, 3584, 3584, 3584, 256] + - [16, 21122.0] + - - [5056, 6784, 1, 3328, 5056, 5056, 5056, 3328] + - [33, 22554.0] + - - [3584, 4288, 1, 256, 3584, 3584, 3584, 256] + - [16, 20887.0] + - - [1856, 6784, 1, 3328, 1856, 1856, 1856, 3328] + - [3, 21856.0] + - - [5056, 1408, 1, 1280, 5056, 5056, 5056, 1280] + - [33, 22007.0] + - - [5888, 5888, 1, 256, 5888, 5888, 5888, 256] + - [10, 22176.0] + - - [4288, 1024, 1, 1280, 4288, 4288, 4288, 1280] + - [16, 21112.0] + - - [448, 6784, 1, 3328, 448, 448, 448, 3328] + - [3, 18693.0] + - - [2944, 1408, 1, 1280, 2944, 2944, 2944, 1280] + - [16, 21174.0] + - - [2944, 1856, 1, 3328, 2944, 2944, 2944, 3328] + - [3, 21020.0] + - - [448, 5056, 1, 256, 448, 448, 448, 256] + - [3, 14830.0] + - - [3584, 5888, 1, 1280, 3584, 3584, 3584, 1280] + - [3, 22669.0] + - - [6784, 1856, 1, 1280, 6784, 6784, 6784, 1280] + - [16, 21687.0] + - - [5888, 256, 1, 3328, 5888, 5888, 5888, 3328] + - [3, 19689.0] + - - [1856, 5888, 1, 3328, 1856, 1856, 1856, 3328] + - [3, 21687.0] + - - [3584, 1408, 1, 256, 3584, 3584, 3584, 256] + - [28, 19461.0] + - - [704, 3584, 1, 3328, 704, 704, 704, 3328] + - [3, 18691.0] + - - [5056, 448, 1, 1280, 5056, 5056, 5056, 1280] + - [3, 18033.0] + - - [3584, 1856, 1, 3328, 3584, 3584, 3584, 3328] + - [16, 21597.0] + - - [2944, 1024, 1, 256, 2944, 2944, 2944, 256] + - [10, 17766.0] + - - [2368, 4288, 1, 3328, 2368, 2368, 2368, 3328] + - [22, 21284.0] + - - [1024, 1408, 1, 1280, 1024, 1024, 1024, 1280] + - [3, 18330.0] + - - [6784, 5056, 1, 256, 6784, 6784, 6784, 256] + - [3, 21782.0] + - - [4288, 5888, 1, 256, 4288, 4288, 4288, 256] + - [22, 21604.0] + - - [2944, 6784, 1, 256, 2944, 2944, 2944, 256] + - [3, 22029.0] + - - [2368, 2368, 1, 1280, 2368, 2368, 2368, 1280] + - [10, 20374.0] + - - [1856, 3584, 1, 1280, 1856, 1856, 1856, 1280] + - [33, 21504.0] + - - [3584, 1408, 1, 1280, 3584, 3584, 3584, 1280] + - [3, 21537.0] + - - [5056, 3584, 1, 1280, 5056, 5056, 5056, 1280] + - [5, 22455.0] + - - [256, 5888, 1, 256, 256, 256, 256, 256] + - [28, 17196.0] + - - [1856, 1408, 1, 3328, 1856, 1856, 1856, 3328] + - [3, 19586.0] + - - [1024, 4288, 1, 3328, 1024, 1024, 1024, 3328] + - [5, 21497.0] + - - [2944, 2368, 1, 3328, 2944, 2944, 2944, 3328] + - [3, 21743.0] + - - [704, 4288, 1, 3328, 704, 704, 704, 3328] + - [16, 18587.0] + - - [1024, 1856, 1, 1280, 1024, 1024, 1024, 1280] + - [3, 19603.0] + - - [6784, 1856, 1, 256, 6784, 6784, 6784, 256] + - [28, 20633.0] + - - [1024, 5888, 1, 256, 1024, 1024, 1024, 256] + - [3, 20657.0] + - - [1408, 2368, 1, 256, 1408, 1408, 1408, 256] + - [28, 18491.0] + - - [2944, 704, 1, 3328, 2944, 2944, 2944, 3328] + - [10, 19588.0] + - - [2944, 2944, 1, 1280, 2944, 2944, 2944, 1280] + - [16, 22113.0] + - - [6784, 256, 1, 3328, 6784, 6784, 6784, 3328] + - [3, 19310.0] + - - [1408, 5056, 1, 256, 1408, 1408, 1408, 256] + - [33, 19479.0] + - - [1408, 4288, 1, 256, 1408, 1408, 1408, 256] + - [16, 19364.0] + - - [5888, 2368, 1, 1280, 5888, 5888, 5888, 1280] + - [3, 21901.0] + - - [2368, 5888, 1, 1280, 2368, 2368, 2368, 1280] + - [5, 21946.0] + - - [5888, 256, 1, 1280, 5888, 5888, 5888, 1280] + - [3, 19375.0] + - - [2368, 1856, 1, 3328, 2368, 2368, 2368, 3328] + - [3, 20005.0] + - - [2944, 704, 1, 256, 2944, 2944, 2944, 256] + - [13, 15451.0] + - - [2368, 6784, 1, 1280, 2368, 2368, 2368, 1280] + - [3, 21944.0] + - - [2368, 1024, 1, 3328, 2368, 2368, 2368, 3328] + - [5, 20426.0] + - - [1856, 4288, 1, 1280, 1856, 1856, 1856, 1280] + - [3, 20988.0] + - - [704, 3584, 1, 256, 704, 704, 704, 256] + - [10, 17252.0] + - - [704, 2944, 1, 3328, 704, 704, 704, 3328] + - [3, 19681.0] + - - [1856, 5056, 1, 3328, 1856, 1856, 1856, 3328] + - [3, 21443.0] + - - [3584, 5056, 1, 1280, 3584, 3584, 3584, 1280] + - [16, 22392.0] + - - [2944, 1024, 1, 3328, 2944, 2944, 2944, 3328] + - [3, 20432.0] + - - [1408, 6784, 1, 256, 1408, 1408, 1408, 256] + - [16, 20600.0] + - - [6784, 1408, 1, 3328, 6784, 6784, 6784, 3328] + - [3, 22324.0] + - - [1024, 2368, 1, 1280, 1024, 1024, 1024, 1280] + - [16, 19324.0] + - - [6784, 2944, 1, 1280, 6784, 6784, 6784, 1280] + - [3, 22699.0] + - - [3584, 448, 1, 1280, 3584, 3584, 3584, 1280] + - [2, 17506.0] + - - [2944, 6784, 1, 3328, 2944, 2944, 2944, 3328] + - [10, 22762.0] + - - [448, 5056, 1, 1280, 448, 448, 448, 1280] + - [10, 18311.0] + - - [4288, 5056, 1, 1280, 4288, 4288, 4288, 1280] + - [5, 22169.0] + - - [4288, 704, 1, 256, 4288, 4288, 4288, 256] + - [0, 16873.0] + - - [5888, 704, 1, 256, 5888, 5888, 5888, 256] + - [33, 18372.0] + - - [256, 5888, 1, 3328, 256, 256, 256, 3328] + - [10, 19681.0] + - - [6784, 4288, 1, 256, 6784, 6784, 6784, 256] + - [10, 21623.0] + - - [5888, 256, 1, 256, 5888, 5888, 5888, 256] + - [3, 17382.0] + - - [6784, 1024, 1, 1280, 6784, 6784, 6784, 1280] + - [3, 21757.0] + - - [2944, 704, 1, 1280, 2944, 2944, 2944, 1280] + - [33, 19193.0] + - - [6784, 3584, 1, 1280, 6784, 6784, 6784, 1280] + - [16, 22674.0] + - - [1408, 2944, 1, 1280, 1408, 1408, 1408, 1280] + - [3, 21509.0] + - - [1408, 2368, 1, 3328, 1408, 1408, 1408, 3328] + - [3, 20628.0] + - - [2368, 2944, 1, 256, 2368, 2368, 2368, 256] + - [10, 19699.0] + - - [3584, 1856, 1, 256, 3584, 3584, 3584, 256] + - [3, 19190.0] + - - [4288, 3584, 1, 1280, 4288, 4288, 4288, 1280] + - [28, 22265.0] + - - [4288, 2944, 1, 1280, 4288, 4288, 4288, 1280] + - [22, 22017.0] + - - [5056, 448, 1, 3328, 5056, 5056, 5056, 3328] + - [3, 18834.0] + - - [4288, 5056, 1, 3328, 4288, 4288, 4288, 3328] + - [16, 22324.0] + - - [256, 5056, 1, 3328, 256, 256, 256, 3328] + - [11, 20491.0] + - - [5056, 2368, 1, 256, 5056, 5056, 5056, 256] + - [16, 19848.0] + - - [4288, 704, 1, 3328, 4288, 4288, 4288, 3328] + - [3, 18792.0] + - - [448, 3584, 1, 256, 448, 448, 448, 256] + - [16, 14392.0] + - - [1024, 1408, 1, 3328, 1024, 1024, 1024, 3328] + - [3, 18649.0] + - - [2944, 5888, 1, 1280, 2944, 2944, 2944, 1280] + - [28, 22804.0] + - - [5888, 3584, 1, 256, 5888, 5888, 5888, 256] + - [10, 22077.0] + - - [1408, 1856, 1, 3328, 1408, 1408, 1408, 3328] + - [3, 19598.0] + - - [6784, 1408, 1, 1280, 6784, 6784, 6784, 1280] + - [16, 22195.0] + - - [704, 2944, 1, 256, 704, 704, 704, 256] + - [4, 16326.0] + - - [2944, 5888, 1, 3328, 2944, 2944, 2944, 3328] + - [5, 22644.0] + - - [1408, 1408, 1, 1280, 1408, 1408, 1408, 1280] + - [3, 18243.0] + - - [448, 4288, 1, 3328, 448, 448, 448, 3328] + - [16, 17957.0] + - - [704, 2368, 1, 256, 704, 704, 704, 256] + - [33, 14160.0] + - - [5888, 2368, 1, 3328, 5888, 5888, 5888, 3328] + - [3, 22167.0] + - - [4288, 5056, 1, 256, 4288, 4288, 4288, 256] + - [3, 21187.0] + - - [4288, 448, 1, 1280, 4288, 4288, 4288, 1280] + - [33, 17749.0] + - - [5888, 704, 1, 3328, 5888, 5888, 5888, 3328] + - [5, 20299.0] + - - [4288, 3584, 1, 3328, 4288, 4288, 4288, 3328] + - [16, 21960.0] + - - [1024, 6784, 1, 3328, 1024, 1024, 1024, 3328] + - [3, 22011.0] + - - [1408, 1024, 1, 256, 1408, 1408, 1408, 256] + - [3, 15139.0] + - - [6784, 6784, 1, 3328, 6784, 6784, 6784, 3328] + - [16, 23046.0] + - - [704, 5056, 1, 3328, 704, 704, 704, 3328] + - [5, 20185.0] + - - [3584, 5056, 1, 3328, 3584, 3584, 3584, 3328] + - [28, 21485.0] + - - [2368, 2944, 1, 3328, 2368, 2368, 2368, 3328] + - [3, 21766.0] + - - [2368, 3584, 1, 256, 2368, 2368, 2368, 256] + - [3, 19929.0] + - - [3584, 2368, 1, 1280, 3584, 3584, 3584, 1280] + - [16, 21679.0] + - - [1856, 1856, 1, 256, 1856, 1856, 1856, 256] + - [3, 17794.0] + - - [4288, 1408, 1, 3328, 4288, 4288, 4288, 3328] + - [3, 21934.0] + - - [5888, 1408, 1, 3328, 5888, 5888, 5888, 3328] + - [3, 22273.0] + - - [256, 5056, 1, 256, 256, 256, 256, 256] + - [2, 14295.0] + - - [2368, 5056, 1, 256, 2368, 2368, 2368, 256] + - [16, 20712.0] + - - [1024, 5056, 1, 256, 1024, 1024, 1024, 256] + - [22, 20015.0] + - - [2368, 1408, 1, 3328, 2368, 2368, 2368, 3328] + - [3, 20709.0] + - - [5888, 448, 1, 256, 5888, 5888, 5888, 256] + - [15, 15624.0] + - - [6784, 5056, 1, 1280, 6784, 6784, 6784, 1280] + - [3, 22578.0] + - - [4288, 6784, 1, 1280, 4288, 4288, 4288, 1280] + - [22, 22504.0] + - - [6784, 1408, 1, 256, 6784, 6784, 6784, 256] + - [28, 21252.0] + - - [5888, 4288, 1, 256, 5888, 5888, 5888, 256] + - [28, 21662.0] + - - [5056, 5888, 1, 256, 5056, 5056, 5056, 256] + - [16, 21782.0] + - - [2368, 1024, 1, 256, 2368, 2368, 2368, 256] + - [3, 17726.0] + - - [1856, 6784, 1, 1280, 1856, 1856, 1856, 1280] + - [16, 21868.0] + - - [6784, 448, 1, 3328, 6784, 6784, 6784, 3328] + - [3, 18812.0] + - - [5056, 1856, 1, 1280, 5056, 5056, 5056, 1280] + - [10, 21562.0] + - - [1408, 1024, 1, 3328, 1408, 1408, 1408, 3328] + - [10, 18784.0] + - - [5888, 3584, 1, 1280, 5888, 5888, 5888, 1280] + - [3, 22814.0] + - - [1024, 2944, 1, 256, 1024, 1024, 1024, 256] + - [10, 19009.0] + - - [448, 6784, 1, 1280, 448, 448, 448, 1280] + - [16, 18476.0] + - - [704, 5056, 1, 256, 704, 704, 704, 256] + - [24, 16383.0] + - - [3584, 1024, 1, 3328, 3584, 3584, 3584, 3328] + - [28, 20823.0] + - - [2944, 1856, 1, 1280, 2944, 2944, 2944, 1280] + - [33, 20806.0] + - - [5056, 256, 1, 256, 5056, 5056, 5056, 256] + - [10, 16958.0] + - - [2368, 3584, 1, 3328, 2368, 2368, 2368, 3328] + - [22, 21898.0] + - - [3584, 5888, 1, 3328, 3584, 3584, 3584, 3328] + - [3, 22901.0] + - - [2944, 3584, 1, 1280, 2944, 2944, 2944, 1280] + - [3, 22298.0] + - - [1856, 5888, 1, 1280, 1856, 1856, 1856, 1280] + - [10, 21759.0] + - - [4288, 1408, 1, 256, 4288, 4288, 4288, 256] + - [10, 20268.0] + - - [4288, 2368, 1, 1280, 4288, 4288, 4288, 1280] + - [3, 21395.0] + - - [2944, 5056, 1, 256, 2944, 2944, 2944, 256] + - [3, 21465.0] + - - [6784, 2368, 1, 256, 6784, 6784, 6784, 256] + - [16, 21116.0] + - - [4288, 1856, 1, 256, 4288, 4288, 4288, 256] + - [22, 19924.0] + - - [1856, 2944, 1, 256, 1856, 1856, 1856, 256] + - [3, 19422.0] + - - [1856, 1408, 1, 1280, 1856, 1856, 1856, 1280] + - [3, 19351.0] + - - [1024, 4288, 1, 1280, 1024, 1024, 1024, 1280] + - [16, 20779.0] + - - [2368, 5056, 1, 3328, 2368, 2368, 2368, 3328] + - [16, 21766.0] + - - [4288, 1024, 1, 3328, 4288, 4288, 4288, 3328] + - [16, 21315.0] + - - [1024, 5056, 1, 3328, 1024, 1024, 1024, 3328] + - [5, 22076.0] + - - [1024, 1856, 1, 3328, 1024, 1024, 1024, 3328] + - [3, 20450.0] + - - [3584, 2944, 1, 3328, 3584, 3584, 3584, 3328] + - [22, 22386.0] + - - [5888, 2944, 1, 256, 5888, 5888, 5888, 256] + - [28, 21975.0] + - - [5056, 4288, 1, 256, 5056, 5056, 5056, 256] + - [22, 21240.0] + - - [1024, 3584, 1, 256, 1024, 1024, 1024, 256] + - [10, 19614.0] + - - [5056, 1408, 1, 256, 5056, 5056, 5056, 256] + - [10, 20677.0] + - - [5888, 5888, 1, 1280, 5888, 5888, 5888, 1280] + - [16, 22888.0] + - - [448, 5888, 1, 1280, 448, 448, 448, 1280] + - [3, 17549.0] + - - [4288, 704, 1, 1280, 4288, 4288, 4288, 1280] + - [10, 18115.0] + - - [2944, 1408, 1, 256, 2944, 2944, 2944, 256] + - [16, 19329.0] + - - [2368, 5888, 1, 3328, 2368, 2368, 2368, 3328] + - [16, 21644.0] + - - [2368, 1856, 1, 1280, 2368, 2368, 2368, 1280] + - [16, 19640.0] + - - [5888, 4288, 1, 3328, 5888, 5888, 5888, 3328] + - [10, 22568.0] + - - [5056, 448, 1, 256, 5056, 5056, 5056, 256] + - [9, 16720.0] + - - [1856, 5056, 1, 1280, 1856, 1856, 1856, 1280] + - [16, 21510.0] + - - [2944, 1024, 1, 1280, 2944, 2944, 2944, 1280] + - [10, 20393.0] + - - [2368, 4288, 1, 256, 2368, 2368, 2368, 256] + - [33, 20054.0] + - - [1024, 2368, 1, 3328, 1024, 1024, 1024, 3328] + - [5, 20187.0] + - - [4288, 5888, 1, 3328, 4288, 4288, 4288, 3328] + - [3, 22560.0] + - - [1024, 2944, 1, 3328, 1024, 1024, 1024, 3328] + - [3, 20486.0] + - - [256, 6784, 1, 1280, 256, 256, 256, 1280] + - [16, 18436.0] + - - [1856, 3584, 1, 256, 1856, 1856, 1856, 256] + - [22, 18315.0] + - - [256, 5888, 1, 1280, 256, 256, 256, 1280] + - [3, 19050.0] + - - [2944, 2368, 1, 256, 2944, 2944, 2944, 256] + - [16, 19768.0] + - - [1024, 1856, 1, 256, 1024, 1024, 1024, 256] + - [22, 15407.0] + - - [6784, 3584, 1, 3328, 6784, 6784, 6784, 3328] + - [3, 22924.0] + - - [1024, 5888, 1, 3328, 1024, 1024, 1024, 3328] + - [3, 21924.0] + - - [1408, 2368, 1, 1280, 1408, 1408, 1408, 1280] + - [16, 19740.0] + - - [2944, 2944, 1, 256, 2944, 2944, 2944, 256] + - [22, 21310.0] + - - [6784, 256, 1, 256, 6784, 6784, 6784, 256] + - [16, 17435.0] + - - [5888, 1408, 1, 256, 5888, 5888, 5888, 256] + - [10, 21240.0] + - - [5888, 6784, 1, 3328, 5888, 5888, 5888, 3328] + - [3, 23087.0] + - - [704, 4288, 1, 1280, 704, 704, 704, 1280] + - [16, 18302.0] + - - [4096, 7000, 1, 4096, 4096, 4096, 4096, 4096] + - [33, 22120.0] + - - [5124, 9124, 1, 1760, 5124, 5124, 5124, 1760] + - [14, 22870.0] + - - [1024, 1500, 1, 1536, 1024, 1024, 1024, 1536] + - [16, 19184.0] + - - [512, 24000, 1, 2048, 512, 512, 512, 2048] + - [33, 22294.0] + - - [3072, 24000, 1, 1024, 3072, 3072, 3072, 1024] + - [3, 22943.0] + - - [1024, 3000, 1, 2560, 1024, 1024, 1024, 2560] + - [16, 20705.0] + - - [512, 3136, 1, 2048, 512, 512, 512, 2048] + - [33, 19978.0] + - - [8448, 1500, 1, 2816, 8448, 8448, 8448, 2816] + - [22, 22237.0] + - - [2560, 7000, 1, 2560, 2560, 2560, 2560, 2560] + - [28, 22797.0] + - - [512, 48000, 1, 2048, 512, 512, 512, 2048] + - [22, 22566.0] + - - [196, 256, 64, 1024, 196, 196, 196, 1024] + - [10, 16215.0] + - - [512, 48000, 1, 1536, 512, 512, 512, 1536] + - [16, 22887.0] + - - [4608, 1500, 1, 1536, 4608, 4608, 4608, 1536] + - [22, 21619.0] + - - [1024, 24000, 1, 2560, 1024, 1024, 1024, 2560] + - [10, 22715.0] + - - [4608, 3000, 1, 1536, 4608, 4608, 4608, 1536] + - [16, 21989.0] + - - [5124, 9124, 1, 2048, 5124, 5124, 5124, 2048] + - [33, 22188.0] + - - [5124, 700, 1, 2560, 5124, 5124, 5124, 2560] + - [3, 18816.0] + - - [6144, 6000, 1, 2560, 6144, 6144, 6144, 2560] + - [3, 23006.0] + - - [1024, 1500, 1, 2816, 1024, 1024, 1024, 2816] + - [3, 19358.0] + - - [8448, 48000, 1, 2816, 8448, 8448, 8448, 2816] + - [10, 23083.0] + - - [512, 6000, 1, 2048, 512, 512, 512, 2048] + - [22, 19630.0] + - - [4224, 1500, 1, 176, 4224, 4224, 4224, 176] + - [20, 18209.0] + - - [1024, 6000, 1, 2816, 1024, 1024, 1024, 2816] + - [3, 22337.0] + - - [1024, 48000, 1, 1536, 1024, 1024, 1024, 1536] + - [10, 22820.0] + - - [1024, 48000, 1, 2560, 1024, 1024, 1024, 2560] + - [10, 22958.0] + - - [4608, 24000, 1, 1536, 4608, 4608, 4608, 1536] + - [3, 22971.0] + - - [7680, 48000, 1, 2560, 7680, 7680, 7680, 2560] + - [22, 22988.0] + - - [3072, 48000, 1, 1024, 3072, 3072, 3072, 1024] + - [3, 23021.0] + - - [1024, 1500, 1, 2048, 1024, 1024, 1024, 2048] + - [22, 19247.0] + - - [1024, 3000, 1, 2048, 1024, 1024, 1024, 2048] + - [22, 20245.0] + - - [1024, 6000, 1, 2048, 1024, 1024, 1024, 2048] + - [22, 22220.0] + - - [512, 24000, 1, 2816, 512, 512, 512, 2816] + - [28, 22548.0] + - - [6144, 48000, 1, 2560, 6144, 6144, 6144, 2560] + - [10, 22888.0] + - - [1760, 7000, 1, 1760, 1760, 1760, 1760, 1760] + - [26, 22544.0] + - - [8448, 3000, 1, 2816, 8448, 8448, 8448, 2816] + - [22, 22337.0] + - - [4608, 48000, 1, 1536, 4608, 4608, 4608, 1536] + - [22, 22583.0] + - - [7680, 1500, 1, 2560, 7680, 7680, 7680, 2560] + - [22, 22266.0] + - - [512, 3000, 1, 1536, 512, 512, 512, 1536] + - [10, 19329.0] + - - [1024, 3000, 1, 2816, 1024, 1024, 1024, 2816] + - [16, 20579.0] + - - [5124, 9124, 1, 2560, 5124, 5124, 5124, 2560] + - [3, 22280.0] + - - [512, 48000, 1, 2816, 512, 512, 512, 2816] + - [10, 22891.0] + - - [512, 3000, 1, 2816, 512, 512, 512, 2816] + - [28, 19620.0] + - - [1024, 24000, 1, 1536, 1024, 1024, 1024, 1536] + - [10, 22655.0] + - - [7680, 6000, 1, 2560, 7680, 7680, 7680, 2560] + - [3, 22949.0] + - - [512, 6000, 1, 2560, 512, 512, 512, 2560] + - [10, 20475.0] + - - [512, 24000, 1, 2560, 512, 512, 512, 2560] + - [22, 22475.0] + - - [6144, 3000, 1, 2560, 6144, 6144, 6144, 2560] + - [22, 22337.0] + - - [1024, 24000, 1, 2816, 1024, 1024, 1024, 2816] + - [3, 22767.0] + - - [2048, 7000, 1, 2048, 2048, 2048, 2048, 2048] + - [10, 22649.0] + - - [7680, 3000, 1, 2560, 7680, 7680, 7680, 2560] + - [3, 22397.0] + - - [5124, 700, 1, 2048, 5124, 5124, 5124, 2048] + - [3, 18823.0] + - - [5124, 9124, 1, 4096, 5124, 5124, 5124, 4096] + - [33, 19906.0] + - - [256, 193600, 1, 64, 256, 256, 256, 64] + - [9, 17630.0] + - - [8448, 6000, 1, 2816, 8448, 8448, 8448, 2816] + - [3, 22999.0] + - - [5124, 1500, 1, 2560, 5124, 5124, 5124, 2560] + - [22, 21326.0] + - - [1024, 1500, 1, 2560, 1024, 1024, 1024, 2560] + - [10, 19803.0] + - - [1024, 6000, 1, 2560, 1024, 1024, 1024, 2560] + - [10, 22282.0] + - - [196, 1024, 64, 256, 196, 196, 196, 256] + - [16, 15920.0] + - - [512, 50176, 1, 128, 512, 512, 512, 128] + - [14, 21264.0] + - - [7680, 24000, 1, 2560, 7680, 7680, 7680, 2560] + - [5, 22934.0] + - - [512, 3000, 1, 2560, 512, 512, 512, 2560] + - [10, 19204.0] + - - [8448, 24000, 1, 2816, 8448, 8448, 8448, 2816] + - [22, 22761.0] + - - [512, 6000, 1, 1536, 512, 512, 512, 1536] + - [16, 20309.0] + - - [3072, 6000, 1, 1024, 3072, 3072, 3072, 1024] + - [16, 22512.0] + - - [3072, 1500, 1, 128, 3072, 3072, 3072, 128] + - [22, 17096.0] + - - [2048, 3136, 1, 512, 2048, 2048, 2048, 512] + - [3, 21071.0] + - - [1024, 3000, 1, 1536, 1024, 1024, 1024, 1536] + - [16, 20612.0] + - - [512, 6000, 1, 2816, 512, 512, 512, 2816] + - [5, 20814.0] + - - [128, 50176, 1, 512, 128, 128, 128, 512] + - [10, 21462.0] + - - [256, 12544, 1, 1024, 256, 256, 256, 1024] + - [10, 21311.0] + - - [1024, 12544, 1, 256, 1024, 1024, 1024, 256] + - [3, 21535.0] + - - [512, 48000, 1, 2560, 512, 512, 512, 2560] + - [10, 22749.0] + - - [512, 24000, 1, 1536, 512, 512, 512, 1536] + - [22, 22388.0] + - - [1024, 24000, 1, 2048, 1024, 1024, 1024, 2048] + - [10, 22605.0] + - - [5124, 1500, 1, 2048, 5124, 5124, 5124, 2048] + - [3, 21171.0] + - - [3072, 1500, 1, 1024, 3072, 3072, 3072, 1024] + - [22, 20707.0] + - - [6144, 1500, 1, 2560, 6144, 6144, 6144, 2560] + - [16, 21977.0] + - - [1024, 48000, 1, 2816, 1024, 1024, 1024, 2816] + - [10, 22942.0] + - - [1024, 6000, 1, 1536, 1024, 1024, 1024, 1536] + - [16, 21884.0] + - - [512, 3000, 1, 2048, 512, 512, 512, 2048] + - [10, 19413.0] + - - [6144, 24000, 1, 2560, 6144, 6144, 6144, 2560] + - [11, 22415.0] + - - [4608, 6000, 1, 1536, 4608, 4608, 4608, 1536] + - [33, 22850.0] + - - [3072, 3000, 1, 1024, 3072, 3072, 3072, 1024] + - [3, 21901.0] + - - [1024, 48000, 1, 2048, 1024, 1024, 1024, 2048] + - [10, 22461.0] + - - [784, 512, 64, 128, 784, 784, 784, 128] + - [14, 18154.0] + - - [3136, 256, 64, 64, 3136, 3136, 3136, 64] + - [0, 17334.0] + - - [12544, 1024, 1, 256, 12544, 12544, 12544, 256] + - [10, 21557.0] + - - [784, 128, 128, 512, 784, 784, 784, 512] + - [16, 18858.0] + - - [784, 512, 256, 128, 784, 784, 784, 128] + - [16, 18694.0] + - - [3136, 512, 1, 2048, 3136, 3136, 3136, 2048] + - [3, 20266.0] + - - [12544, 256, 1, 1024, 12544, 12544, 12544, 1024] + - [3, 21467.0] + - - [3136, 2048, 1, 512, 3136, 3136, 3136, 512] + - [10, 20812.0] + - - [3136, 256, 256, 64, 3136, 3136, 3136, 64] + - [7, 12561.0] + - - [784, 128, 64, 512, 784, 784, 784, 512] + - [10, 18625.0] + - - [784, 512, 128, 128, 784, 784, 784, 128] + - [16, 18443.0] + - - [784, 128, 256, 512, 784, 784, 784, 512] + - [3, 19185.0] + - - [3136, 256, 128, 64, 3136, 3136, 3136, 64] + - [0, 17722.0] + - - [128, 128, 512, 64, 128, 128, 128, 64] + - [25, 16368.0] + - - [512, 512, 64, 64, 512, 512, 512, 64] + - [33, 19085.0] + - - [1024, 2048, 1, 2, 1024, 1024, 1024, 2] + - [19, 738.0] + - - [1024, 2048, 1, 1024, 1024, 1024, 1024, 1024] + - [10, 19767.0] + - - [1024, 2048, 1, 4096, 1024, 1024, 1024, 4096] + - [10, 20048.0] + - - [1024, 2048, 1, 30528, 1024, 1024, 1024, 30528] + - [3, 20366.0] + - - [1024, 4096, 1, 1024, 1024, 1024, 1024, 1024] + - [10, 21663.0] + - - [1024, 4096, 1, 4096, 1024, 1024, 1024, 4096] + - [3, 21937.0] + - - [1024, 4096, 1, 30528, 1024, 1024, 1024, 30528] + - [33, 22313.0] + - - [4096, 2048, 1, 1024, 4096, 4096, 4096, 1024] + - [3, 22271.0] + - - [4096, 4096, 1, 1024, 4096, 4096, 4096, 1024] + - [3, 22615.0] + - - [256, 8976, 1, 1536, 256, 256, 256, 1536] + - [22, 19120.0] + - - [256, 8976, 1, 2048, 256, 256, 256, 2048] + - [22, 18921.0] + - - [256, 8976, 1, 2304, 256, 256, 256, 2304] + - [28, 19073.0] + - - [256, 8976, 1, 2560, 256, 256, 256, 2560] + - [10, 18980.0] + - - [256, 8976, 1, 2816, 256, 256, 256, 2816] + - [5, 19377.0] + - - [256, 8976, 1, 3072, 256, 256, 256, 3072] + - [22, 19133.0] + - - [256, 8976, 1, 4352, 256, 256, 256, 4352] + - [10, 19405.0] + - - [256, 8976, 1, 4864, 256, 256, 256, 4864] + - [16, 19517.0] + - - [256, 8976, 1, 5376, 256, 256, 256, 5376] + - [33, 19562.0] + - - [256, 8976, 1, 5632, 256, 256, 256, 5632] + - [22, 19484.0] + - - [256, 8976, 1, 5888, 256, 256, 256, 5888] + - [10, 19601.0] + - - [256, 8976, 1, 6144, 256, 256, 256, 6144] + - [10, 19576.0] + - - [256, 8976, 1, 6656, 256, 256, 256, 6656] + - [10, 19590.0] + - - [256, 8976, 1, 7168, 256, 256, 256, 7168] + - [33, 19592.0] + - - [256, 8976, 1, 7424, 256, 256, 256, 7424] + - [33, 19610.0] + - - [256, 8976, 1, 8192, 256, 256, 256, 8192] + - [22, 19498.0] + - - [256, 8976, 1, 8448, 256, 256, 256, 8448] + - [33, 19642.0] + - - [256, 8976, 1, 8960, 256, 256, 256, 8960] + - [28, 19619.0] + - - [256, 8976, 1, 9472, 256, 256, 256, 9472] + - [28, 19595.0] + - - [256, 8976, 1, 9728, 256, 256, 256, 9728] + - [22, 19658.0] + - - [256, 8976, 1, 9984, 256, 256, 256, 9984] + - [28, 19670.0] + - - [256, 8976, 1, 10240, 256, 256, 256, 10240] + - [10, 19481.0] + - - [256, 8976, 1, 10496, 256, 256, 256, 10496] + - [16, 19649.0] + - - [256, 8976, 1, 11008, 256, 256, 256, 11008] + - [16, 19648.0] + - - [256, 8976, 1, 11520, 256, 256, 256, 11520] + - [28, 19624.0] + - - [256, 8976, 1, 12288, 256, 256, 256, 12288] + - [10, 19463.0] + - - [256, 8976, 1, 14336, 256, 256, 256, 14336] + - [22, 19205.0] + - - [256, 8976, 1, 14848, 256, 256, 256, 14848] + - [22, 19712.0] + - - [256, 8976, 1, 15104, 256, 256, 256, 15104] + - [22, 19693.0] + - - [256, 8976, 1, 15872, 256, 256, 256, 15872] + - [22, 19633.0] + - - [256, 8976, 1, 17152, 256, 256, 256, 17152] + - [16, 19689.0] + - - [256, 8976, 1, 19712, 256, 256, 256, 19712] + - [22, 19695.0] + - - [256, 8976, 1, 19968, 256, 256, 256, 19968] + - [22, 19705.0] + - - [256, 8976, 1, 20480, 256, 256, 256, 20480] + - [22, 18912.0] + - - [256, 8976, 1, 20992, 256, 256, 256, 20992] + - [10, 19654.0] + - - [256, 8976, 1, 22016, 256, 256, 256, 22016] + - [10, 19559.0] + - - [256, 8976, 1, 26112, 256, 256, 256, 26112] + - [22, 19539.0] + - - [256, 8976, 1, 33536, 256, 256, 256, 33536] + - [10, 19700.0] + - - [256, 8976, 1, 44505, 256, 256, 256, 44505] + - [18, 19867.0] + - - [256, 32768, 1, 128, 256, 256, 256, 128] + - [1, 20351.0] + - - [480, 32768, 1, 1024, 480, 480, 480, 1024] + - [33, 21063.0] + - - [512, 32768, 1, 256, 512, 512, 512, 256] + - [28, 21837.0] + - - [1024, 1600, 1, 1, 1024, 1024, 1024, 1] + - [19, 353.0] + - - [1024, 1600, 1, 1024, 1024, 1024, 1024, 1024] + - [15, 17772.0] + - - [1024, 1792, 1, 256, 1024, 1024, 1024, 256] + - [3, 18152.0] + - - [1024, 2048, 1, 256, 1024, 1024, 1024, 256] + - [10, 18089.0] + - - [1024, 2560, 1, 256, 1024, 1024, 1024, 256] + - [10, 19890.0] + - - [1024, 3072, 1, 256, 1024, 1024, 1024, 256] + - [10, 19748.0] + - - [1024, 3328, 1, 256, 1024, 1024, 1024, 256] + - [10, 19694.0] + - - [1024, 3840, 1, 256, 1024, 1024, 1024, 256] + - [10, 20730.0] + - - [1024, 4096, 1, 256, 1024, 1024, 1024, 256] + - [10, 20641.0] + - - [1024, 4608, 1, 256, 1024, 1024, 1024, 256] + - [10, 20377.0] + - - [1024, 4864, 1, 256, 1024, 1024, 1024, 256] + - [22, 20278.0] + - - [1024, 5120, 1, 256, 1024, 1024, 1024, 256] + - [10, 21084.0] + - - [1024, 5632, 1, 256, 1024, 1024, 1024, 256] + - [10, 20936.0] + - - [1024, 6144, 1, 256, 1024, 1024, 1024, 256] + - [3, 20707.0] + - - [1024, 6400, 1, 256, 1024, 1024, 1024, 256] + - [10, 21361.0] + - - [1024, 7168, 1, 256, 1024, 1024, 1024, 256] + - [10, 21061.0] + - - [1024, 7424, 1, 256, 1024, 1024, 1024, 256] + - [10, 21053.0] + - - [1024, 7680, 1, 256, 1024, 1024, 1024, 256] + - [10, 21555.0] + - - [1024, 7936, 1, 256, 1024, 1024, 1024, 256] + - [3, 21478.0] + - - [1024, 8192, 1, 256, 1024, 1024, 1024, 256] + - [10, 21394.0] + - - [1024, 8448, 1, 256, 1024, 1024, 1024, 256] + - [3, 21294.0] + - - [1024, 8704, 1, 256, 1024, 1024, 1024, 256] + - [10, 21209.0] + - - [1024, 8960, 1, 256, 1024, 1024, 1024, 256] + - [3, 21716.0] + - - [1024, 9728, 1, 256, 1024, 1024, 1024, 256] + - [10, 21462.0] + - - [1024, 9984, 1, 256, 1024, 1024, 1024, 256] + - [10, 21379.0] + - - [1024, 10240, 1, 256, 1024, 1024, 1024, 256] + - [10, 21792.0] + - - [1024, 10496, 1, 256, 1024, 1024, 1024, 256] + - [3, 21775.0] + - - [1024, 11008, 1, 256, 1024, 1024, 1024, 256] + - [3, 21612.0] + - - [1024, 11264, 1, 256, 1024, 1024, 1024, 256] + - [3, 21528.0] + - - [1024, 11520, 1, 256, 1024, 1024, 1024, 256] + - [10, 21880.0] + - - [1024, 12288, 1, 256, 1024, 1024, 1024, 256] + - [10, 21686.0] + - - [1024, 13312, 1, 256, 1024, 1024, 1024, 256] + - [3, 21846.0] + - - [1024, 13568, 1, 256, 1024, 1024, 1024, 256] + - [3, 21778.0] + - - [1024, 14336, 1, 256, 1024, 1024, 1024, 256] + - [3, 21995.0] + - - [1024, 14592, 1, 256, 1024, 1024, 1024, 256] + - [10, 21941.0] + - - [1024, 14848, 1, 256, 1024, 1024, 1024, 256] + - [10, 21862.0] + - - [1024, 15104, 1, 256, 1024, 1024, 1024, 256] + - [10, 21791.0] + - - [1024, 16128, 1, 256, 1024, 1024, 1024, 256] + - [10, 21924.0] + - - [1024, 17152, 1, 256, 1024, 1024, 1024, 256] + - [3, 21993.0] + - - [1024, 18944, 1, 256, 1024, 1024, 1024, 256] + - [10, 21956.0] + - - [1024, 19712, 1, 256, 1024, 1024, 1024, 256] + - [10, 22087.0] + - - [1024, 19968, 1, 256, 1024, 1024, 1024, 256] + - [3, 22060.0] + - - [1024, 20480, 1, 256, 1024, 1024, 1024, 256] + - [3, 22212.0] + - - [1024, 20992, 1, 256, 1024, 1024, 1024, 256] + - [3, 22168.0] + - - [1024, 21504, 1, 256, 1024, 1024, 1024, 256] + - [3, 22062.0] + - - [1024, 22016, 1, 256, 1024, 1024, 1024, 256] + - [3, 22216.0] + - - [1024, 23552, 1, 256, 1024, 1024, 1024, 256] + - [10, 22178.0] + - - [1024, 28672, 1, 256, 1024, 1024, 1024, 256] + - [3, 22322.0] + - - [1024, 32768, 1, 512, 1024, 1024, 1024, 512] + - [3, 22596.0] + - - [1024, 32768, 1, 1024, 1024, 1024, 1024, 1024] + - [22, 22788.0] + - - [1024, 33536, 1, 256, 1024, 1024, 1024, 256] + - [16, 22344.0] + - - [1024, 40448, 1, 256, 1024, 1024, 1024, 256] + - [16, 22381.0] + - - [2048, 960, 1, 2048, 2048, 2048, 2048, 2048] + - [16, 18624.0] + - - [2048, 1024, 1, 1, 2048, 2048, 2048, 1] + - [4, 289.0] + - - [2048, 1024, 1, 256, 2048, 2048, 2048, 256] + - [10, 18199.0] + - - [3200, 1024, 1, 2048, 3200, 3200, 3200, 2048] + - [3, 22132.0] + - - [4096, 1024, 1, 1, 4096, 4096, 4096, 1] + - [7, 339.0] + - - [1024, 3840, 1, 1024, 1024, 1024, 1024, 1024] + - [3, 21620.0] + - - [1024, 3840, 1, 4096, 1024, 1024, 1024, 4096] + - [24, 22193.0] + - - [1024, 3968, 1, 1024, 1024, 1024, 1024, 1024] + - [16, 20682.0] + - - [1024, 3968, 1, 4096, 1024, 1024, 1024, 4096] + - [22, 21265.0] + - - [1024, 3968, 1, 42720, 1024, 1024, 1024, 42720] + - [3, 21719.0] + - - [1024, 6528, 1, 1024, 1024, 1024, 1024, 1024] + - [10, 21721.0] + - - [1024, 6528, 1, 4096, 1024, 1024, 1024, 4096] + - [3, 21943.0] + - - [1024, 6528, 1, 42720, 1024, 1024, 1024, 42720] + - [10, 22118.0] + - - [1024, 7104, 1, 1024, 1024, 1024, 1024, 1024] + - [16, 21577.0] + - - [1024, 7104, 1, 4096, 1024, 1024, 1024, 4096] + - [3, 21866.0] + - - [1024, 7104, 1, 42720, 1024, 1024, 1024, 42720] + - [14, 22019.0] + - - [1024, 7200, 1, 1024, 1024, 1024, 1024, 1024] + - [22, 21590.0] + - - [1024, 7200, 1, 4096, 1024, 1024, 1024, 4096] + - [3, 22102.0] + - - [1024, 7200, 1, 42720, 1024, 1024, 1024, 42720] + - [16, 22200.0] + - - [1024, 8064, 1, 1024, 1024, 1024, 1024, 1024] + - [10, 21888.0] + - - [1024, 8064, 1, 4096, 1024, 1024, 1024, 4096] + - [10, 22254.0] + - - [1024, 8160, 1, 1024, 1024, 1024, 1024, 1024] + - [16, 22099.0] + - - [1024, 8160, 1, 4096, 1024, 1024, 1024, 4096] + - [10, 22427.0] + - - [1024, 9216, 1, 1024, 1024, 1024, 1024, 1024] + - [22, 22419.0] + - - [1024, 9216, 1, 4096, 1024, 1024, 1024, 4096] + - [22, 22578.0] + - - [1024, 9520, 1, 1024, 1024, 1024, 1024, 1024] + - [16, 22279.0] + - - [1024, 9520, 1, 4096, 1024, 1024, 1024, 4096] + - [10, 22502.0] + - - [1024, 9520, 1, 42720, 1024, 1024, 1024, 42720] + - [14, 22446.0] + - - [1024, 10064, 1, 1024, 1024, 1024, 1024, 1024] + - [22, 22153.0] + - - [1024, 10064, 1, 4096, 1024, 1024, 1024, 4096] + - [22, 22344.0] + - - [1024, 10080, 1, 1024, 1024, 1024, 1024, 1024] + - [10, 22211.0] + - - [1024, 10080, 1, 4096, 1024, 1024, 1024, 4096] + - [22, 22382.0] + - - [1024, 10080, 1, 42720, 1024, 1024, 1024, 42720] + - [28, 22359.0] + - - [1024, 10200, 1, 1024, 1024, 1024, 1024, 1024] + - [22, 22382.0] + - - [1024, 10200, 1, 4096, 1024, 1024, 1024, 4096] + - [22, 22494.0] + - - [4096, 3840, 1, 1024, 4096, 4096, 4096, 1024] + - [3, 22717.0] + - - [4096, 3968, 1, 1024, 4096, 4096, 4096, 1024] + - [10, 22678.0] + - - [4096, 6528, 1, 1024, 4096, 4096, 4096, 1024] + - [3, 22835.0] + - - [4096, 7104, 1, 1024, 4096, 4096, 4096, 1024] + - [3, 22645.0] + - - [4096, 7200, 1, 1024, 4096, 4096, 4096, 1024] + - [3, 22531.0] + - - [4096, 8064, 1, 1024, 4096, 4096, 4096, 1024] + - [3, 22901.0] + - - [4096, 8160, 1, 1024, 4096, 4096, 4096, 1024] + - [3, 22765.0] + - - [4096, 9216, 1, 1024, 4096, 4096, 4096, 1024] + - [3, 22888.0] + - - [4096, 9520, 1, 1024, 4096, 4096, 4096, 1024] + - [3, 22684.0] + - - [4096, 10064, 1, 1024, 4096, 4096, 4096, 1024] + - [3, 22745.0] + - - [4096, 10080, 1, 1024, 4096, 4096, 4096, 1024] + - [3, 22792.0] + - - [4096, 10200, 1, 1024, 4096, 4096, 4096, 1024] + - [3, 22745.0] + - - [1024, 3240, 1, 1024, 1024, 1024, 1024, 1024] + - [16, 19859.0] + - - [1024, 3240, 1, 4096, 1024, 1024, 1024, 4096] + - [22, 20324.0] + - - [1024, 3960, 1, 1024, 1024, 1024, 1024, 1024] + - [22, 20331.0] + - - [1024, 3960, 1, 4096, 1024, 1024, 1024, 4096] + - [22, 21232.0] + - - [1024, 3960, 1, 42720, 1024, 1024, 1024, 42720] + - [33, 21657.0] + - - [4096, 3240, 1, 1024, 4096, 4096, 4096, 1024] + - [16, 21932.0] + - - [4096, 3960, 1, 1024, 4096, 4096, 4096, 1024] + - [3, 22536.0] + - - [289, 128, 64, 768, 289, 289, 289, 768] + - [28, 15443.0] + - - [289, 160, 64, 768, 289, 289, 289, 768] + - [2, 11289.0] + - - [289, 192, 64, 768, 289, 289, 289, 768] + - [15, 13294.0] + - - [3136, 256, 32, 64, 3136, 3136, 3136, 64] + - [26, 18838.0] + - - [784, 512, 32, 128, 784, 784, 784, 128] + - [14, 17969.0] + - - [784, 128, 32, 512, 784, 784, 784, 512] + - [16, 17517.0] + - - [196, 1024, 32, 256, 196, 196, 196, 256] + - [3, 15421.0] + - - [3136, 128, 64, 64, 3136, 3136, 3136, 64] + - [22, 18851.0] + - - [3136, 256, 64, 128, 3136, 3136, 3136, 128] + - [3, 20822.0] + - - [784, 512, 64, 256, 784, 784, 784, 256] + - [3, 19110.0] + - - [3136, 128, 64, 256, 3136, 3136, 3136, 256] + - [3, 20740.0] + - - [3136, 256, 64, 256, 3136, 3136, 3136, 256] + - [3, 21731.0] + - - [196, 1024, 64, 512, 196, 196, 196, 512] + - [10, 16544.0] + - - [784, 256, 64, 512, 784, 784, 784, 512] + - [16, 19300.0] + - - [784, 512, 64, 512, 784, 784, 784, 512] + - [16, 19603.0] + - - [196, 512, 64, 1024, 196, 196, 196, 1024] + - [22, 16628.0] + - - [196, 1024, 64, 1024, 196, 196, 196, 1024] + - [10, 16849.0] + - - [3136, 128, 32, 64, 3136, 3136, 3136, 64] + - [33, 17271.0] + - - [3136, 256, 32, 128, 3136, 3136, 3136, 128] + - [3, 20715.0] + - - [784, 512, 32, 256, 784, 784, 784, 256] + - [3, 18851.0] + - - [3136, 128, 32, 256, 3136, 3136, 3136, 256] + - [22, 21017.0] + - - [3136, 256, 32, 256, 3136, 3136, 3136, 256] + - [10, 21371.0] + - - [196, 1024, 32, 512, 196, 196, 196, 512] + - [3, 16321.0] + - - [784, 256, 32, 512, 784, 784, 784, 512] + - [3, 18726.0] + - - [784, 512, 32, 512, 784, 784, 784, 512] + - [16, 19374.0] + - - [196, 512, 32, 1024, 196, 196, 196, 1024] + - [33, 16342.0] + - - [196, 1024, 32, 1024, 196, 196, 196, 1024] + - [10, 16578.0] + - - [7680, 8192, 1, 8192, 7680, 7680, 7680, 8192] + - [33, 23011.0] + - - [3840, 4096, 1, 4096, 3840, 3840, 3840, 4096] + - [22, 21756.0] + - - [1920, 2048, 1, 2048, 1920, 1920, 1920, 2048] + - [33, 21363.0] + - - [8192, 7680, 1, 8192, 8192, 8192, 8192, 8192] + - [22, 22663.0] + - - [4096, 3840, 1, 4096, 4096, 4096, 4096, 4096] + - [16, 21676.0] + - - [2048, 1920, 1, 2048, 2048, 2048, 2048, 2048] + - [10, 21731.0] + - - [8192, 8192, 1, 8192, 8192, 8192, 8192, 8192] + - [5, 22647.0] + - - [4096, 4096, 1, 4096, 4096, 4096, 4096, 4096] + - [10, 21166.0] + - - [2048, 2048, 1, 2048, 2048, 2048, 2048, 2048] + - [10, 21335.0] + - - [1024, 4096, 1, 2, 1024, 1024, 1024, 2] + - [7, 536.0] + - - [4096, 512, 1, 1024, 4096, 4096, 4096, 1024] + - [3, 18974.0] + - - [1024, 1280, 1, 2, 1024, 1024, 1024, 2] + - [27, 394.0] + - - [1024, 1280, 1, 1024, 1024, 1024, 1024, 1024] + - [10, 19565.0] + - - [1024, 1280, 1, 4096, 1024, 1024, 1024, 4096] + - [24, 20988.0] + - - [4096, 1024, 1, 1024, 4096, 4096, 4096, 1024] + - [3, 21334.0] + - - [4096, 1280, 1, 1024, 4096, 4096, 4096, 1024] + - [3, 21687.0] + - - [1024, 4992, 1, 2, 1024, 1024, 1024, 2] + - [21, 748.0] + - - [1024, 4992, 1, 1024, 1024, 1024, 1024, 1024] + - [3, 21177.0] + - - [1024, 4992, 1, 4096, 1024, 1024, 1024, 4096] + - [10, 21797.0] + - - [4096, 4992, 1, 1024, 4096, 4096, 4096, 1024] + - [3, 22629.0] + - - [1024, 5120, 1, 2, 1024, 1024, 1024, 2] + - [7, 656.0] + - - [1024, 5120, 1, 1024, 1024, 1024, 1024, 1024] + - [10, 22106.0] + - - [1024, 5120, 1, 4096, 1024, 1024, 1024, 4096] + - [22, 22446.0] + - - [4096, 5120, 1, 1024, 4096, 4096, 4096, 1024] + - [22, 22859.0] + - - [1024, 5248, 1, 2, 1024, 1024, 1024, 2] + - [19, 640.0] + - - [1024, 5248, 1, 1024, 1024, 1024, 1024, 1024] + - [22, 21556.0] + - - [1024, 5248, 1, 4096, 1024, 1024, 1024, 4096] + - [22, 21884.0] + - - [4096, 5248, 1, 1024, 4096, 4096, 4096, 1024] + - [3, 22754.0] + - - [1024, 2560, 1, 2, 1024, 1024, 1024, 2] + - [15, 561.0] + - - [1024, 2560, 1, 1024, 1024, 1024, 1024, 1024] + - [10, 21400.0] + - - [1024, 2560, 1, 4096, 1024, 1024, 1024, 4096] + - [24, 21693.0] + - - [4096, 2560, 1, 1024, 4096, 4096, 4096, 1024] + - [3, 22677.0] + - - [1024, 3072, 1, 2, 1024, 1024, 1024, 2] + - [1, 704.0] + - - [1024, 3072, 1, 1024, 1024, 1024, 1024, 1024] + - [10, 20977.0] + - - [1024, 3072, 1, 4096, 1024, 1024, 1024, 4096] + - [10, 21195.0] + - - [4096, 3072, 1, 1024, 4096, 4096, 4096, 1024] + - [10, 22478.0] + - - [1024, 1152, 1, 2, 1024, 1024, 1024, 2] + - [32, 641.0] + - - [1024, 1152, 1, 1024, 1024, 1024, 1024, 1024] + - [10, 18578.0] + - - [1024, 1152, 1, 4096, 1024, 1024, 1024, 4096] + - [35, 18953.0] + - - [4096, 1152, 1, 1024, 4096, 4096, 4096, 1024] + - [3, 21015.0] + - - [479, 32768, 1, 1024, 479, 479, 479, 1024] + - [33, 20851.0] + - - [1024, 8192, 1, 1024, 1024, 1024, 1024, 1024] + - [16, 22184.0] + - - [1024, 8192, 1, 4096, 1024, 1024, 1024, 4096] + - [10, 22555.0] + - - [1024, 8192, 1, 33712, 1024, 1024, 1024, 33712] + - [30, 22473.0] + - - [1024, 9600, 1, 1024, 1024, 1024, 1024, 1024] + - [10, 22491.0] + - - [1024, 9600, 1, 4096, 1024, 1024, 1024, 4096] + - [10, 22803.0] + - - [1024, 9600, 1, 33712, 1024, 1024, 1024, 33712] + - [28, 22723.0] + - - [4096, 8192, 1, 1024, 4096, 4096, 4096, 1024] + - [3, 22871.0] + - - [4096, 9600, 1, 1024, 4096, 4096, 4096, 1024] + - [3, 22906.0] + - - [1024, 1024, 64, 64, 1024, 1024, 1024, 64] + - [2, 17851.0] + - - [1024, 16384, 1, 3072, 1024, 1024, 1024, 3072] + - [10, 22754.0] + - - [1024, 2048, 1, 30592, 1024, 1024, 1024, 30592] + - [16, 20420.0] + - - [640, 2048, 1, 2560, 640, 640, 640, 2560] + - [5, 21229.0] + - - [1024, 1024, 64, 96, 1024, 1024, 1024, 96] + - [3, 20809.0] + - - [1536, 4096, 1, 4608, 1536, 1536, 1536, 4608] + - [10, 21983.0] + - - [512, 512, 256, 64, 512, 512, 512, 64] + - [9, 16134.0] + - - [2048, 1024, 1, 8192, 2048, 2048, 2048, 8192] + - [10, 20121.0] + - - [4096, 16384, 1, 1024, 4096, 4096, 4096, 1024] + - [3, 22941.0] + - - [1024, 8192, 1, 50304, 1024, 1024, 1024, 50304] + - [5, 22353.0] + - - [1536, 8192, 1, 50304, 1536, 1536, 1536, 50304] + - [33, 22162.0] + - - [6144, 8192, 1, 1536, 6144, 6144, 6144, 1536] + - [10, 22998.0] + - - [1024, 4096, 1, 30592, 1024, 1024, 1024, 30592] + - [22, 22339.0] + - - [1536, 4096, 1, 6144, 1536, 1536, 1536, 6144] + - [3, 21906.0] + - - [1024, 16384, 1, 4096, 1024, 1024, 1024, 4096] + - [22, 22253.0] + - - [1024, 16384, 1, 50304, 1024, 1024, 1024, 50304] + - [22, 22384.0] + - - [1024, 4096, 1, 3072, 1024, 1024, 1024, 3072] + - [3, 21948.0] + - - [1536, 8192, 1, 1536, 1536, 1536, 1536, 1536] + - [22, 22589.0] + - - [1024, 2048, 1, 3072, 1024, 1024, 1024, 3072] + - [10, 20033.0] + - - [2560, 2048, 1, 7680, 2560, 2560, 2560, 7680] + - [5, 22563.0] + - - [2048, 1024, 1, 2048, 2048, 2048, 2048, 2048] + - [3, 19574.0] + - - [2048, 1024, 1, 30592, 2048, 2048, 2048, 30592] + - [10, 20311.0] + - - [8192, 1024, 1, 2048, 8192, 8192, 8192, 2048] + - [3, 22454.0] + - - [2560, 2048, 1, 2560, 2560, 2560, 2560, 2560] + - [5, 22587.0] + - - [1536, 8192, 1, 4608, 1536, 1536, 1536, 4608] + - [22, 22646.0] + - - [1024, 2048, 1, 50304, 1024, 1024, 1024, 50304] + - [3, 20349.0] + - - [1024, 1024, 32, 64, 1024, 1024, 1024, 64] + - [22, 19368.0] + - - [1536, 8192, 1, 6144, 1536, 1536, 1536, 6144] + - [10, 22626.0] + - - [1024, 1024, 256, 64, 1024, 1024, 1024, 64] + - [12, 17252.0] + - - [512, 512, 40, 64, 512, 512, 512, 64] + - [14, 17605.0] + - - [1536, 4096, 1, 50304, 1536, 1536, 1536, 50304] + - [5, 21938.0] + - - [1024, 1024, 128, 96, 1024, 1024, 1024, 96] + - [10, 21110.0] + - - [1024, 8192, 1, 3072, 1024, 1024, 1024, 3072] + - [3, 22502.0] + - - [1024, 1024, 128, 64, 1024, 1024, 1024, 64] + - [15, 17230.0] + - - [1024, 4096, 1, 50304, 1024, 1024, 1024, 50304] + - [10, 22017.0] + - - [6144, 4096, 1, 1536, 6144, 6144, 6144, 1536] + - [3, 22889.0] + - - [1024, 16384, 1, 1024, 1024, 1024, 1024, 1024] + - [3, 22537.0] + - - [2560, 2048, 1, 1920, 2560, 2560, 2560, 1920] + - [5, 22545.0] + - - [2048, 1024, 1, 6144, 2048, 2048, 2048, 6144] + - [3, 20085.0] + - - [512, 512, 128, 64, 512, 512, 512, 64] + - [29, 18399.0] + - - [1024, 8192, 1, 30592, 1024, 1024, 1024, 30592] + - [3, 22590.0] + - - [1536, 4096, 1, 1536, 1536, 1536, 1536, 1536] + - [3, 21673.0] + - - [128, 128, 1024, 64, 128, 128, 128, 64] + - [36, 15450.0] + - - [1024, 8192, 1, 30528, 1024, 1024, 1024, 30528] + - [30, 22435.0] + - - [1024, 3456, 1, 1024, 1024, 1024, 1024, 1024] + - [22, 21284.0] + - - [1024, 3456, 1, 512, 1024, 1024, 1024, 512] + - [3, 21059.0] + - - [1024, 4096, 1, 512, 1024, 1024, 1024, 512] + - [3, 20701.0] + - - [1024, 6912, 1, 1024, 1024, 1024, 1024, 1024] + - [3, 21642.0] + - - [1024, 6912, 1, 512, 1024, 1024, 1024, 512] + - [16, 21635.0] + - - [256, 55296, 1, 128, 256, 256, 256, 128] + - [3, 20732.0] + - - [256, 6912, 1, 128, 256, 256, 256, 128] + - [1, 15928.0] + - - [480, 3456, 1, 1024, 480, 480, 480, 1024] + - [10, 17728.0] + - - [480, 4096, 1, 1024, 480, 480, 480, 1024] + - [33, 17464.0] + - - [480, 6912, 1, 1024, 480, 480, 480, 1024] + - [22, 19405.0] + - - [512, 3456, 1, 256, 512, 512, 512, 256] + - [10, 15599.0] + - - [512, 4096, 1, 256, 512, 512, 512, 256] + - [10, 16220.0] + - - [512, 55296, 1, 256, 512, 512, 512, 256] + - [3, 22117.0] + - - [512, 6912, 1, 256, 512, 512, 512, 256] + - [3, 19991.0] + - - [1024, 1280, 1, 30528, 1024, 1024, 1024, 30528] + - [24, 21778.0] + - - [1024, 1600, 1, 30528, 1024, 1024, 1024, 30528] + - [18, 18450.0] + - - [1024, 10240, 1, 1024, 1024, 1024, 1024, 1024] + - [22, 22512.0] + - - [1024, 10240, 1, 4096, 1024, 1024, 1024, 4096] + - [10, 22726.0] + - - [4096, 10240, 1, 1024, 4096, 4096, 4096, 1024] + - [3, 22871.0] + - - [128, 128, 1280, 64, 128, 128, 128, 64] + - [22, 18631.0] + - - [1024, 1640, 1, 30528, 1024, 1024, 1024, 30528] + - [3, 18997.0] + - - [1024, 10496, 1, 1024, 1024, 1024, 1024, 1024] + - [16, 22539.0] + - - [1024, 10496, 1, 4096, 1024, 1024, 1024, 4096] + - [22, 22698.0] + - - [4096, 10496, 1, 1024, 4096, 4096, 4096, 1024] + - [3, 22887.0] + - - [128, 128, 1312, 64, 128, 128, 128, 64] + - [10, 18506.0] + - - [1024, 6144, 1, 4096, 1024, 1024, 1024, 4096] + - [10, 21887.0] + - - [4096, 6144, 1, 1024, 4096, 4096, 4096, 1024] + - [3, 22872.0] + - - [1024, 6144, 1, 1024, 1024, 1024, 1024, 1024] + - [3, 21590.0] + - - [512, 512, 192, 64, 512, 512, 512, 64] + - [21, 17651.0] + - - [256, 6912, 1, 1, 256, 256, 256, 1] + - [9, 330.0] + - - [1024, 10224, 1, 1024, 1024, 1024, 1024, 1024] + - [16, 22406.0] + - - [1024, 10192, 1, 1024, 1024, 1024, 1024, 1024] + - [22, 22408.0] + - - [1024, 10208, 1, 1024, 1024, 1024, 1024, 1024] + - [16, 22423.0] + - - [1024, 10224, 1, 4096, 1024, 1024, 1024, 4096] + - [33, 22700.0] + - - [1024, 10224, 1, 3072, 1024, 1024, 1024, 3072] + - [10, 22706.0] + - - [4096, 10224, 1, 1024, 4096, 4096, 4096, 1024] + - [3, 22739.0] + - - [1024, 10240, 1, 3072, 1024, 1024, 1024, 3072] + - [10, 22797.0] + - - [1024, 10192, 1, 3072, 1024, 1024, 1024, 3072] + - [10, 22665.0] + - - [4096, 10192, 1, 1024, 4096, 4096, 4096, 1024] + - [3, 22752.0] + - - [1024, 10192, 1, 4096, 1024, 1024, 1024, 4096] + - [22, 22494.0] + - - [1024, 10200, 1, 3072, 1024, 1024, 1024, 3072] + - [22, 22648.0] + - - [1024, 10184, 1, 1024, 1024, 1024, 1024, 1024] + - [16, 22349.0] + - - [4096, 10208, 1, 1024, 4096, 4096, 4096, 1024] + - [3, 22767.0] + - - [1024, 10208, 1, 3072, 1024, 1024, 1024, 3072] + - [22, 22704.0] + - - [1024, 10208, 1, 4096, 1024, 1024, 1024, 4096] + - [22, 22571.0] + - - [1024, 10224, 1, 2048, 1024, 1024, 1024, 2048] + - [10, 22647.0] + - - [1024, 10240, 1, 2048, 1024, 1024, 1024, 2048] + - [10, 22777.0] + - - [1024, 10120, 1, 1024, 1024, 1024, 1024, 1024] + - [16, 22300.0] + - - [1024, 10192, 1, 2048, 1024, 1024, 1024, 2048] + - [22, 22631.0] + - - [1024, 10152, 1, 1024, 1024, 1024, 1024, 1024] + - [10, 22315.0] + - - [1024, 10080, 1, 3072, 1024, 1024, 1024, 3072] + - [10, 22507.0] + - - [100352, 512, 1, 256, 100352, 100352, 100352, 256] + - [3, 22371.0] + - - [12544, 2048, 1, 1024, 12544, 12544, 12544, 1024] + - [3, 22812.0] + - - [200704, 512, 1, 256, 200704, 200704, 200704, 256] + - [28, 22448.0] + - - [25088, 1024, 1, 512, 25088, 25088, 25088, 512] + - [10, 22606.0] + - - [50176, 1024, 1, 512, 50176, 50176, 50176, 512] + - [10, 22810.0] + - - [6272, 2048, 1, 1024, 6272, 6272, 6272, 1024] + - [16, 22379.0] + - - [3136, 128, 128, 256, 3136, 3136, 3136, 256] + - [3, 21113.0] + - - [3136, 128, 256, 256, 3136, 3136, 3136, 256] + - [3, 21344.0] + - - [784, 256, 128, 512, 784, 784, 784, 512] + - [3, 19588.0] + - - [784, 256, 256, 512, 784, 784, 784, 512] + - [3, 19739.0] + - - [128, 128, 2048, 64, 128, 128, 128, 64] + - [0, 15361.0] + - - [1024, 2560, 1, 30528, 1024, 1024, 1024, 30528] + - [18, 22507.0] + - - [128, 128, 1536, 64, 128, 128, 128, 64] + - [17, 17880.0] + - - [1024, 12288, 1, 4096, 1024, 1024, 1024, 4096] + - [22, 22675.0] + - - [1024, 12288, 1, 1024, 1024, 1024, 1024, 1024] + - [10, 22478.0] + - - [4096, 12288, 1, 1024, 4096, 4096, 4096, 1024] + - [3, 22948.0] + - - [1024, 1920, 1, 30528, 1024, 1024, 1024, 30528] + - [22, 21951.0] + - - [128, 128, 192, 64, 128, 128, 128, 64] + - [25, 13827.0] + - - [768, 2048, 1, 2, 768, 768, 768, 2] + - [0, 705.0] + - - [3072, 2048, 1, 768, 3072, 3072, 3072, 768] + - [16, 21550.0] + - - [768, 2048, 1, 3072, 768, 768, 768, 3072] + - [10, 20363.0] + - - [768, 2048, 1, 768, 768, 768, 768, 768] + - [28, 18857.0] + - - [384, 384, 144, 64, 384, 384, 384, 64] + - [33, 19298.0] + - - [768, 4608, 1, 2, 768, 768, 768, 2] + - [9, 889.0] + - - [3072, 4608, 1, 768, 3072, 3072, 3072, 768] + - [22, 22422.0] + - - [768, 4608, 1, 3072, 768, 768, 768, 3072] + - [22, 21878.0] + - - [768, 4608, 1, 768, 768, 768, 768, 768] + - [16, 20920.0] + - - [512, 512, 48, 64, 512, 512, 512, 64] + - [23, 17575.0] + - - [128, 128, 256, 64, 128, 128, 128, 64] + - [19, 11732.0] + - - [384, 384, 192, 64, 384, 384, 384, 64] + - [26, 19695.0] + - - [1024, 4608, 1, 2, 1024, 1024, 1024, 2] + - [19, 867.0] + - - [4096, 4608, 1, 1024, 4096, 4096, 4096, 1024] + - [3, 22777.0] + - - [1024, 4608, 1, 4096, 1024, 1024, 1024, 4096] + - [10, 21749.0] + - - [1024, 4608, 1, 1024, 1024, 1024, 1024, 1024] + - [22, 20923.0] + - - [8192, 1024, 1, 1024, 8192, 8192, 8192, 1024] + - [10, 22262.0] + - - [8192, 4096, 1, 1024, 8192, 8192, 8192, 1024] + - [3, 22887.0] + - - [196, 1024, 128, 256, 196, 196, 196, 256] + - [16, 15800.0] + - - [196, 1024, 256, 256, 196, 196, 196, 256] + - [3, 16076.0] + - - [196, 256, 128, 1024, 196, 196, 196, 1024] + - [22, 16113.0] + - - [196, 256, 256, 1024, 196, 196, 196, 1024] + - [33, 16451.0] + - - [196, 512, 128, 1024, 196, 196, 196, 1024] + - [33, 16671.0] + - - [196, 512, 256, 1024, 196, 196, 196, 1024] + - [22, 17024.0] + - - [3072, 256, 2, 1024, 3072, 3072, 3072, 1024] + - [16, 19671.0] + - - [768, 2048, 2, 512, 768, 768, 768, 512] + - [10, 20460.0] + - - [2904, 256, 2, 1024, 2904, 2904, 2904, 1024] + - [10, 18769.0] + - - [864, 2048, 2, 512, 864, 864, 864, 512] + - [3, 19243.0] + - - [2992, 256, 2, 1024, 2992, 2992, 2992, 1024] + - [3, 18211.0] + - - [3400, 256, 2, 1024, 3400, 3400, 3400, 1024] + - [3, 17797.0] + - - [4032, 256, 2, 1024, 4032, 4032, 4032, 1024] + - [10, 18566.0] + - - [15200, 128, 2, 512, 15200, 15200, 15200, 512] + - [10, 20603.0] + - - [12288, 128, 2, 512, 12288, 12288, 12288, 512] + - [3, 20554.0] + - - [888, 2048, 2, 512, 888, 888, 888, 512] + - [3, 19744.0] + - - [13600, 128, 2, 512, 13600, 13600, 13600, 512] + - [10, 19908.0] + - - [12880, 128, 2, 512, 12880, 12880, 12880, 512] + - [16, 18479.0] + - - [3456, 256, 2, 1024, 3456, 3456, 3456, 1024] + - [28, 18546.0] + - - [2944, 256, 2, 1024, 2944, 2944, 2944, 1024] + - [28, 18441.0] + - - [2688, 256, 2, 1024, 2688, 2688, 2688, 1024] + - [33, 17229.0] + - - [13824, 128, 2, 512, 13824, 13824, 13824, 512] + - [22, 21118.0] + - - [3168, 256, 2, 1024, 3168, 3168, 3168, 1024] + - [3, 19849.0] + - - [3360, 256, 2, 1024, 3360, 3360, 3360, 1024] + - [3, 17719.0] + - - [3552, 256, 2, 1024, 3552, 3552, 3552, 1024] + - [16, 18868.0] + - - [11616, 128, 2, 512, 11616, 11616, 11616, 512] + - [3, 17895.0] + - - [4200, 256, 2, 1024, 4200, 4200, 4200, 1024] + - [28, 18944.0] + - - [840, 2048, 2, 512, 840, 840, 840, 512] + - [3, 18142.0] + - - [14208, 128, 2, 512, 14208, 14208, 14208, 512] + - [3, 19488.0] + - - [11968, 128, 2, 512, 11968, 11968, 11968, 512] + - [33, 18399.0] + - - [3264, 256, 2, 1024, 3264, 3264, 3264, 1024] + - [16, 16940.0] + - - [13600, 256, 2, 512, 13600, 13600, 13600, 512] + - [16, 20497.0] + - - [12880, 256, 2, 512, 12880, 12880, 12880, 512] + - [16, 20259.0] + - - [12288, 256, 2, 512, 12288, 12288, 12288, 512] + - [28, 20796.0] + - - [2816, 256, 2, 1024, 2816, 2816, 2816, 1024] + - [28, 17622.0] + - - [672, 2048, 2, 512, 672, 672, 672, 512] + - [22, 17012.0] + - - [13440, 128, 2, 512, 13440, 13440, 13440, 512] + - [16, 20239.0] + - - [13824, 256, 2, 512, 13824, 13824, 13824, 512] + - [28, 21781.0] + - - [15200, 256, 2, 512, 15200, 15200, 15200, 512] + - [28, 21566.0] + - - [3600, 256, 2, 1024, 3600, 3600, 3600, 1024] + - [3, 19366.0] + - - [4032, 1024, 2, 256, 4032, 4032, 4032, 256] + - [3, 20156.0] + - - [16128, 128, 2, 512, 16128, 16128, 16128, 512] + - [22, 20417.0] + - - [15200, 128, 1, 512, 15200, 15200, 15200, 512] + - [3, 17731.0] + - - [13600, 128, 1, 512, 13600, 13600, 13600, 512] + - [33, 16970.0] + - - [2904, 1024, 2, 256, 2904, 2904, 2904, 256] + - [10, 19535.0] + - - [2992, 1024, 2, 256, 2992, 2992, 2992, 256] + - [3, 19357.0] + - - [1536, 2048, 1, 1024, 1536, 1536, 1536, 1024] + - [3, 20774.0] + - - [24576, 128, 1, 256, 24576, 24576, 24576, 256] + - [3, 18138.0] + - - [24576, 512, 1, 256, 24576, 24576, 24576, 256] + - [3, 21613.0] + - - [25760, 128, 1, 256, 25760, 25760, 25760, 256] + - [10, 17914.0] + - - [25760, 512, 1, 256, 25760, 25760, 25760, 256] + - [16, 21155.0] + - - [6144, 256, 1, 512, 6144, 6144, 6144, 512] + - [3, 19293.0] + - - [6440, 256, 1, 512, 6440, 6440, 6440, 512] + - [3, 16233.0] + - - [13600, 512, 1, 128, 13600, 13600, 13600, 128] + - [20, 17313.0] + - - [9408, 512, 2, 128, 9408, 9408, 9408, 128] + - [11, 18510.0] + - - [56000, 256, 2, 64, 56000, 56000, 56000, 64] + - [29, 16111.0] + - - [2816, 1024, 2, 256, 2816, 2816, 2816, 256] + - [3, 20065.0] + - - [60800, 256, 1, 64, 60800, 60800, 60800, 64] + - [8, 18675.0] + - - [2944, 1024, 2, 256, 2944, 2944, 2944, 256] + - [3, 19947.0] + - - [11776, 512, 2, 128, 11776, 11776, 11776, 128] + - [28, 19824.0] + - - [11616, 512, 2, 128, 11616, 11616, 11616, 128] + - [22, 18764.0] + - - [4200, 1024, 2, 256, 4200, 4200, 4200, 256] + - [33, 19820.0] + - - [54400, 256, 1, 64, 54400, 54400, 54400, 64] + - [14, 18646.0] + - - [15200, 512, 1, 128, 15200, 15200, 15200, 128] + - [10, 18509.0] + - - [2688, 1024, 2, 256, 2688, 2688, 2688, 256] + - [3, 20150.0] + - - [12672, 512, 2, 128, 12672, 12672, 12672, 128] + - [33, 19997.0] + - - [11968, 512, 2, 128, 11968, 11968, 11968, 128] + - [3, 19419.0] + - - [46464, 256, 2, 64, 46464, 46464, 46464, 64] + - [26, 18947.0] + - - [2400, 256, 2, 1024, 2400, 2400, 2400, 1024] + - [24, 17737.0] + - - [2520, 256, 2, 1024, 2520, 2520, 2520, 1024] + - [10, 19282.0] + - - [2400, 1024, 2, 256, 2400, 2400, 2400, 256] + - [3, 18658.0] + - - [10752, 128, 2, 512, 10752, 10752, 10752, 512] + - [33, 19179.0] + - - [45632, 256, 2, 64, 45632, 45632, 45632, 64] + - [22, 17814.0] + - - [2520, 1024, 2, 256, 2520, 2520, 2520, 256] + - [33, 19407.0] + - - [53760, 256, 2, 64, 53760, 53760, 53760, 64] + - [34, 16965.0] + - - [2352, 256, 2, 1024, 2352, 2352, 2352, 1024] + - [35, 17309.0] + - - [47872, 256, 2, 64, 47872, 47872, 47872, 64] + - [22, 19276.0] + - - [47104, 256, 2, 64, 47104, 47104, 47104, 64] + - [23, 18732.0] + - - [50688, 256, 2, 64, 50688, 50688, 50688, 64] + - [22, 19202.0] + - - [45056, 256, 2, 64, 45056, 45056, 45056, 64] + - [16, 18899.0] + - - [13440, 512, 2, 128, 13440, 13440, 13440, 128] + - [16, 20137.0] + - - [2352, 1024, 2, 256, 2352, 2352, 2352, 256] + - [3, 18150.0] + - - [11264, 512, 2, 128, 11264, 11264, 11264, 128] + - [22, 19919.0] + - - [10560, 128, 2, 512, 10560, 10560, 10560, 512] + - [3, 18499.0] + - - [16128, 512, 2, 128, 16128, 16128, 16128, 128] + - [22, 20342.0] + - - [37632, 256, 2, 64, 37632, 37632, 37632, 64] + - [10, 18532.0] + - - [51520, 256, 2, 64, 51520, 51520, 51520, 64] + - [34, 18021.0] + - - [14000, 512, 2, 128, 14000, 14000, 14000, 128] + - [22, 19227.0] + - - [10560, 512, 2, 128, 10560, 10560, 10560, 128] + - [20, 19594.0] + - - [64512, 256, 2, 64, 64512, 64512, 64512, 64] + - [31, 14927.0] + - - [54400, 256, 2, 64, 54400, 54400, 54400, 64] + - [3, 16420.0] + - - [3264, 1024, 2, 256, 3264, 3264, 3264, 256] + - [3, 19341.0] + - - [10752, 512, 2, 128, 10752, 10752, 10752, 128] + - [28, 20377.0] + - - [3168, 1024, 2, 256, 3168, 3168, 3168, 256] + - [3, 19949.0] + - - [55296, 256, 2, 256, 55296, 55296, 55296, 256] + - [3, 22059.0] + - - [51520, 256, 2, 256, 51520, 51520, 51520, 256] + - [10, 21826.0] + - - [11408, 128, 2, 512, 11408, 11408, 11408, 512] + - [28, 19937.0] + - - [60800, 256, 2, 256, 60800, 60800, 60800, 256] + - [28, 22187.0] + - - [54400, 256, 2, 256, 54400, 54400, 54400, 256] + - [3, 22156.0] + - - [60800, 256, 2, 64, 60800, 60800, 60800, 64] + - [8, 19863.0] + - - [3800, 1024, 1, 256, 3800, 3800, 3800, 256] + - [3, 19851.0] + - - [3400, 1024, 1, 256, 3400, 3400, 3400, 256] + - [10, 19410.0] + - - [3072, 1024, 2, 256, 3072, 3072, 3072, 256] + - [3, 20713.0] + - - [3600, 1024, 2, 256, 3600, 3600, 3600, 256] + - [16, 19956.0] + - - [12288, 512, 2, 128, 12288, 12288, 12288, 128] + - [10, 20507.0] + - - [49152, 256, 2, 256, 49152, 49152, 49152, 256] + - [10, 21521.0] + - - [12880, 512, 2, 128, 12880, 12880, 12880, 128] + - [20, 19810.0] + - - [11408, 512, 2, 128, 11408, 11408, 11408, 128] + - [10, 20444.0] + - - [42240, 256, 2, 64, 42240, 42240, 42240, 64] + - [8, 19887.0] + - - [1008, 2048, 2, 512, 1008, 1008, 1008, 512] + - [3, 20512.0] + - - [3360, 1024, 2, 256, 3360, 3360, 3360, 256] + - [10, 20207.0] + - - [14208, 512, 2, 128, 14208, 14208, 14208, 128] + - [3, 21024.0] + - - [56832, 256, 2, 64, 56832, 56832, 56832, 64] + - [33, 19242.0] + - - [43008, 256, 2, 64, 43008, 43008, 43008, 64] + - [28, 19190.0] + - - [13600, 512, 2, 128, 13600, 13600, 13600, 128] + - [3, 20575.0] + - - [2640, 1024, 2, 256, 2640, 2640, 2640, 256] + - [10, 20248.0] + - - [13824, 512, 2, 128, 13824, 13824, 13824, 128] + - [3, 20885.0] + - - [3800, 256, 2, 1024, 3800, 3800, 3800, 1024] + - [3, 20350.0] + - - [55296, 256, 2, 64, 55296, 55296, 55296, 64] + - [23, 17001.0] + - - [2640, 256, 2, 1024, 2640, 2640, 2640, 1024] + - [3, 16369.0] + - - [15200, 512, 2, 128, 15200, 15200, 15200, 128] + - [16, 19464.0] + - - [3552, 1024, 2, 256, 3552, 3552, 3552, 256] + - [3, 19854.0] + - - [3456, 1024, 2, 256, 3456, 3456, 3456, 256] + - [3, 20933.0] + - - [49152, 256, 2, 64, 49152, 49152, 49152, 64] + - [3, 18228.0] + - - [3400, 1024, 2, 256, 3400, 3400, 3400, 256] + - [3, 20410.0] + - - [3800, 1024, 2, 256, 3800, 3800, 3800, 256] + - [3, 20792.0] + - - [6912, 256, 1, 512, 6912, 6912, 6912, 512] + - [10, 18711.0] + - - [6800, 256, 1, 512, 6800, 6800, 6800, 512] + - [3, 17107.0] + - - [27648, 128, 1, 256, 27648, 27648, 27648, 256] + - [10, 18945.0] + - - [27200, 128, 1, 256, 27200, 27200, 27200, 256] + - [3, 17559.0] + - - [30400, 128, 1, 256, 30400, 30400, 30400, 256] + - [10, 18945.0] + - - [7600, 256, 1, 512, 7600, 7600, 7600, 512] + - [3, 17846.0] + - - [6144, 1024, 1, 512, 6144, 6144, 6144, 512] + - [22, 20723.0] + - - [6912, 1024, 1, 512, 6912, 6912, 6912, 512] + - [22, 21747.0] + - - [6440, 1024, 1, 512, 6440, 6440, 6440, 512] + - [10, 20964.0] + - - [27648, 512, 1, 256, 27648, 27648, 27648, 256] + - [10, 21728.0] + - - [1728, 2048, 1, 1024, 1728, 1728, 1728, 1024] + - [3, 19829.0] + - - [27200, 512, 1, 256, 27200, 27200, 27200, 256] + - [10, 21341.0] + - - [6800, 1024, 1, 512, 6800, 6800, 6800, 512] + - [10, 21158.0] + - - [7600, 1024, 1, 512, 7600, 7600, 7600, 512] + - [3, 21738.0] + - - [30400, 512, 1, 256, 30400, 30400, 30400, 256] + - [10, 21508.0] + - - [12544, 1024, 1, 1024, 12544, 12544, 12544, 1024] + - [3, 22399.0] + - - [173280, 128, 1, 64, 173280, 173280, 173280, 64] + - [14, 19013.0] + - - [231040, 128, 1, 64, 231040, 231040, 231040, 64] + - [3, 19736.0] + - - [25992, 128, 1, 64, 25992, 25992, 25992, 64] + - [7, 15363.0] + - - [2852, 256, 2, 1024, 2852, 2852, 2852, 1024] + - [10, 18275.0] + - - [3220, 256, 2, 1024, 3220, 3220, 3220, 1024] + - [3, 17659.0] + - - [850, 2048, 2, 512, 850, 850, 850, 512] + - [3, 18988.0] + - - [805, 2048, 2, 512, 805, 805, 805, 512] + - [16, 17842.0] + - - [3036, 256, 2, 1024, 3036, 3036, 3036, 1024] + - [3, 19598.0] + - - [713, 2048, 2, 512, 713, 713, 713, 512] + - [3, 17972.0] + - - [850, 2048, 1, 512, 850, 850, 850, 512] + - [3, 16779.0] + - - [660, 2048, 2, 512, 660, 660, 660, 512] + - [3, 16652.0] + - - [726, 2048, 2, 512, 726, 726, 726, 512] + - [3, 18265.0] + - - [3500, 256, 2, 1024, 3500, 3500, 3500, 1024] + - [3, 18339.0] + - - [3700, 256, 2, 1024, 3700, 3700, 3700, 1024] + - [3, 19287.0] + - - [748, 2048, 2, 512, 748, 748, 748, 512] + - [16, 18863.0] + - - [3036, 1024, 2, 256, 3036, 3036, 3036, 256] + - [33, 19778.0] + - - [2852, 1024, 2, 256, 2852, 2852, 2852, 256] + - [10, 19706.0] + - - [950, 2048, 1, 512, 950, 950, 950, 512] + - [3, 17205.0] + - - [3700, 1024, 2, 256, 3700, 3700, 3700, 256] + - [10, 20554.0] + - - [3500, 1024, 2, 256, 3500, 3500, 3500, 256] + - [3, 20214.0] + - - [3220, 1024, 2, 256, 3220, 3220, 3220, 256] + - [3, 20155.0] + - - [950, 2048, 2, 512, 950, 950, 950, 512] + - [16, 19521.0] + - - [1610, 2048, 1, 1024, 1610, 1610, 1610, 1024] + - [3, 19955.0] + - - [1700, 2048, 1, 1024, 1700, 1700, 1700, 1024] + - [3, 19290.0] + - - [1900, 2048, 1, 1024, 1900, 1900, 1900, 1024] + - [16, 21390.0] + - - [1444, 256, 120, 128, 1444, 1444, 1444, 128] + - [16, 19413.0] + - - [1444, 256, 139, 128, 1444, 1444, 1444, 128] + - [16, 19514.0] + - - [1444, 256, 160, 128, 1444, 1444, 1444, 128] + - [16, 19598.0] + - - [1444, 256, 18, 128, 1444, 1444, 1444, 128] + - [8, 17818.0] + - - [1444, 256, 19, 128, 1444, 1444, 1444, 128] + - [28, 18400.0] + - - [1444, 256, 120, 256, 1444, 1444, 1444, 256] + - [3, 20557.0] + - - [1444, 256, 139, 256, 1444, 1444, 1444, 256] + - [3, 20652.0] + - - [1444, 256, 160, 256, 1444, 1444, 1444, 256] + - [3, 20633.0] + - - [1444, 256, 18, 256, 1444, 1444, 1444, 256] + - [22, 19309.0] + - - [1444, 256, 19, 256, 1444, 1444, 1444, 256] + - [3, 19707.0] + - - [361, 256, 120, 512, 361, 361, 361, 512] + - [10, 19355.0] + - - [361, 256, 139, 512, 361, 361, 361, 512] + - [33, 19580.0] + - - [361, 256, 160, 512, 361, 361, 361, 512] + - [33, 19394.0] + - - [361, 256, 18, 512, 361, 361, 361, 512] + - [10, 16674.0] + - - [361, 256, 19, 512, 361, 361, 361, 512] + - [33, 17831.0] + - - [200716, 128, 1, 64, 200716, 200716, 200716, 64] + - [34, 18225.0] + - - [27436, 128, 1, 64, 27436, 27436, 27436, 64] + - [21, 13507.0] + - - [1024, 1024, 160, 96, 1024, 1024, 1024, 96] + - [3, 21199.0] + - - [1920, 16384, 1, 25216, 1920, 1920, 1920, 25216] + - [3, 22646.0] + - - [3840, 16384, 1, 1920, 3840, 3840, 3840, 1920] + - [14, 23300.0] + - - [1920, 16384, 1, 3840, 1920, 1920, 1920, 3840] + - [16, 22845.0] + - - [960, 16384, 1, 1920, 960, 960, 960, 1920] + - [20, 21153.0] + - - [1920, 16384, 1, 2880, 1920, 1920, 1920, 2880] + - [14, 23546.0] + - - [1024, 1024, 40, 96, 1024, 1024, 1024, 96] + - [16, 20581.0] + - - [1920, 4096, 1, 25216, 1920, 1920, 1920, 25216] + - [10, 22410.0] + - - [3840, 4096, 1, 1920, 3840, 3840, 3840, 1920] + - [26, 23257.0] + - - [1920, 4096, 1, 3840, 1920, 1920, 1920, 3840] + - [3, 22640.0] + - - [960, 4096, 1, 1920, 960, 960, 960, 1920] + - [16, 20057.0] + - - [1920, 4096, 1, 2880, 1920, 1920, 1920, 2880] + - [14, 22948.0] + - - [1024, 1024, 80, 96, 1024, 1024, 1024, 96] + - [10, 20741.0] + - - [1920, 8192, 1, 25216, 1920, 1920, 1920, 25216] + - [3, 22504.0] + - - [3840, 8192, 1, 1920, 3840, 3840, 3840, 1920] + - [14, 23436.0] + - - [1920, 8192, 1, 3840, 1920, 1920, 1920, 3840] + - [3, 22794.0] + - - [960, 8192, 1, 1920, 960, 960, 960, 1920] + - [14, 21180.0] + - - [1920, 8192, 1, 2880, 1920, 1920, 1920, 2880] + - [14, 23525.0] + - - [1024, 1024, 96, 96, 1024, 1024, 1024, 96] + - [3, 21023.0] + - - [2304, 16384, 1, 12672, 2304, 2304, 2304, 12672] + - [3, 22809.0] + - - [2304, 16384, 1, 2304, 2304, 2304, 2304, 2304] + - [16, 22984.0] + - - [576, 16384, 1, 2304, 576, 576, 576, 2304] + - [3, 20359.0] + - - [2304, 16384, 1, 1728, 2304, 2304, 2304, 1728] + - [14, 23419.0] + - - [1024, 1024, 24, 96, 1024, 1024, 1024, 96] + - [10, 20268.0] + - - [2304, 4096, 1, 12672, 2304, 2304, 2304, 12672] + - [1, 22749.0] + - - [2304, 4096, 1, 2304, 2304, 2304, 2304, 2304] + - [16, 22626.0] + - - [576, 4096, 1, 2304, 576, 576, 576, 2304] + - [5, 19649.0] + - - [2304, 4096, 1, 1728, 2304, 2304, 2304, 1728] + - [26, 23152.0] + - - [1024, 1024, 48, 96, 1024, 1024, 1024, 96] + - [10, 20950.0] + - - [2304, 8192, 1, 12672, 2304, 2304, 2304, 12672] + - [1, 23009.0] + - - [2304, 8192, 1, 2304, 2304, 2304, 2304, 2304] + - [3, 22819.0] + - - [576, 8192, 1, 2304, 576, 576, 576, 2304] + - [5, 20156.0] + - - [2304, 8192, 1, 1728, 2304, 2304, 2304, 1728] + - [26, 23402.0] + - - [1024, 1024, 16, 96, 1024, 1024, 1024, 96] + - [3, 20108.0] + - - [3072, 4096, 1, 6400, 3072, 3072, 3072, 6400] + - [16, 22699.0] + - - [1536, 4096, 1, 3072, 1536, 1536, 1536, 3072] + - [3, 21821.0] + - - [3072, 4096, 1, 1536, 3072, 3072, 3072, 1536] + - [16, 22557.0] + - - [384, 4096, 1, 3072, 384, 384, 384, 3072] + - [10, 20457.0] + - - [3072, 4096, 1, 1152, 3072, 3072, 3072, 1152] + - [14, 22890.0] + - - [1024, 1024, 32, 96, 1024, 1024, 1024, 96] + - [3, 19818.0] + - - [3072, 8192, 1, 6400, 3072, 3072, 3072, 6400] + - [33, 22975.0] + - - [1536, 8192, 1, 3072, 1536, 1536, 1536, 3072] + - [22, 22600.0] + - - [3072, 8192, 1, 1536, 3072, 3072, 3072, 1536] + - [3, 22914.0] + - - [384, 8192, 1, 3072, 384, 384, 384, 3072] + - [3, 21142.0] + - - [3072, 8192, 1, 1152, 3072, 3072, 3072, 1152] + - [14, 23385.0] + - - [2048, 4096, 1, 2048, 2048, 2048, 2048, 2048] + - [3, 22427.0] + - - [2048, 4096, 1, 4096, 2048, 2048, 2048, 4096] + - [3, 22534.0] + - - [4096, 4096, 1, 2048, 4096, 4096, 4096, 2048] + - [3, 22709.0] + - - [1024, 2283, 1, 29000, 1024, 1024, 1024, 29000] + - [18, 20078.0] + - - [1024, 2296, 1, 29000, 1024, 1024, 1024, 29000] + - [30, 20188.0] + - - [1024, 2306, 1, 29000, 1024, 1024, 1024, 29000] + - [18, 20154.0] + - - [1024, 2309, 1, 29000, 1024, 1024, 1024, 29000] + - [30, 20174.0] + - - [1024, 2318, 1, 29000, 1024, 1024, 1024, 29000] + - [18, 20251.0] + - - [1024, 2320, 1, 29000, 1024, 1024, 1024, 29000] + - [18, 20289.0] + - - [1024, 2324, 1, 29000, 1024, 1024, 1024, 29000] + - [5, 20308.0] + - - [1024, 2325, 1, 29000, 1024, 1024, 1024, 29000] + - [18, 20316.0] + - - [1024, 2329, 1, 29000, 1024, 1024, 1024, 29000] + - [18, 20352.0] + - - [1024, 2338, 1, 29000, 1024, 1024, 1024, 29000] + - [5, 20437.0] + - - [1024, 2345, 1, 29000, 1024, 1024, 1024, 29000] + - [5, 20485.0] + - - [1024, 2350, 1, 29000, 1024, 1024, 1024, 29000] + - [5, 20533.0] + - - [1024, 2362, 1, 29000, 1024, 1024, 1024, 29000] + - [5, 20641.0] + - - [1024, 2366, 1, 29000, 1024, 1024, 1024, 29000] + - [10, 20699.0] + - - [1024, 2368, 1, 29000, 1024, 1024, 1024, 29000] + - [28, 20722.0] + - - [1024, 2374, 1, 29000, 1024, 1024, 1024, 29000] + - [18, 20742.0] + - - [1024, 2390, 1, 29000, 1024, 1024, 1024, 29000] + - [16, 20892.0] + - - [512, 512, 320, 64, 512, 512, 512, 64] + - [0, 17495.0] + - - [512, 512, 80, 64, 512, 512, 512, 64] + - [16, 18883.0] + - - [2560, 1024, 1, 2560, 2560, 2560, 2560, 2560] + - [5, 21904.0] + - - [2560, 1024, 1, 4096, 2560, 2560, 2560, 4096] + - [3, 22076.0] + - - [4096, 1024, 1, 2560, 4096, 4096, 4096, 2560] + - [22, 21953.0] + - - [1024, 1024, 512, 64, 1024, 1024, 1024, 64] + - [6, 17254.0] + - - [1024, 32768, 1, 3072, 1024, 1024, 1024, 3072] + - [10, 23020.0] + - - [1024, 32768, 1, 4096, 1024, 1024, 1024, 4096] + - [22, 23023.0] + - - [1024, 32768, 1, 50304, 1024, 1024, 1024, 50304] + - [10, 22869.0] + - - [4096, 32768, 1, 1024, 4096, 4096, 4096, 1024] + - [3, 22963.0] + - - [1024, 1024, 24, 128, 1024, 1024, 1024, 128] + - [3, 20882.0] + - - [128, 1024, 24, 1024, 128, 128, 128, 1024] + - [10, 20779.0] + - - [1024, 780, 1, 30522, 1024, 1024, 1024, 30522] + - [38, 17819.0] + - - [1024, 308, 1, 30522, 1024, 1024, 1024, 30522] + - [37, 16880.0] + - - [1024, 800, 1, 30522, 1024, 1024, 1024, 30522] + - [38, 18261.0] + - - [1024, 820, 1, 30522, 1024, 1024, 1024, 30522] + - [38, 18784.0] + - - [1024, 385, 1, 30522, 1024, 1024, 1024, 30522] + - [38, 14863.0] + - - [1024, 462, 1, 30522, 1024, 1024, 1024, 30522] + - [45, 17591.0] + - - [1024, 640, 1, 30528, 1024, 1024, 1024, 30528] + - [38, 21356.0] + - - [2048, 199, 1, 29000, 2048, 2048, 2048, 29000] + - [45, 15204.0] + - - [2048, 221, 1, 29000, 2048, 2048, 2048, 29000] + - [39, 16804.0] + - - [2048, 224, 1, 29000, 2048, 2048, 2048, 29000] + - [43, 17029.0] + - - [2048, 229, 1, 29000, 2048, 2048, 2048, 29000] + - [45, 17395.0] + - - [2048, 234, 1, 29000, 2048, 2048, 2048, 29000] + - [41, 17801.0] + - - [2048, 242, 1, 29000, 2048, 2048, 2048, 29000] + - [45, 18374.0] + - - [2048, 246, 1, 29000, 2048, 2048, 2048, 29000] + - [43, 18654.0] + - - [2048, 247, 1, 29000, 2048, 2048, 2048, 29000] + - [41, 18766.0] + - - [2048, 256, 1, 29000, 2048, 2048, 2048, 29000] + - [38, 19448.0] + - - [2048, 262, 1, 29000, 2048, 2048, 2048, 29000] + - [37, 14710.0] + - - [2048, 264, 1, 29000, 2048, 2048, 2048, 29000] + - [37, 14835.0] + - - [2048, 265, 1, 29000, 2048, 2048, 2048, 29000] + - [42, 14894.0] + - - [2048, 274, 1, 29000, 2048, 2048, 2048, 29000] + - [37, 15398.0] + - - [2048, 277, 1, 29000, 2048, 2048, 2048, 29000] + - [42, 15547.0] + - - [2048, 279, 1, 29000, 2048, 2048, 2048, 29000] + - [44, 15663.0] + - - [2048, 288, 1, 29000, 2048, 2048, 2048, 29000] + - [42, 16111.0] + - - [2048, 296, 1, 29000, 2048, 2048, 2048, 29000] + - [42, 16579.0] + - - [2048, 315, 1, 29000, 2048, 2048, 2048, 29000] + - [42, 17619.0] + - - [2048, 335, 1, 29000, 2048, 2048, 2048, 29000] + - [41, 18273.0] + - - [1024, 561, 1, 29000, 1024, 1024, 1024, 29000] + - [38, 18922.0] + - - [1024, 574, 1, 29000, 1024, 1024, 1024, 29000] + - [38, 19294.0] + - - [1024, 600, 1, 29000, 1024, 1024, 1024, 29000] + - [38, 20078.0] + - - [1024, 608, 1, 29000, 1024, 1024, 1024, 29000] + - [38, 20400.0] + - - [1024, 615, 1, 29000, 1024, 1024, 1024, 29000] + - [38, 20601.0] + - - [1024, 622, 1, 29000, 1024, 1024, 1024, 29000] + - [38, 20836.0] + - - [1024, 625, 1, 29000, 1024, 1024, 1024, 29000] + - [38, 20903.0] + - - [1024, 626, 1, 29000, 1024, 1024, 1024, 29000] + - [38, 20977.0] + - - [1024, 628, 1, 29000, 1024, 1024, 1024, 29000] + - [38, 21006.0] + - - [1024, 636, 1, 29000, 1024, 1024, 1024, 29000] + - [38, 21257.0] + - - [1024, 651, 1, 29000, 1024, 1024, 1024, 29000] + - [38, 17847.0] + - - [1024, 658, 1, 29000, 1024, 1024, 1024, 29000] + - [38, 18043.0] + - - [1024, 669, 1, 29000, 1024, 1024, 1024, 29000] + - [38, 18237.0] + - - [1024, 670, 1, 29000, 1024, 1024, 1024, 29000] + - [38, 18281.0] + - - [1024, 672, 1, 29000, 1024, 1024, 1024, 29000] + - [38, 18351.0] + - - [1024, 684, 1, 29000, 1024, 1024, 1024, 29000] + - [40, 18608.0] + - - [1024, 716, 1, 29000, 1024, 1024, 1024, 29000] + - [38, 19470.0] + - - [1024, 730, 1, 29000, 1024, 1024, 1024, 29000] + - [38, 19975.0] + - - [1024, 1024, 1, 3328, 1024, 1024, 1024, 3328] + - [60, 16339.0] + - - [128, 6784, 1, 3328, 128, 128, 128, 3328] + - [67, 16963.0] + - - [256, 4288, 1, 3328, 256, 256, 256, 3328] + - [60, 16891.0] + - - [704, 1856, 1, 3328, 704, 704, 704, 3328] + - [72, 15671.0] + - - [448, 1024, 1, 1280, 448, 448, 448, 1280] + - [59, 12957.0] + - - [1024, 704, 1, 256, 1024, 1024, 1024, 256] + - [60, 11937.0] + - - [256, 1856, 1, 1280, 256, 256, 256, 1280] + - [78, 14848.0] + - - [256, 2944, 1, 3328, 256, 256, 256, 3328] + - [49, 16013.0] + - - [128, 3584, 1, 1280, 128, 128, 128, 1280] + - [54, 14364.0] + - - [4288, 256, 1, 256, 4288, 4288, 4288, 256] + - [47, 13083.0] + - - [5888, 64, 1, 3328, 5888, 5888, 5888, 3328] + - [49, 13207.0] + - - [2944, 256, 1, 3328, 2944, 2944, 2944, 3328] + - [60, 16015.0] + - - [1408, 448, 1, 1280, 1408, 1408, 1408, 1280] + - [47, 15641.0] + - - [1408, 256, 1, 1280, 1408, 1408, 1408, 1280] + - [49, 11500.0] + - - [6784, 64, 1, 256, 6784, 6784, 6784, 256] + - [72, 9403.0] + - - [2368, 128, 1, 3328, 2368, 2368, 2368, 3328] + - [50, 14423.0] + - - [2944, 128, 1, 256, 2944, 2944, 2944, 256] + - [47, 8288.0] + - - [448, 1408, 1, 256, 448, 448, 448, 256] + - [59, 10527.0] + - - [64, 5056, 1, 3328, 64, 64, 64, 3328] + - [48, 12965.0] + - - [256, 3584, 1, 3328, 256, 256, 256, 3328] + - [50, 17634.0] + - - [256, 1408, 1, 256, 256, 256, 256, 256] + - [62, 8343.0] + - - [5056, 64, 1, 1280, 5056, 5056, 5056, 1280] + - [71, 13880.0] + - - [2368, 128, 1, 256, 2368, 2368, 2368, 256] + - [48, 7320.0] + - - [4288, 128, 1, 1280, 4288, 4288, 4288, 1280] + - [73, 13995.0] + - - [5888, 64, 1, 256, 5888, 5888, 5888, 256] + - [65, 8189.0] + - - [1856, 256, 1, 1280, 1856, 1856, 1856, 1280] + - [47, 14892.0] + - - [64, 5888, 1, 3328, 64, 64, 64, 3328] + - [53, 11296.0] + - - [1024, 704, 1, 1280, 1024, 1024, 1024, 1280] + - [60, 14717.0] + - - [256, 1408, 1, 3328, 256, 256, 256, 3328] + - [49, 12283.0] + - - [6784, 128, 1, 3328, 6784, 6784, 6784, 3328] + - [50, 16607.0] + - - [704, 704, 1, 3328, 704, 704, 704, 3328] + - [71, 14062.0] + - - [3584, 256, 1, 3328, 3584, 3584, 3584, 3328] + - [61, 17691.0] + - - [128, 3584, 1, 3328, 128, 128, 128, 3328] + - [72, 15422.0] + - - [128, 2944, 1, 1280, 128, 128, 128, 1280] + - [54, 11945.0] + - - [448, 1856, 1, 1280, 448, 448, 448, 1280] + - [50, 15398.0] + - - [3584, 128, 1, 256, 3584, 3584, 3584, 256] + - [47, 9886.0] + - - [448, 1408, 1, 3328, 448, 448, 448, 3328] + - [74, 15452.0] + - - [704, 1024, 1, 256, 704, 704, 704, 256] + - [49, 11171.0] + - - [256, 3584, 1, 256, 256, 256, 256, 256] + - [61, 13656.0] + - - [1408, 704, 1, 256, 1408, 1408, 1408, 256] + - [49, 12500.0] + - - [448, 2944, 1, 3328, 448, 448, 448, 3328] + - [61, 15524.0] + - - [64, 5888, 1, 256, 64, 64, 64, 256] + - [48, 7501.0] + - - [448, 2368, 1, 1280, 448, 448, 448, 1280] + - [61, 14757.0] + - - [704, 704, 1, 256, 704, 704, 704, 256] + - [48, 9715.0] + - - [128, 4288, 1, 3328, 128, 128, 128, 3328] + - [53, 15475.0] + - - [256, 2368, 1, 256, 256, 256, 256, 256] + - [58, 11411.0] + - - [1024, 448, 1, 3328, 1024, 1024, 1024, 3328] + - [49, 15462.0] + - - [1856, 704, 1, 1280, 1856, 1856, 1856, 1280] + - [54, 15260.0] + - - [1024, 1024, 1, 1280, 1024, 1024, 1024, 1280] + - [49, 15640.0] + - - [256, 2944, 1, 256, 256, 256, 256, 256] + - [70, 12120.0] + - - [128, 6784, 1, 1280, 128, 128, 128, 1280] + - [55, 15906.0] + - - [1408, 704, 1, 3328, 1408, 1408, 1408, 3328] + - [49, 15325.0] + - - [128, 5888, 1, 1280, 128, 128, 128, 1280] + - [78, 15274.0] + - - [704, 1408, 1, 3328, 704, 704, 704, 3328] + - [49, 15230.0] + - - [448, 704, 1, 1280, 448, 448, 448, 1280] + - [65, 11867.0] + - - [6784, 128, 1, 256, 6784, 6784, 6784, 256] + - [73, 12805.0] + - - [704, 448, 1, 256, 704, 704, 704, 256] + - [71, 7675.0] + - - [256, 1856, 1, 3328, 256, 256, 256, 3328] + - [60, 15992.0] + - - [1024, 704, 1, 3328, 1024, 1024, 1024, 3328] + - [49, 15316.0] + - - [128, 4288, 1, 256, 128, 128, 128, 256] + - [54, 10080.0] + - - [64, 6784, 1, 3328, 64, 64, 64, 3328] + - [65, 12208.0] + - - [2944, 256, 1, 1280, 2944, 2944, 2944, 1280] + - [49, 15391.0] + - - [1856, 704, 1, 256, 1856, 1856, 1856, 256] + - [71, 13664.0] + - - [704, 1856, 1, 256, 704, 704, 704, 256] + - [65, 16051.0] + - - [2944, 448, 1, 256, 2944, 2944, 2944, 256] + - [72, 13759.0] + - - [2368, 128, 1, 1280, 2368, 2368, 2368, 1280] + - [49, 12356.0] + - - [64, 6784, 1, 256, 64, 64, 64, 256] + - [65, 8307.0] + - - [64, 5056, 1, 1280, 64, 64, 64, 1280] + - [53, 11820.0] + - - [704, 448, 1, 3328, 704, 704, 704, 3328] + - [48, 13147.0] + - - [2368, 256, 1, 1280, 2368, 2368, 2368, 1280] + - [61, 15408.0] + - - [2368, 448, 1, 1280, 2368, 2368, 2368, 1280] + - [49, 15547.0] + - - [128, 3584, 1, 256, 128, 128, 128, 256] + - [78, 9754.0] + - - [1856, 448, 1, 3328, 1856, 1856, 1856, 3328] + - [55, 15911.0] + - - [128, 5056, 1, 256, 128, 128, 128, 256] + - [78, 11783.0] + - - [4288, 256, 1, 1280, 4288, 4288, 4288, 1280] + - [49, 16132.0] + - - [4288, 128, 1, 3328, 4288, 4288, 4288, 3328] + - [61, 15179.0] + - - [448, 2368, 1, 3328, 448, 448, 448, 3328] + - [50, 15447.0] + - - [256, 1408, 1, 1280, 256, 256, 256, 1280] + - [49, 11409.0] + - - [128, 2368, 1, 256, 128, 128, 128, 256] + - [78, 7652.0] + - - [6784, 64, 1, 3328, 6784, 6784, 6784, 3328] + - [72, 14622.0] + - - [128, 2944, 1, 3328, 128, 128, 128, 3328] + - [77, 13821.0] + - - [2944, 448, 1, 3328, 2944, 2944, 2944, 3328] + - [72, 15986.0] + - - [5888, 128, 1, 256, 5888, 5888, 5888, 256] + - [49, 12180.0] + - - [5056, 64, 1, 256, 5056, 5056, 5056, 256] + - [70, 8284.0] + - - [128, 5056, 1, 3328, 128, 128, 128, 3328] + - [79, 17785.0] + - - [256, 4288, 1, 1280, 256, 256, 256, 1280] + - [60, 16289.0] + - - [4288, 128, 1, 256, 4288, 4288, 4288, 256] + - [47, 9520.0] + - - [3584, 256, 1, 256, 3584, 3584, 3584, 256] + - [79, 15372.0] + - - [128, 2944, 1, 256, 128, 128, 128, 256] + - [78, 10155.0] + - - [3584, 128, 1, 3328, 3584, 3584, 3584, 3328] + - [49, 15782.0] + - - [5888, 128, 1, 3328, 5888, 5888, 5888, 3328] + - [72, 15945.0] + - - [1408, 704, 1, 1280, 1408, 1408, 1408, 1280] + - [59, 17244.0] + - - [448, 1408, 1, 1280, 448, 448, 448, 1280] + - [71, 13061.0] + - - [704, 1408, 1, 1280, 704, 704, 704, 1280] + - [49, 14692.0] + - - [448, 2944, 1, 256, 448, 448, 448, 256] + - [61, 12552.0] + - - [448, 2368, 1, 256, 448, 448, 448, 256] + - [76, 12006.0] + - - [64, 6784, 1, 1280, 64, 64, 64, 1280] + - [53, 12118.0] + - - [128, 2368, 1, 3328, 128, 128, 128, 3328] + - [49, 14573.0] + - - [5056, 64, 1, 3328, 5056, 5056, 5056, 3328] + - [54, 15074.0] + - - [5056, 128, 1, 3328, 5056, 5056, 5056, 3328] + - [61, 17912.0] + - - [448, 704, 1, 256, 448, 448, 448, 256] + - [54, 7793.0] + - - [1856, 256, 1, 3328, 1856, 1856, 1856, 3328] + - [47, 16089.0] + - - [2944, 128, 1, 3328, 2944, 2944, 2944, 3328] + - [60, 13137.0] + - - [1024, 1024, 1, 256, 1024, 1024, 1024, 256] + - [49, 14864.0] + - - [704, 1024, 1, 1280, 704, 704, 704, 1280] + - [54, 15009.0] + - - [256, 4288, 1, 256, 256, 256, 256, 256] + - [72, 13615.0] + - - [2368, 256, 1, 256, 2368, 2368, 2368, 256] + - [50, 10837.0] + - - [256, 2368, 1, 3328, 256, 256, 256, 3328] + - [61, 16804.0] + - - [704, 448, 1, 1280, 704, 704, 704, 1280] + - [71, 11846.0] + - - [256, 1856, 1, 256, 256, 256, 256, 256] + - [49, 10343.0] + - - [64, 5056, 1, 256, 64, 64, 64, 256] + - [77, 7544.0] + - - [1408, 256, 1, 3328, 1408, 1408, 1408, 3328] + - [49, 12248.0] + - - [2368, 448, 1, 256, 2368, 2368, 2368, 256] + - [50, 13007.0] + - - [4288, 256, 1, 3328, 4288, 4288, 4288, 3328] + - [60, 16727.0] + - - [2944, 256, 1, 256, 2944, 2944, 2944, 256] + - [78, 13981.0] + - - [6784, 64, 1, 1280, 6784, 6784, 6784, 1280] + - [49, 13515.0] + - - [704, 1856, 1, 1280, 704, 704, 704, 1280] + - [72, 15246.0] + - - [448, 1024, 1, 3328, 448, 448, 448, 3328] + - [48, 13948.0] + - - [2944, 448, 1, 1280, 2944, 2944, 2944, 1280] + - [72, 15537.0] + - - [448, 1024, 1, 256, 448, 448, 448, 256] + - [65, 9233.0] + - - [1024, 448, 1, 1280, 1024, 1024, 1024, 1280] + - [58, 14399.0] + - - [256, 2368, 1, 1280, 256, 256, 256, 1280] + - [67, 15323.0] + - - [128, 5056, 1, 1280, 128, 128, 128, 1280] + - [55, 16224.0] + - - [1408, 256, 1, 256, 1408, 1408, 1408, 256] + - [47, 7833.0] + - - [128, 5888, 1, 3328, 128, 128, 128, 3328] + - [78, 15925.0] + - - [2368, 448, 1, 3328, 2368, 2368, 2368, 3328] + - [72, 16359.0] + - - [3584, 128, 1, 1280, 3584, 3584, 3584, 1280] + - [72, 14280.0] + - - [1408, 448, 1, 256, 1408, 1408, 1408, 256] + - [52, 11702.0] + - - [2368, 256, 1, 3328, 2368, 2368, 2368, 3328] + - [50, 16729.0] + - - [5888, 128, 1, 1280, 5888, 5888, 5888, 1280] + - [54, 15352.0] + - - [256, 3584, 1, 1280, 256, 256, 256, 1280] + - [61, 16917.0] + - - [128, 5888, 1, 256, 128, 128, 128, 256] + - [54, 11954.0] + - - [1408, 448, 1, 3328, 1408, 1408, 1408, 3328] + - [60, 16441.0] + - - [64, 5888, 1, 1280, 64, 64, 64, 1280] + - [77, 12053.0] + - - [704, 704, 1, 1280, 704, 704, 704, 1280] + - [48, 13255.0] + - - [128, 2368, 1, 1280, 128, 128, 128, 1280] + - [66, 12796.0] + - - [3584, 256, 1, 1280, 3584, 3584, 3584, 1280] + - [67, 17492.0] + - - [5888, 64, 1, 1280, 5888, 5888, 5888, 1280] + - [53, 12693.0] + - - [5056, 128, 1, 1280, 5056, 5056, 5056, 1280] + - [79, 16940.0] + - - [448, 1856, 1, 3328, 448, 448, 448, 3328] + - [50, 15807.0] + - - [1024, 448, 1, 256, 1024, 1024, 1024, 256] + - [72, 11887.0] + - - [2944, 128, 1, 1280, 2944, 2944, 2944, 1280] + - [59, 12754.0] + - - [256, 2944, 1, 1280, 256, 256, 256, 1280] + - [76, 15509.0] + - - [704, 1024, 1, 3328, 704, 704, 704, 3328] + - [49, 15227.0] + - - [1856, 448, 1, 1280, 1856, 1856, 1856, 1280] + - [50, 14948.0] + - - [128, 6784, 1, 256, 128, 128, 128, 256] + - [55, 12489.0] + - - [704, 1408, 1, 256, 704, 704, 704, 256] + - [54, 12049.0] + - - [448, 2944, 1, 1280, 448, 448, 448, 1280] + - [61, 15009.0] + - - [1856, 256, 1, 256, 1856, 1856, 1856, 256] + - [52, 9342.0] + - - [5056, 128, 1, 256, 5056, 5056, 5056, 256] + - [65, 12382.0] + - - [6784, 128, 1, 1280, 6784, 6784, 6784, 1280] + - [55, 16674.0] + - - [1856, 448, 1, 256, 1856, 1856, 1856, 256] + - [71, 11481.0] + - - [128, 4288, 1, 1280, 128, 128, 128, 1280] + - [55, 13917.0] + - - [448, 704, 1, 3328, 448, 448, 448, 3328] + - [53, 13003.0] + - - [448, 1856, 1, 256, 448, 448, 448, 256] + - [76, 11800.0] + - - [1856, 704, 1, 3328, 1856, 1856, 1856, 3328] + - [60, 15677.0] + - - [64, 193600, 1, 64, 64, 64, 64, 64] + - [75, 12445.0] + - - [1024, 700, 1, 512, 1024, 1024, 1024, 512] + - [73, 12734.0] + - - [2560, 128, 1, 2560, 2560, 2560, 2560, 2560] + - [50, 15990.0] + - - [64, 193600, 1, 256, 64, 64, 64, 256] + - [68, 12422.0] + - - [4096, 128, 1, 4096, 4096, 4096, 4096, 4096] + - [61, 15091.0] + - - [512, 1500, 1, 2816, 512, 512, 512, 2816] + - [72, 16431.0] + - - [3072, 128, 1, 1024, 3072, 3072, 3072, 1024] + - [49, 13065.0] + - - [7680, 64, 1, 2560, 7680, 7680, 7680, 2560] + - [49, 16836.0] + - - [7680, 128, 1, 2560, 7680, 7680, 7680, 2560] + - [50, 18686.0] + - - [512, 1500, 1, 2560, 512, 512, 512, 2560] + - [72, 15974.0] + - - [1024, 1024, 1, 1024, 1024, 1024, 1024, 1024] + - [66, 15823.0] + - - [512, 1500, 1, 2048, 512, 512, 512, 2048] + - [78, 16106.0] + - - [512, 1500, 1, 1536, 512, 512, 512, 1536] + - [72, 15505.0] + - - [3136, 64, 128, 64, 3136, 3136, 3136, 64] + - [54, 14936.0] + - - [3136, 64, 64, 256, 3136, 3136, 3136, 256] + - [49, 17061.0] + - - [3136, 64, 128, 256, 3136, 3136, 3136, 256] + - [60, 16371.0] + - - [3136, 64, 256, 64, 3136, 3136, 3136, 64] + - [46, 11979.0] + - - [3136, 64, 64, 64, 3136, 3136, 3136, 64] + - [58, 16832.0] + - - [3136, 64, 256, 256, 3136, 3136, 3136, 256] + - [60, 16182.0] + - - [64, 128, 512, 128, 64, 64, 64, 128] + - [48, 14294.0] + - - [64, 512, 64, 512, 64, 64, 64, 512] + - [56, 12116.0] + - - [512, 1600, 1, 32, 512, 512, 512, 32] + - [57, 7123.0] + - - [512, 1600, 1, 512, 512, 512, 512, 512] + - [60, 16422.0] + - - [560, 1600, 1, 1024, 560, 560, 560, 1024] + - [54, 13601.0] + - - [1024, 512, 1, 1, 1024, 1024, 1024, 1] + - [63, 85.0] + - - [1024, 512, 1, 64, 1024, 1024, 1024, 64] + - [57, 5023.0] + - - [1024, 512, 1, 1024, 1024, 1024, 1024, 1024] + - [61, 13530.0] + - - [1024, 960, 1, 64, 1024, 1024, 1024, 64] + - [49, 9679.0] + - - [1024, 960, 1, 1024, 1024, 1024, 1024, 1024] + - [66, 17108.0] + - - [1600, 512, 1, 1024, 1600, 1600, 1600, 1024] + - [50, 14681.0] + - - [2048, 512, 1, 1, 2048, 2048, 2048, 1] + - [81, 244.0] + - - [2048, 512, 1, 2048, 2048, 2048, 2048, 2048] + - [49, 15905.0] + - - [64, 192, 64, 1280, 64, 64, 64, 1280] + - [53, 13193.0] + - - [64, 320, 64, 1280, 64, 64, 64, 1280] + - [56, 11607.0] + - - [64, 384, 64, 1280, 64, 64, 64, 1280] + - [80, 11313.0] + - - [64, 448, 64, 1280, 64, 64, 64, 1280] + - [80, 8703.0] + - - [64, 192, 64, 2048, 64, 64, 64, 2048] + - [68, 12550.0] + - - [64, 320, 64, 2048, 64, 64, 64, 2048] + - [68, 11312.0] + - - [64, 384, 64, 2048, 64, 64, 64, 2048] + - [68, 11556.0] + - - [64, 448, 64, 2048, 64, 64, 64, 2048] + - [80, 11417.0] + - - [1225, 64, 64, 192, 1225, 1225, 1225, 192] + - [69, 17033.0] + - - [1225, 64, 64, 256, 1225, 1225, 1225, 256] + - [72, 16968.0] + - - [1225, 64, 64, 288, 1225, 1225, 1225, 288] + - [71, 18163.0] + - - [5329, 80, 64, 64, 5329, 5329, 5329, 64] + - [46, 10338.0] + - - [64, 192, 32, 1280, 64, 64, 64, 1280] + - [65, 11367.0] + - - [64, 320, 32, 1280, 64, 64, 64, 1280] + - [77, 13847.0] + - - [64, 384, 32, 1280, 64, 64, 64, 1280] + - [53, 13006.0] + - - [64, 448, 32, 1280, 64, 64, 64, 1280] + - [77, 11789.0] + - - [64, 192, 32, 2048, 64, 64, 64, 2048] + - [65, 11371.0] + - - [64, 320, 32, 2048, 64, 64, 64, 2048] + - [77, 13203.0] + - - [64, 384, 32, 2048, 64, 64, 64, 2048] + - [77, 12468.0] + - - [64, 448, 32, 2048, 64, 64, 64, 2048] + - [53, 11791.0] + - - [1225, 64, 32, 192, 1225, 1225, 1225, 192] + - [72, 13395.0] + - - [1225, 64, 32, 256, 1225, 1225, 1225, 256] + - [78, 15147.0] + - - [1225, 64, 32, 288, 1225, 1225, 1225, 288] + - [78, 15485.0] + - - [5329, 80, 32, 64, 5329, 5329, 5329, 64] + - [78, 9521.0] + - - [289, 128, 32, 768, 289, 289, 289, 768] + - [79, 12973.0] + - - [289, 160, 32, 768, 289, 289, 289, 768] + - [71, 10585.0] + - - [289, 192, 32, 768, 289, 289, 289, 768] + - [48, 12606.0] + - - [3136, 64, 32, 64, 3136, 3136, 3136, 64] + - [64, 14096.0] + - - [3136, 64, 32, 256, 3136, 3136, 3136, 256] + - [72, 17462.0] + - - [196, 256, 32, 1024, 196, 196, 196, 1024] + - [66, 13361.0] + - - [960, 1024, 1, 1024, 960, 960, 960, 1024] + - [49, 14418.0] + - - [64, 512, 16, 512, 64, 64, 64, 512] + - [77, 9986.0] + - - [64, 512, 128, 512, 64, 64, 64, 512] + - [56, 11901.0] + - - [1024, 512, 1, 2, 1024, 1024, 1024, 2] + - [70, 350.0] + - - [1024, 512, 1, 4096, 1024, 1024, 1024, 4096] + - [53, 15076.0] + - - [1024, 616, 1, 1024, 1024, 1024, 1024, 1024] + - [79, 15313.0] + - - [64, 128, 128, 128, 64, 64, 64, 128] + - [59, 10737.0] + - - [64, 128, 160, 128, 64, 64, 64, 128] + - [48, 11507.0] + - - [1024, 1024, 1, 2, 1024, 1024, 1024, 2] + - [57, 353.0] + - - [1024, 1024, 1, 4096, 1024, 1024, 1024, 4096] + - [60, 16207.0] + - - [64, 128, 624, 128, 64, 64, 64, 128] + - [77, 14005.0] + - - [1024, 780, 1, 1024, 1024, 1024, 1024, 1024] + - [79, 14214.0] + - - [64, 128, 640, 128, 64, 64, 64, 128] + - [71, 13066.0] + - - [1024, 800, 1, 1024, 1024, 1024, 1024, 1024] + - [61, 15191.0] + - - [64, 128, 656, 128, 64, 64, 64, 128] + - [71, 13708.0] + - - [1024, 820, 1, 1024, 1024, 1024, 1024, 1024] + - [79, 15487.0] + - - [64, 512, 80, 512, 64, 64, 64, 512] + - [56, 11336.0] + - - [1024, 385, 1, 1024, 1024, 1024, 1024, 1024] + - [72, 12491.0] + - - [64, 512, 96, 512, 64, 64, 64, 512] + - [56, 11541.0] + - - [1024, 462, 1, 1024, 1024, 1024, 1024, 1024] + - [79, 11962.0] + - - [64, 128, 144, 128, 64, 64, 64, 128] + - [77, 10457.0] + - - [64, 1024, 32, 1024, 64, 64, 64, 1024] + - [68, 8824.0] + - - [96, 1024, 64, 1024, 96, 96, 96, 1024] + - [67, 14274.0] + - - [64, 1024, 256, 1024, 64, 64, 64, 1024] + - [68, 12775.0] + - - [64, 512, 256, 512, 64, 64, 64, 512] + - [68, 12517.0] + - - [64, 1024, 64, 1024, 64, 64, 64, 1024] + - [80, 11917.0] + - - [64, 1024, 128, 1024, 64, 64, 64, 1024] + - [56, 12747.0] + - - [96, 1024, 128, 1024, 96, 96, 96, 1024] + - [55, 14499.0] + - - [64, 512, 40, 512, 64, 64, 64, 512] + - [68, 11045.0] + - - [64, 128, 1024, 128, 64, 64, 64, 128] + - [71, 15921.0] + - - [1024, 864, 1, 1024, 1024, 1024, 1024, 1024] + - [79, 16365.0] + - - [1024, 864, 1, 512, 1024, 1024, 1024, 512] + - [67, 15460.0] + - - [256, 3456, 1, 128, 256, 256, 256, 128] + - [61, 13611.0] + - - [256, 4096, 1, 128, 256, 256, 256, 128] + - [49, 13752.0] + - - [480, 864, 1, 1024, 480, 480, 480, 1024] + - [78, 13164.0] + - - [512, 864, 1, 256, 512, 512, 512, 256] + - [76, 10889.0] + - - [64, 128, 1280, 128, 64, 64, 64, 128] + - [59, 14887.0] + - - [64, 128, 1312, 128, 64, 64, 64, 128] + - [71, 14119.0] + - - [64, 512, 192, 512, 64, 64, 64, 512] + - [56, 12367.0] + - - [256, 4096, 1, 1, 256, 256, 256, 1] + - [57, 286.0] + - - [12544, 64, 1, 147, 12544, 12544, 12544, 147] + - [70, 12635.0] + - - [64, 128, 2048, 128, 64, 64, 64, 128] + - [48, 11676.0] + - - [64, 128, 1536, 128, 64, 64, 64, 128] + - [48, 14347.0] + - - [64, 128, 192, 128, 64, 64, 64, 128] + - [71, 12520.0] + - - [64, 384, 144, 384, 64, 64, 64, 384] + - [59, 15613.0] + - - [64, 512, 48, 512, 64, 64, 64, 512] + - [80, 11564.0] + - - [64, 128, 256, 128, 64, 64, 64, 128] + - [48, 13503.0] + - - [64, 384, 192, 384, 64, 64, 64, 384] + - [48, 17240.0] + - - [3400, 256, 1, 1024, 3400, 3400, 3400, 1024] + - [50, 16396.0] + - - [3800, 256, 1, 1024, 3800, 3800, 3800, 1024] + - [50, 17878.0] + - - [864, 512, 2, 2048, 864, 864, 864, 2048] + - [50, 16265.0] + - - [888, 512, 2, 2048, 888, 888, 888, 2048] + - [79, 16613.0] + - - [51520, 64, 2, 256, 51520, 51520, 51520, 256] + - [65, 18029.0] + - - [46464, 64, 2, 256, 46464, 46464, 46464, 256] + - [78, 17675.0] + - - [49152, 64, 2, 256, 49152, 49152, 49152, 256] + - [48, 17892.0] + - - [1536, 512, 1, 1024, 1536, 1536, 1536, 1024] + - [49, 16584.0] + - - [1728, 512, 1, 1024, 1728, 1728, 1728, 1024] + - [50, 16508.0] + - - [1024, 1024, 1, 320, 1024, 1024, 1024, 320] + - [71, 15322.0] + - - [51520, 64, 2, 64, 51520, 51520, 51520, 64] + - [51, 14395.0] + - - [55296, 64, 2, 64, 55296, 55296, 55296, 64] + - [72, 16098.0] + - - [49152, 64, 2, 64, 49152, 49152, 49152, 64] + - [60, 14547.0] + - - [54400, 64, 2, 64, 54400, 54400, 54400, 64] + - [54, 15803.0] + - - [42240, 64, 2, 256, 42240, 42240, 42240, 256] + - [78, 17920.0] + - - [672, 512, 2, 2048, 672, 672, 672, 2048] + - [54, 14604.0] + - - [54400, 64, 2, 256, 54400, 54400, 54400, 256] + - [49, 17858.0] + - - [56832, 64, 2, 256, 56832, 56832, 56832, 256] + - [49, 17872.0] + - - [55296, 64, 2, 256, 55296, 55296, 55296, 256] + - [65, 18321.0] + - - [60800, 64, 2, 64, 60800, 60800, 60800, 64] + - [52, 15545.0] + - - [768, 512, 2, 2048, 768, 768, 768, 2048] + - [66, 16777.0] + - - [43008, 64, 2, 256, 43008, 43008, 43008, 256] + - [78, 17731.0] + - - [864, 256, 2, 2048, 864, 864, 864, 2048] + - [49, 14940.0] + - - [768, 256, 2, 2048, 768, 768, 768, 2048] + - [49, 13553.0] + - - [45632, 64, 2, 256, 45632, 45632, 45632, 256] + - [49, 16599.0] + - - [60800, 64, 2, 256, 60800, 60800, 60800, 256] + - [72, 17868.0] + - - [1024, 1024, 1, 81, 1024, 1024, 1024, 81] + - [75, 11235.0] + - - [950, 512, 2, 2048, 950, 950, 950, 2048] + - [54, 15007.0] + - - [850, 512, 2, 2048, 850, 850, 850, 2048] + - [79, 15942.0] + - - [805, 512, 2, 2048, 805, 805, 805, 2048] + - [67, 15130.0] + - - [950, 256, 2, 2048, 950, 950, 950, 2048] + - [65, 13194.0] + - - [1900, 512, 1, 1024, 1900, 1900, 1900, 1024] + - [55, 16907.0] + - - [1700, 512, 1, 1024, 1700, 1700, 1700, 1024] + - [50, 15330.0] + - - [1610, 512, 1, 1024, 1610, 1610, 1610, 1024] + - [50, 14726.0] + - - [660, 512, 2, 2048, 660, 660, 660, 2048] + - [54, 13861.0] + - - [726, 512, 2, 2048, 726, 726, 726, 2048] + - [54, 15235.0] + - - [713, 512, 2, 2048, 713, 713, 713, 2048] + - [54, 14920.0] + - - [805, 256, 2, 2048, 805, 805, 805, 2048] + - [60, 13743.0] + - - [850, 256, 2, 2048, 850, 850, 850, 2048] + - [49, 13953.0] + - - [100, 128, 120, 512, 100, 100, 100, 512] + - [67, 13029.0] + - - [100, 128, 139, 512, 100, 100, 100, 512] + - [79, 13330.0] + - - [100, 128, 160, 512, 100, 100, 100, 512] + - [55, 13200.0] + - - [22500, 64, 1, 147, 22500, 22500, 22500, 147] + - [70, 11605.0] + - - [96, 1024, 160, 1024, 96, 96, 96, 1024] + - [79, 14564.0] + - - [96, 1024, 40, 1024, 96, 96, 96, 1024] + - [55, 14571.0] + - - [96, 1024, 80, 1024, 96, 96, 96, 1024] + - [67, 14362.0] + - - [96, 1024, 96, 1024, 96, 96, 96, 1024] + - [67, 14409.0] + - - [96, 1024, 24, 1024, 96, 96, 96, 1024] + - [55, 13925.0] + - - [96, 1024, 48, 1024, 96, 96, 96, 1024] + - [67, 14090.0] + - - [96, 1024, 16, 1024, 96, 96, 96, 1024] + - [55, 13118.0] + - - [96, 1024, 32, 1024, 96, 96, 96, 1024] + - [67, 14257.0] + - - [64, 512, 320, 512, 64, 64, 64, 512] + - [80, 12478.0] + - - [64, 1024, 512, 1024, 64, 64, 64, 1024] + - [56, 12876.0] + - - [1024, 77, 1, 30522, 1024, 1024, 1024, 30522] + - [83, 8710.0] + - - [1024, 200, 1, 30522, 1024, 1024, 1024, 30522] + - [83, 11179.0] + - - [1024, 160, 1, 30522, 1024, 1024, 1024, 30522] + - [87, 13798.0] + - - [1024, 180, 1, 30522, 1024, 1024, 1024, 30522] + - [87, 15556.0] + - - [1024, 160, 1, 30528, 1024, 1024, 1024, 30528] + - [82, 13812.0] + - - [1024, 240, 1, 30528, 1024, 1024, 1024, 30528] + - [82, 15197.0] + - - [2560, 109, 1, 29000, 2560, 2560, 2560, 29000] + - [88, 15750.0] + - - [2560, 121, 1, 29000, 2560, 2560, 2560, 29000] + - [88, 17384.0] + - - [2560, 65, 1, 29000, 2560, 2560, 2560, 29000] + - [86, 9662.0] + - - [2560, 66, 1, 29000, 2560, 2560, 2560, 29000] + - [86, 9782.0] + - - [2560, 67, 1, 29000, 2560, 2560, 2560, 29000] + - [89, 9921.0] + - - [2560, 69, 1, 29000, 2560, 2560, 2560, 29000] + - [85, 10227.0] + - - [2560, 70, 1, 29000, 2560, 2560, 2560, 29000] + - [88, 10357.0] + - - [2560, 71, 1, 29000, 2560, 2560, 2560, 29000] + - [83, 10485.0] + - - [2560, 73, 1, 29000, 2560, 2560, 2560, 29000] + - [84, 10780.0] + - - [2560, 74, 1, 29000, 2560, 2560, 2560, 29000] + - [85, 10931.0] + - - [2560, 75, 1, 29000, 2560, 2560, 2560, 29000] + - [86, 11091.0] + - - [2560, 77, 1, 29000, 2560, 2560, 2560, 29000] + - [85, 11329.0] + - - [2560, 78, 1, 29000, 2560, 2560, 2560, 29000] + - [88, 11464.0] + - - [2560, 80, 1, 29000, 2560, 2560, 2560, 29000] + - [85, 11788.0] + - - [2560, 81, 1, 29000, 2560, 2560, 2560, 29000] + - [88, 11896.0] + - - [2560, 82, 1, 29000, 2560, 2560, 2560, 29000] + - [86, 12053.0] + - - [2560, 83, 1, 29000, 2560, 2560, 2560, 29000] + - [88, 12171.0] + - - [2560, 84, 1, 29000, 2560, 2560, 2560, 29000] + - [83, 12330.0] + - - [2560, 88, 1, 29000, 2560, 2560, 2560, 29000] + - [85, 12825.0] + - - [2560, 89, 1, 29000, 2560, 2560, 2560, 29000] + - [85, 13016.0] + - - [2560, 90, 1, 29000, 2560, 2560, 2560, 29000] + - [86, 13169.0] + - - [2560, 92, 1, 29000, 2560, 2560, 2560, 29000] + - [86, 13401.0] + - - [2560, 95, 1, 29000, 2560, 2560, 2560, 29000] + - [88, 13813.0] + - - [2560, 98, 1, 29000, 2560, 2560, 2560, 29000] + - [88, 14212.0] + - - [2368, 64, 1, 3328, 2368, 2368, 2368, 3328] + - [113, 7578.0] + - - [256, 704, 1, 1280, 256, 256, 256, 1280] + - [100, 7863.0] + - - [1408, 64, 1, 1280, 1408, 1408, 1408, 1280] + - [103, 5320.0] + - - [1024, 256, 1, 3328, 1024, 1024, 1024, 3328] + - [110, 8183.0] + - - [704, 128, 1, 1280, 704, 704, 704, 1280] + - [96, 5186.0] + - - [64, 3584, 1, 3328, 64, 64, 64, 3328] + - [100, 8593.0] + - - [1024, 256, 1, 256, 1024, 1024, 1024, 256] + - [93, 5939.0] + - - [448, 448, 1, 256, 448, 448, 448, 256] + - [110, 5489.0] + - - [128, 1024, 1, 3328, 128, 128, 128, 3328] + - [113, 6627.0] + - - [64, 1856, 1, 1280, 64, 64, 64, 1280] + - [137, 6943.0] + - - [448, 256, 1, 256, 448, 448, 448, 256] + - [113, 3735.0] + - - [256, 1024, 1, 256, 256, 256, 256, 256] + - [93, 5907.0] + - - [1024, 128, 1, 1280, 1024, 1024, 1024, 1280] + - [96, 6066.0] + - - [448, 256, 1, 3328, 448, 448, 448, 3328] + - [113, 7431.0] + - - [128, 704, 1, 1280, 128, 128, 128, 1280] + - [111, 5325.0] + - - [1856, 128, 1, 3328, 1856, 1856, 1856, 3328] + - [93, 9042.0] + - - [256, 448, 1, 256, 256, 256, 256, 256] + - [96, 3784.0] + - - [448, 448, 1, 3328, 448, 448, 448, 3328] + - [93, 9004.0] + - - [1408, 128, 1, 1280, 1408, 1408, 1408, 1280] + - [110, 7578.0] + - - [128, 1856, 1, 1280, 128, 128, 128, 1280] + - [110, 8327.0] + - - [64, 1408, 1, 3328, 64, 64, 64, 3328] + - [128, 6083.0] + - - [256, 704, 1, 256, 256, 256, 256, 256] + - [125, 5070.0] + - - [128, 1408, 1, 256, 128, 128, 128, 256] + - [134, 4786.0] + - - [256, 448, 1, 3328, 256, 256, 256, 3328] + - [96, 7446.0] + - - [64, 2368, 1, 1280, 64, 64, 64, 1280] + - [100, 6938.0] + - - [2368, 64, 1, 256, 2368, 2368, 2368, 256] + - [96, 4429.0] + - - [704, 128, 1, 3328, 704, 704, 704, 3328] + - [96, 5880.0] + - - [4288, 64, 1, 1280, 4288, 4288, 4288, 1280] + - [93, 8656.0] + - - [128, 1024, 1, 1280, 128, 128, 128, 1280] + - [116, 6265.0] + - - [128, 1024, 1, 256, 128, 128, 128, 256] + - [130, 3866.0] + - - [1856, 64, 1, 256, 1856, 1856, 1856, 256] + - [96, 3909.0] + - - [704, 128, 1, 256, 704, 704, 704, 256] + - [96, 2996.0] + - - [448, 256, 1, 1280, 448, 448, 448, 1280] + - [96, 6571.0] + - - [1856, 128, 1, 1280, 1856, 1856, 1856, 1280] + - [100, 8897.0] + - - [64, 3584, 1, 256, 64, 64, 64, 256] + - [116, 5498.0] + - - [64, 1856, 1, 256, 64, 64, 64, 256] + - [96, 3939.0] + - - [256, 1024, 1, 1280, 256, 256, 256, 1280] + - [93, 8085.0] + - - [3584, 64, 1, 1280, 3584, 3584, 3584, 1280] + - [110, 8178.0] + - - [1408, 128, 1, 3328, 1408, 1408, 1408, 3328] + - [125, 8167.0] + - - [64, 2944, 1, 3328, 64, 64, 64, 3328] + - [100, 7962.0] + - - [64, 4288, 1, 3328, 64, 64, 64, 3328] + - [100, 8690.0] + - - [64, 2944, 1, 256, 64, 64, 64, 256] + - [100, 4853.0] + - - [64, 1408, 1, 1280, 64, 64, 64, 1280] + - [130, 5682.0] + - - [64, 2944, 1, 1280, 64, 64, 64, 1280] + - [105, 7622.0] + - - [704, 256, 1, 256, 704, 704, 704, 256] + - [110, 4961.0] + - - [256, 448, 1, 1280, 256, 256, 256, 1280] + - [130, 6613.0] + - - [704, 256, 1, 1280, 704, 704, 704, 1280] + - [110, 7534.0] + - - [64, 2368, 1, 3328, 64, 64, 64, 3328] + - [130, 7846.0] + - - [256, 704, 1, 3328, 256, 256, 256, 3328] + - [125, 8391.0] + - - [2944, 64, 1, 1280, 2944, 2944, 2944, 1280] + - [100, 7805.0] + - - [128, 1408, 1, 3328, 128, 128, 128, 3328] + - [125, 8116.0] + - - [1408, 64, 1, 256, 1408, 1408, 1408, 256] + - [141, 3433.0] + - - [64, 2368, 1, 256, 64, 64, 64, 256] + - [100, 4399.0] + - - [1024, 128, 1, 3328, 1024, 1024, 1024, 3328] + - [130, 6544.0] + - - [2368, 64, 1, 1280, 2368, 2368, 2368, 1280] + - [96, 6933.0] + - - [4288, 64, 1, 256, 4288, 4288, 4288, 256] + - [93, 6341.0] + - - [64, 4288, 1, 1280, 64, 64, 64, 1280] + - [116, 8207.0] + - - [1408, 64, 1, 3328, 1408, 1408, 1408, 3328] + - [111, 6110.0] + - - [2944, 64, 1, 256, 2944, 2944, 2944, 256] + - [93, 5164.0] + - - [448, 448, 1, 1280, 448, 448, 448, 1280] + - [125, 8250.0] + - - [1024, 256, 1, 1280, 1024, 1024, 1024, 1280] + - [110, 8097.0] + - - [3584, 64, 1, 3328, 3584, 3584, 3584, 3328] + - [116, 8728.0] + - - [256, 1024, 1, 3328, 256, 256, 256, 3328] + - [93, 8536.0] + - - [1856, 64, 1, 3328, 1856, 1856, 1856, 3328] + - [139, 7620.0] + - - [1856, 64, 1, 1280, 1856, 1856, 1856, 1280] + - [96, 6757.0] + - - [1024, 128, 1, 256, 1024, 1024, 1024, 256] + - [125, 5008.0] + - - [64, 3584, 1, 1280, 64, 64, 64, 1280] + - [134, 8331.0] + - - [3584, 64, 1, 256, 3584, 3584, 3584, 256] + - [96, 6765.0] + - - [64, 1856, 1, 3328, 64, 64, 64, 3328] + - [96, 7700.0] + - - [1408, 128, 1, 256, 1408, 1408, 1408, 256] + - [110, 6572.0] + - - [128, 704, 1, 256, 128, 128, 128, 256] + - [96, 3060.0] + - - [128, 704, 1, 3328, 128, 128, 128, 3328] + - [105, 5920.0] + - - [128, 1856, 1, 256, 128, 128, 128, 256] + - [100, 5759.0] + - - [64, 4288, 1, 256, 64, 64, 64, 256] + - [100, 6120.0] + - - [704, 256, 1, 3328, 704, 704, 704, 3328] + - [93, 8232.0] + - - [1856, 128, 1, 256, 1856, 1856, 1856, 256] + - [116, 5928.0] + - - [4288, 64, 1, 3328, 4288, 4288, 4288, 3328] + - [125, 8877.0] + - - [64, 1408, 1, 256, 64, 64, 64, 256] + - [130, 2920.0] + - - [2944, 64, 1, 3328, 2944, 2944, 2944, 3328] + - [93, 8449.0] + - - [128, 1408, 1, 1280, 128, 128, 128, 1280] + - [110, 7384.0] + - - [128, 1856, 1, 3328, 128, 128, 128, 3328] + - [110, 8913.0] + - - [1760, 64, 1, 1760, 1760, 1760, 1760, 1760] + - [105, 6085.0] + - - [2560, 32, 1, 2560, 2560, 2560, 2560, 2560] + - [139, 6290.0] + - - [2048, 128, 1, 2048, 2048, 2048, 2048, 2048] + - [125, 8159.0] + - - [4608, 32, 1, 1536, 4608, 4608, 4608, 1536] + - [121, 6705.0] + - - [3072, 64, 1, 1024, 3072, 3072, 3072, 1024] + - [93, 7731.0] + - - [128, 1500, 1, 1280, 128, 128, 128, 1280] + - [134, 7753.0] + - - [4096, 32, 1, 4096, 4096, 4096, 4096, 4096] + - [139, 6673.0] + - - [1760, 128, 1, 1760, 1760, 1760, 1760, 1760] + - [125, 8436.0] + - - [4096, 64, 1, 4096, 4096, 4096, 4096, 4096] + - [116, 8573.0] + - - [7680, 32, 1, 2560, 7680, 7680, 7680, 2560] + - [110, 9521.0] + - - [2560, 64, 1, 2560, 2560, 2560, 2560, 2560] + - [96, 7831.0] + - - [3072, 32, 1, 1024, 3072, 3072, 3072, 1024] + - [96, 5227.0] + - - [6144, 32, 1, 2560, 6144, 6144, 6144, 2560] + - [93, 8350.0] + - - [176, 1500, 1, 1408, 176, 176, 176, 1408] + - [93, 7375.0] + - - [2048, 64, 1, 2048, 2048, 2048, 2048, 2048] + - [105, 6243.0] + - - [8448, 32, 1, 2816, 8448, 8448, 8448, 2816] + - [93, 8941.0] + - - [512, 512, 1, 64, 512, 512, 512, 64] + - [125, 2954.0] + - - [32, 33, 1600, 33, 32, 32, 32, 33] + - [132, 3111.0] + - - [256, 1024, 1, 1, 256, 256, 256, 1] + - [125, 59.0] + - - [257, 1024, 1, 4096, 257, 257, 257, 4096] + - [116, 7394.0] + - - [512, 200, 1, 1, 512, 512, 512, 1] + - [101, 55.0] + - - [512, 200, 1, 32, 512, 512, 512, 32] + - [142, 953.0] + - - [512, 215, 1, 2048, 512, 512, 512, 2048] + - [130, 7125.0] + - - [512, 256, 1, 2048, 512, 512, 512, 2048] + - [139, 6721.0] + - - [560, 200, 1, 1024, 560, 560, 560, 1024] + - [134, 5129.0] + - - [768, 215, 1, 2048, 768, 768, 768, 2048] + - [110, 7272.0] + - - [768, 256, 1, 2048, 768, 768, 768, 2048] + - [110, 8538.0] + - - [1024, 200, 1, 1, 1024, 1024, 1024, 1] + - [127, 93.0] + - - [64, 32, 4608, 32, 64, 64, 64, 32] + - [139, 8820.0] + - - [64, 34, 4736, 34, 64, 64, 64, 34] + - [134, 5244.0] + - - [64, 35, 4608, 32, 64, 64, 64, 32] + - [93, 5625.0] + - - [64, 35, 4608, 35, 64, 64, 64, 35] + - [93, 5344.0] + - - [64, 33, 1920, 27, 64, 64, 64, 27] + - [93, 4647.0] + - - [64, 33, 1920, 33, 64, 64, 64, 33] + - [93, 4762.0] + - - [1225, 32, 64, 192, 1225, 1225, 1225, 192] + - [93, 9798.0] + - - [1225, 48, 64, 192, 1225, 1225, 1225, 192] + - [108, 8177.0] + - - [1225, 48, 64, 256, 1225, 1225, 1225, 256] + - [134, 7598.0] + - - [1225, 48, 64, 288, 1225, 1225, 1225, 288] + - [93, 7608.0] + - - [1225, 32, 32, 192, 1225, 1225, 1225, 192] + - [93, 9306.0] + - - [1225, 48, 32, 192, 1225, 1225, 1225, 192] + - [93, 7237.0] + - - [1225, 48, 32, 256, 1225, 1225, 1225, 256] + - [134, 7383.0] + - - [1225, 48, 32, 288, 1225, 1225, 1225, 288] + - [91, 7881.0] + - - [49, 2048, 64, 512, 49, 49, 49, 512] + - [139, 7085.0] + - - [49, 512, 64, 2048, 49, 49, 49, 2048] + - [139, 6949.0] + - - [49, 2048, 32, 512, 49, 49, 49, 512] + - [117, 7055.0] + - - [49, 512, 32, 2048, 49, 49, 49, 2048] + - [121, 6571.0] + - - [49, 2048, 64, 1024, 49, 49, 49, 1024] + - [134, 7775.0] + - - [49, 1024, 64, 2048, 49, 49, 49, 2048] + - [105, 7081.0] + - - [49, 2048, 32, 1024, 49, 49, 49, 1024] + - [134, 7463.0] + - - [49, 1024, 32, 2048, 49, 49, 49, 2048] + - [139, 6944.0] + - - [480, 512, 1, 512, 480, 480, 480, 512] + - [93, 7571.0] + - - [512, 480, 1, 512, 512, 512, 512, 512] + - [128, 9014.0] + - - [512, 512, 1, 512, 512, 512, 512, 512] + - [125, 8194.0] + - - [1024, 160, 1, 1024, 1024, 1024, 1024, 1024] + - [96, 8082.0] + - - [1024, 200, 1, 1024, 1024, 1024, 1024, 1024] + - [110, 7066.0] + - - [1024, 308, 1, 1024, 1024, 1024, 1024, 1024] + - [110, 8415.0] + - - [1024, 180, 1, 1024, 1024, 1024, 1024, 1024] + - [134, 7419.0] + - - [256, 864, 1, 128, 256, 256, 256, 128] + - [125, 3988.0] + - - [3136, 64, 1, 576, 3136, 3136, 3136, 576] + - [134, 7084.0] + - - [784, 128, 1, 1152, 784, 784, 784, 1152] + - [105, 5553.0] + - - [1024, 128, 1, 1024, 1024, 1024, 1024, 1024] + - [96, 5795.0] + - - [1024, 128, 1, 2, 1024, 1024, 1024, 2] + - [99, 69.0] + - - [1024, 96, 1, 1024, 1024, 1024, 1024, 1024] + - [96, 5395.0] + - - [1024, 96, 1, 2, 1024, 1024, 1024, 2] + - [133, 51.0] + - - [49, 2048, 128, 512, 49, 49, 49, 512] + - [100, 7195.0] + - - [49, 2048, 256, 512, 49, 49, 49, 512] + - [134, 7621.0] + - - [49, 512, 128, 2048, 49, 49, 49, 2048] + - [121, 7079.0] + - - [49, 512, 256, 2048, 49, 49, 49, 2048] + - [139, 7140.0] + - - [100, 128, 18, 512, 100, 100, 100, 512] + - [96, 5777.0] + - - [100, 128, 19, 512, 100, 100, 100, 512] + - [105, 6080.0] + - - [1444, 128, 1, 576, 1444, 1444, 1444, 576] + - [93, 7692.0] + - - [361, 512, 1, 2304, 361, 361, 361, 2304] + - [134, 8275.0] + - - [2560, 35, 1, 29000, 2560, 2560, 2560, 29000] + - [96, 4720.0] + - - [2560, 36, 1, 29000, 2560, 2560, 2560, 29000] + - [105, 4846.0] + - - [2560, 39, 1, 29000, 2560, 2560, 2560, 29000] + - [96, 5249.0] + - - [2560, 40, 1, 29000, 2560, 2560, 2560, 29000] + - [96, 5413.0] + - - [2560, 42, 1, 29000, 2560, 2560, 2560, 29000] + - [96, 5659.0] + - - [2560, 43, 1, 29000, 2560, 2560, 2560, 29000] + - [105, 5797.0] + - - [2560, 44, 1, 29000, 2560, 2560, 2560, 29000] + - [105, 5927.0] + - - [2560, 46, 1, 29000, 2560, 2560, 2560, 29000] + - [96, 6202.0] + - - [2560, 48, 1, 29000, 2560, 2560, 2560, 29000] + - [105, 6472.0] + - - [2560, 49, 1, 29000, 2560, 2560, 2560, 29000] + - [96, 6632.0] + - - [2560, 50, 1, 29000, 2560, 2560, 2560, 29000] + - [96, 6770.0] + - - [2560, 51, 1, 29000, 2560, 2560, 2560, 29000] + - [96, 6891.0] + - - [2560, 53, 1, 29000, 2560, 2560, 2560, 29000] + - [105, 7151.0] + - - [2560, 54, 1, 29000, 2560, 2560, 2560, 29000] + - [105, 7259.0] + - - [2560, 55, 1, 29000, 2560, 2560, 2560, 29000] + - [96, 7429.0] + - - [2560, 56, 1, 29000, 2560, 2560, 2560, 29000] + - [96, 7568.0] + - - [2560, 57, 1, 29000, 2560, 2560, 2560, 29000] + - [96, 7681.0] + - - [2560, 58, 1, 29000, 2560, 2560, 2560, 29000] + - [96, 7831.0] + - - [2560, 59, 1, 29000, 2560, 2560, 2560, 29000] + - [96, 7976.0] + - - [2560, 61, 1, 29000, 2560, 2560, 2560, 29000] + - [96, 8259.0] + - - [2560, 63, 1, 29000, 2560, 2560, 2560, 29000] + - [96, 8516.0] + - - [1909283, 40, 1, 40, 1909283, 1909283, 1909283, 40] + - [116, 6114.0] + - - [3818566, 40, 1, 40, 3818566, 3818566, 3818566, 40] + - [116, 6126.0] + - - [1760, 32, 1, 1760, 1760, 1760, 1760, 1760] + - [151, 4775.0] + - - [7680, 4, 1, 2560, 7680, 7680, 7680, 2560] + - [164, 2950.0] + - - [3072, 16, 1, 1024, 3072, 3072, 3072, 1024] + - [163, 4377.0] + - - [2048, 16, 1, 2048, 2048, 2048, 2048, 2048] + - [150, 3861.0] + - - [3072, 1, 1, 128, 3072, 3072, 3072, 128] + - [148, 75.0] + - - [8448, 16, 1, 2816, 8448, 8448, 8448, 2816] + - [152, 6636.0] + - - [7680, 2, 1, 2560, 7680, 7680, 7680, 2560] + - [151, 1484.0] + - - [4224, 1, 1, 128, 4224, 4224, 4224, 128] + - [143, 166.0] + - - [7680, 1, 1, 2560, 7680, 7680, 7680, 2560] + - [151, 738.0] + - - [6144, 2, 1, 2560, 6144, 6144, 6144, 2560] + - [163, 1197.0] + - - [1760, 16, 1, 1760, 1760, 1760, 1760, 1760] + - [150, 3048.0] + - - [6144, 4, 1, 2560, 6144, 6144, 6144, 2560] + - [164, 2337.0] + - - [3072, 4, 1, 1024, 3072, 3072, 3072, 1024] + - [163, 1045.0] + - - [2048, 32, 1, 2048, 2048, 2048, 2048, 2048] + - [166, 4971.0] + - - [4608, 16, 1, 1536, 4608, 4608, 4608, 1536] + - [153, 5134.0] + - - [3072, 2, 1, 1024, 3072, 3072, 3072, 1024] + - [150, 520.0] + - - [8448, 1, 1, 2816, 8448, 8448, 8448, 2816] + - [156, 648.0] + - - [6144, 1, 1, 2560, 6144, 6144, 6144, 2560] + - [168, 593.0] + - - [4608, 1, 1, 1536, 4608, 4608, 4608, 1536] + - [162, 415.0] + - - [8448, 4, 1, 2816, 8448, 8448, 8448, 2816] + - [167, 2607.0] + - - [4608, 2, 1, 1536, 4608, 4608, 4608, 1536] + - [178, 824.0] + - - [2560, 16, 1, 2560, 2560, 2560, 2560, 2560] + - [151, 4340.0] + - - [6144, 16, 1, 2560, 6144, 6144, 6144, 2560] + - [164, 6105.0] + - - [4096, 16, 1, 4096, 4096, 4096, 4096, 4096] + - [173, 5843.0] + - - [7680, 16, 1, 2560, 7680, 7680, 7680, 2560] + - [158, 6785.0] + - - [3072, 1, 1, 1024, 3072, 3072, 3072, 1024] + - [179, 242.0] + - - [8448, 2, 1, 2816, 8448, 8448, 8448, 2816] + - [154, 1320.0] + - - [4608, 4, 1, 1536, 4608, 4608, 4608, 1536] + - [162, 1631.0] + - - [2048, 2, 1, 2048, 2048, 2048, 2048, 2048] + - [151, 506.0] + - - [2048, 2, 1, 2, 2048, 2048, 2048, 2] + - [143, 2.0] + - - [2560, 4, 1, 2, 2560, 2560, 2560, 2] + - [143, 5.0] + - - [2560, 4, 1, 2560, 2560, 2560, 2560, 2560] + - [168, 1397.0] + - - [2048, 1, 1, 512, 2048, 2048, 2048, 512] + - [150, 189.0] + - - [12288, 12, 2, 256, 12288, 12288, 12288, 256] + - [170, 4725.0] + - - [12288, 3, 2, 256, 12288, 12288, 12288, 256] + - [172, 1767.0] + - - [51520, 12, 2, 256, 51520, 51520, 51520, 256] + - [145, 5703.0] + - - [51520, 3, 2, 256, 51520, 51520, 51520, 256] + - [175, 2608.0] + - - [15200, 12, 2, 256, 15200, 15200, 15200, 256] + - [147, 4946.0] + - - [15200, 3, 2, 256, 15200, 15200, 15200, 256] + - [144, 1911.0] + - - [3456, 3, 2, 256, 3456, 3456, 3456, 256] + - [148, 1144.0] + - - [13600, 12, 2, 256, 13600, 13600, 13600, 256] + - [160, 4780.0] + - - [12880, 3, 2, 256, 12880, 12880, 12880, 256] + - [144, 1856.0] + - - [3400, 3, 2, 256, 3400, 3400, 3400, 256] + - [161, 1126.0] + - - [12880, 12, 2, 256, 12880, 12880, 12880, 256] + - [177, 4441.0] + - - [13824, 12, 2, 256, 13824, 13824, 13824, 256] + - [176, 4876.0] + - - [13824, 3, 2, 256, 13824, 13824, 13824, 256] + - [177, 1952.0] + - - [13600, 3, 2, 256, 13600, 13600, 13600, 256] + - [149, 1909.0] + - - [3456, 12, 2, 256, 3456, 3456, 3456, 256] + - [170, 3360.0] + - - [3800, 3, 2, 256, 3800, 3800, 3800, 256] + - [161, 1231.0] + - - [3400, 12, 2, 256, 3400, 3400, 3400, 256] + - [170, 3305.0] + - - [3800, 12, 2, 256, 3800, 3800, 3800, 256] + - [170, 2671.0] + - - [55296, 3, 2, 256, 55296, 55296, 55296, 256] + - [165, 2526.0] + - - [3072, 3, 2, 256, 3072, 3072, 3072, 256] + - [161, 1053.0] + - - [3072, 12, 2, 256, 3072, 3072, 3072, 256] + - [153, 3432.0] + - - [54400, 3, 2, 256, 54400, 54400, 54400, 256] + - [172, 2383.0] + - - [60800, 12, 2, 256, 60800, 60800, 60800, 256] + - [152, 5425.0] + - - [60800, 3, 2, 256, 60800, 60800, 60800, 256] + - [172, 2393.0] + - - [3220, 3, 2, 256, 3220, 3220, 3220, 256] + - [171, 1039.0] + - - [3220, 12, 2, 256, 3220, 3220, 3220, 256] + - [146, 3201.0] + - - [2048, 8, 1, 2, 2048, 2048, 2048, 2] + - [159, 20.0] + - - [2048, 8, 1, 2048, 2048, 2048, 2048, 2048] + - [168, 2276.0] + - - [2560, 2, 1, 2, 2560, 2560, 2560, 2] + - [148, 6.0] + - - [2560, 2, 1, 2560, 2560, 2560, 2560, 2560] + - [155, 662.0] + - - [2560, 27, 1, 29000, 2560, 2560, 2560, 29000] + - [174, 5571.0] + - - [1909283, 11, 1, 11, 1909283, 1909283, 1909283, 11] + - [157, 2361.0] + - - [3818566, 11, 1, 11, 3818566, 3818566, 3818566, 11] + - [169, 2159.0] + - - [512, 16, 1, 500000, 512, 512, 512, 500000] + - [183, 3351.0] + - - [512, 2, 1, 500000, 512, 512, 512, 500000] + - [185, 430.0] + - - [1024, 16, 1, 500000, 1024, 1024, 1024, 500000] + - [188, 4471.0] + - - [1024, 4, 1, 500000, 1024, 1024, 1024, 500000] + - [182, 1129.0] + - - [512, 8, 1, 500000, 512, 512, 512, 500000] + - [189, 1705.0] + - - [512, 1, 1, 500000, 512, 512, 512, 500000] + - [183, 216.0] + - - [512, 4, 1, 500000, 512, 512, 512, 500000] + - [187, 862.0] + - - [1024, 1, 1, 500000, 1024, 1024, 1024, 500000] + - [180, 285.0] + - - [1024, 2, 1, 500000, 1024, 1024, 1024, 500000] + - [186, 570.0] + - - [1024, 8, 1, 500000, 1024, 1024, 1024, 500000] + - [184, 2265.0] + - - [49, 512, 1, 4608, 49, 49, 49, 4608] + - [181, 3948.0] + - - [448, 64, 1, 1280, 448, 448, 448, 1280] + - [98, 3202.0] + - - [64, 1024, 1, 1280, 64, 64, 64, 1280] + - [105, 5668.0] + - - [64, 704, 1, 1280, 64, 64, 64, 1280] + - [120, 4161.0] + - - [256, 128, 1, 256, 256, 256, 256, 256] + - [138, 1432.0] + - - [64, 1024, 1, 3328, 64, 64, 64, 3328] + - [113, 5393.0] + - - [1024, 64, 1, 1280, 1024, 1024, 1024, 1280] + - [130, 5066.0] + - - [256, 256, 1, 3328, 256, 256, 256, 3328] + - [113, 5650.0] + - - [64, 448, 1, 1280, 64, 64, 64, 1280] + - [98, 3502.0] + - - [64, 64, 1, 3328, 64, 64, 64, 3328] + - [98, 617.0] + - - [704, 64, 1, 3328, 704, 704, 704, 3328] + - [112, 4233.0] + - - [64, 128, 1, 256, 64, 64, 64, 256] + - [138, 414.0] + - - [704, 64, 1, 1280, 704, 704, 704, 1280] + - [129, 3578.0] + - - [128, 448, 1, 256, 128, 128, 128, 256] + - [138, 2211.0] + - - [448, 64, 1, 3328, 448, 448, 448, 3328] + - [98, 3534.0] + - - [64, 128, 1, 3328, 64, 64, 64, 3328] + - [122, 1245.0] + - - [128, 128, 1, 3328, 128, 128, 128, 3328] + - [115, 2632.0] + - - [256, 256, 1, 256, 256, 256, 256, 256] + - [112, 3600.0] + - - [128, 64, 1, 1280, 128, 128, 128, 1280] + - [106, 1211.0] + - - [64, 1024, 1, 256, 64, 64, 64, 256] + - [105, 3510.0] + - - [64, 704, 1, 256, 64, 64, 64, 256] + - [120, 1814.0] + - - [1, 1, 1, 1280, 1, 1, 1, 1280] + - [94, 0.08] + - - [256, 64, 1, 3328, 256, 256, 256, 3328] + - [98, 2367.0] + - - [448, 128, 1, 256, 448, 448, 448, 256] + - [95, 2159.0] + - - [64, 704, 1, 3328, 64, 64, 64, 3328] + - [95, 4088.0] + - - [64, 448, 1, 3328, 64, 64, 64, 3328] + - [115, 3488.0] + - - [448, 128, 1, 3328, 448, 448, 448, 3328] + - [96, 4632.0] + - - [128, 256, 1, 1280, 128, 128, 128, 1280] + - [138, 3548.0] + - - [64, 448, 1, 256, 64, 64, 64, 256] + - [138, 2224.0] + - - [64, 256, 1, 1280, 64, 64, 64, 1280] + - [98, 2310.0] + - - [64, 128, 1, 1280, 64, 64, 64, 1280] + - [106, 1214.0] + - - [64, 64, 1, 256, 64, 64, 64, 256] + - [120, 193.0] + - - [256, 128, 1, 1280, 256, 256, 256, 1280] + - [138, 2869.0] + - - [128, 256, 1, 3328, 128, 128, 128, 3328] + - [120, 3518.0] + - - [256, 64, 1, 256, 256, 256, 256, 256] + - [118, 736.0] + - - [128, 128, 1, 1280, 128, 128, 128, 1280] + - [115, 1817.0] + - - [128, 256, 1, 256, 128, 128, 128, 256] + - [102, 2118.0] + - - [256, 64, 1, 1280, 256, 256, 256, 1280] + - [98, 1817.0] + - - [704, 64, 1, 256, 704, 704, 704, 256] + - [113, 2622.0] + - - [128, 448, 1, 1280, 128, 128, 128, 1280] + - [105, 4253.0] + - - [64, 64, 1, 1280, 64, 64, 64, 1280] + - [114, 561.0] + - - [128, 64, 1, 3328, 128, 128, 128, 3328] + - [141, 1374.0] + - - [448, 64, 1, 256, 448, 448, 448, 256] + - [138, 1301.0] + - - [1024, 64, 1, 256, 1024, 1024, 1024, 256] + - [115, 3073.0] + - - [1, 1, 1, 1, 1, 1, 1, 1] + - [135, 0.0001984126908217979] + - - [448, 128, 1, 1280, 448, 448, 448, 1280] + - [105, 4087.0] + - - [1024, 64, 1, 3328, 1024, 1024, 1024, 3328] + - [113, 5002.0] + - - [128, 64, 1, 256, 128, 128, 128, 256] + - [118, 384.0] + - - [64, 256, 1, 3328, 64, 64, 64, 3328] + - [98, 2332.0] + - - [256, 256, 1, 1280, 256, 256, 256, 1280] + - [121, 5357.0] + - - [256, 128, 1, 3328, 256, 256, 256, 3328] + - [120, 3460.0] + - - [64, 256, 1, 256, 64, 64, 64, 256] + - [104, 757.0] + - - [1, 1, 1, 256, 1, 1, 1, 256] + - [94, 0.04] + - - [128, 448, 1, 3328, 128, 128, 128, 3328] + - [113, 4766.0] + - - [128, 128, 1, 256, 128, 128, 128, 256] + - [102, 741.0] + - - [1024, 16, 1, 512, 1024, 1024, 1024, 512] + - [106, 1121.0] + - - [512, 16, 1, 512, 512, 512, 512, 512] + - [141, 579.0] + - - [128, 1, 1, 1408, 128, 128, 128, 1408] + - [98, 12.0] + - - [64, 1, 1, 1216, 64, 64, 64, 1216] + - [97, 7.0] + - - [1024, 2, 1, 512, 1024, 1024, 1024, 512] + - [122, 141.0] + - - [512, 1, 1, 512, 512, 512, 512, 512] + - [141, 53.0] + - - [1024, 4, 1, 512, 1024, 1024, 1024, 512] + - [98, 280.0] + - - [512, 4, 1, 512, 512, 512, 512, 512] + - [106, 144.0] + - - [1024, 32, 1, 512, 1024, 1024, 1024, 512] + - [122, 1978.0] + - - [512, 2, 1, 512, 512, 512, 512, 512] + - [141, 72.0] + - - [1024, 1, 1, 512, 1024, 1024, 1024, 512] + - [122, 70.0] + - - [512, 32, 1, 512, 512, 512, 512, 512] + - [141, 1159.0] + - - [128, 1, 1, 1024, 128, 128, 128, 1024] + - [98, 13.0] + - - [64, 14, 1, 14, 64, 64, 64, 14] + - [90, 3.0] + - - [64, 14, 1, 15, 64, 64, 64, 15] + - [90, 3.0] + - - [64, 15, 1, 15, 64, 64, 64, 15] + - [120, 7.0] + - - [64, 15, 1, 17, 64, 64, 64, 17] + - [90, 4.0] + - - [64, 17, 1, 17, 64, 64, 64, 17] + - [90, 4.0] + - - [64, 17, 1, 21, 64, 64, 64, 21] + - [90, 5.0] + - - [64, 21, 1, 21, 64, 64, 64, 21] + - [129, 7.0] + - - [64, 24, 1, 24, 64, 64, 64, 24] + - [94, 9.0] + - - [64, 24, 1, 34, 64, 64, 64, 34] + - [108, 16.0] + - - [64, 30, 1, 30, 64, 64, 64, 30] + - [94, 13.0] + - - [64, 31, 1, 30, 64, 64, 64, 30] + - [107, 14.0] + - - [64, 31, 1, 31, 64, 64, 64, 31] + - [98, 15.0] + - - [64, 32, 1, 32, 64, 64, 64, 32] + - [126, 17.0] + - - [64, 34, 1, 34, 64, 64, 64, 34] + - [95, 17.0] + - - [64, 35, 1, 32, 64, 64, 64, 32] + - [135, 17.0] + - - [64, 35, 1, 35, 64, 64, 64, 35] + - [109, 18.0] + - - [64, 512, 1, 512, 64, 64, 64, 512] + - [120, 2162.0] + - - [1024, 4, 1, 2, 1024, 1024, 1024, 2] + - [90, 2.0] + - - [1024, 4, 1, 1024, 1024, 1024, 1024, 1024] + - [106, 393.0] + - - [1024, 32, 1, 2, 1024, 1024, 1024, 2] + - [94, 17.0] + - - [1024, 32, 1, 1024, 1024, 1024, 1024, 1024] + - [115, 2593.0] + - - [32, 200, 1, 1, 32, 32, 32, 1] + - [90, 2.0] + - - [64, 3, 512, 3, 64, 64, 64, 3] + - [123, 53.0] + - - [64, 5, 512, 5, 64, 64, 64, 5] + - [110, 145.0] + - - [64, 5, 960, 5, 64, 64, 64, 5] + - [110, 213.0] + - - [64, 9, 512, 9, 64, 64, 64, 9] + - [124, 427.0] + - - [64, 512, 1, 1, 64, 64, 64, 1] + - [123, 9.0] + - - [67, 512, 1, 2048, 67, 67, 67, 2048] + - [138, 2925.0] + - - [74, 512, 1, 2048, 74, 74, 74, 2048] + - [138, 3239.0] + - - [74, 960, 1, 2048, 74, 74, 74, 2048] + - [105, 4348.0] + - - [100, 512, 1, 2048, 100, 100, 100, 2048] + - [139, 4211.0] + - - [128, 27, 32768, 27, 128, 128, 128, 27] + - [100, 4000.0] + - - [64, 14, 10880, 14, 64, 64, 64, 14] + - [100, 2832.0] + - - [64, 14, 10880, 15, 64, 64, 64, 15] + - [100, 2896.0] + - - [64, 15, 7680, 15, 64, 64, 64, 15] + - [100, 3052.0] + - - [64, 15, 10880, 15, 64, 64, 64, 15] + - [100, 3102.0] + - - [64, 15, 7680, 17, 64, 64, 64, 17] + - [93, 3441.0] + - - [64, 17, 6144, 17, 64, 64, 64, 17] + - [93, 3386.0] + - - [64, 17, 7680, 17, 64, 64, 64, 17] + - [93, 3439.0] + - - [64, 17, 6144, 21, 64, 64, 64, 21] + - [93, 3979.0] + - - [64, 21, 6144, 21, 64, 64, 64, 21] + - [100, 4707.0] + - - [64, 24, 4736, 24, 64, 64, 64, 24] + - [93, 6377.0] + - - [64, 24, 4736, 34, 64, 64, 64, 34] + - [125, 6999.0] + - - [64, 30, 2048, 30, 64, 64, 64, 30] + - [113, 6665.0] + - - [64, 31, 2048, 30, 64, 64, 64, 30] + - [113, 6780.0] + - - [64, 31, 2048, 31, 64, 64, 64, 31] + - [113, 6831.0] + - - [64, 27, 1920, 27, 64, 64, 64, 27] + - [93, 5684.0] + - - [1024, 8, 1, 1024, 1024, 1024, 1024, 1024] + - [122, 958.0] + - - [1024, 77, 1, 1024, 1024, 1024, 1024, 1024] + - [96, 4941.0] + - - [1024, 10, 1, 2, 1024, 1024, 1024, 2] + - [90, 5.0] + - - [1024, 10, 1, 1024, 1024, 1024, 1024, 1024] + - [122, 1012.0] + - - [1024, 39, 1, 2, 1024, 1024, 1024, 2] + - [94, 21.0] + - - [1024, 39, 1, 1024, 1024, 1024, 1024, 1024] + - [129, 3016.0] + - - [1024, 40, 1, 2, 1024, 1024, 1024, 2] + - [92, 21.0] + - - [1024, 40, 1, 1024, 1024, 1024, 1024, 1024] + - [129, 3017.0] + - - [1024, 41, 1, 2, 1024, 1024, 1024, 2] + - [101, 21.0] + - - [1024, 41, 1, 1024, 1024, 1024, 1024, 1024] + - [129, 3084.0] + - - [1024, 5, 1, 2, 1024, 1024, 1024, 2] + - [101, 3.0] + - - [1024, 5, 1, 1024, 1024, 1024, 1024, 1024] + - [122, 489.0] + - - [1024, 6, 1, 2, 1024, 1024, 1024, 2] + - [90, 3.0] + - - [1024, 6, 1, 1024, 1024, 1024, 1024, 1024] + - [106, 590.0] + - - [1024, 8, 1, 2, 1024, 1024, 1024, 2] + - [90, 4.0] + - - [1024, 9, 1, 2, 1024, 1024, 1024, 2] + - [94, 5.0] + - - [1024, 9, 1, 1024, 1024, 1024, 1024, 1024] + - [106, 872.0] + - - [64, 4, 32768, 4, 64, 64, 64, 4] + - [110, 327.0] + - - [64, 4, 38400, 4, 64, 64, 64, 4] + - [93, 331.0] + - - [128, 128, 1, 64, 128, 128, 128, 64] + - [112, 546.0] + - - [64, 128, 1, 128, 64, 64, 64, 128] + - [102, 437.0] + - - [64, 5, 1, 5, 64, 64, 64, 5] + - [140, 1.0] + - - [32, 33, 1, 33, 32, 32, 32, 33] + - [92, 8.0] + - - [1024, 16, 1, 2, 1024, 1024, 1024, 2] + - [101, 9.0] + - - [1024, 16, 1, 1024, 1024, 1024, 1024, 1024] + - [106, 1626.0] + - - [1024, 1, 1, 2, 1024, 1024, 1024, 2] + - [92, 1.0] + - - [1024, 1, 1, 1024, 1024, 1024, 1024, 1024] + - [98, 99.0] + - - [1024, 1, 1, 200, 1024, 1024, 1024, 200] + - [104, 37.0] + - - [1024, 1, 1, 1600, 1024, 1024, 1024, 1600] + - [98, 117.0] + - - [1024, 64, 1, 2, 1024, 1024, 1024, 2] + - [94, 35.0] + - - [1024, 64, 1, 1024, 1024, 1024, 1024, 1024] + - [130, 4231.0] + - - [1024, 80, 1, 1024, 1024, 1024, 1024, 1024] + - [130, 4472.0] + - - [1024, 80, 1, 2, 1024, 1024, 1024, 2] + - [99, 42.0] + - - [1024, 82, 1, 1024, 1024, 1024, 1024, 1024] + - [96, 4579.0] + - - [1024, 82, 1, 2, 1024, 1024, 1024, 2] + - [94, 43.0] + - - [1024, 12, 1, 1024, 1024, 1024, 1024, 1024] + - [131, 1208.0] + - - [1024, 12, 1, 2, 1024, 1024, 1024, 2] + - [99, 7.0] + - - [64, 24, 6816, 24, 64, 64, 64, 24] + - [116, 5119.0] + - - [64, 26, 6272, 26, 64, 64, 64, 26] + - [139, 5876.0] + - - [196, 256, 1, 2304, 196, 196, 196, 2304] + - [137, 4650.0] + - - [768, 3, 2, 256, 768, 768, 768, 256] + - [102, 322.0] + - - [768, 12, 2, 256, 768, 768, 768, 256] + - [120, 789.0] + - - [864, 12, 2, 256, 864, 864, 864, 256] + - [120, 870.0] + - - [864, 3, 2, 256, 864, 864, 864, 256] + - [102, 218.0] + - - [216, 3, 2, 256, 216, 216, 216, 256] + - [98, 56.0] + - - [176, 12, 2, 256, 176, 176, 176, 256] + - [104, 192.0] + - - [176, 3, 2, 256, 176, 176, 176, 256] + - [102, 47.0] + - - [192, 12, 2, 256, 192, 192, 192, 256] + - [136, 208.0] + - - [192, 3, 2, 256, 192, 192, 192, 256] + - [120, 53.0] + - - [216, 12, 2, 256, 216, 216, 216, 256] + - [102, 232.0] + - - [850, 3, 2, 256, 850, 850, 850, 256] + - [136, 220.0] + - - [850, 12, 2, 256, 850, 850, 850, 256] + - [138, 848.0] + - - [805, 12, 2, 256, 805, 805, 805, 256] + - [104, 816.0] + - - [805, 3, 2, 256, 805, 805, 805, 256] + - [102, 205.0] + - - [247, 3, 2, 256, 247, 247, 247, 256] + - [136, 65.0] + - - [950, 3, 2, 256, 950, 950, 950, 256] + - [102, 234.0] + - - [187, 12, 2, 256, 187, 187, 187, 256] + - [102, 191.0] + - - [247, 12, 2, 256, 247, 247, 247, 256] + - [102, 258.0] + - - [187, 3, 2, 256, 187, 187, 187, 256] + - [133, 58.0] + - - [228, 12, 2, 256, 228, 228, 228, 256] + - [136, 237.0] + - - [221, 12, 2, 256, 221, 221, 221, 256] + - [120, 236.0] + - - [950, 12, 2, 256, 950, 950, 950, 256] + - [104, 915.0] + - - [228, 3, 2, 256, 228, 228, 228, 256] + - [90, 63.0] + - - [221, 3, 2, 256, 221, 221, 221, 256] + - [102, 58.0] + - - [25, 128, 120, 256, 25, 25, 25, 256] + - [119, 3867.0] + - - [25, 128, 139, 256, 25, 25, 25, 256] + - [99, 4141.0] + - - [25, 128, 160, 256, 25, 25, 25, 256] + - [103, 4001.0] + - - [25, 128, 18, 256, 25, 25, 25, 256] + - [95, 1825.0] + - - [25, 128, 19, 256, 25, 25, 25, 256] + - [95, 1907.0] + - - [9, 128, 120, 256, 9, 9, 9, 256] + - [102, 1617.0] + - - [9, 128, 139, 256, 9, 9, 9, 256] + - [118, 1765.0] + - - [9, 128, 160, 256, 9, 9, 9, 256] + - [118, 1809.0] + - - [9, 128, 18, 256, 9, 9, 9, 256] + - [102, 781.0] + - - [9, 128, 19, 256, 9, 9, 9, 256] + - [118, 824.0] + - - [100, 512, 1, 2304, 100, 100, 100, 2304] + - [139, 4201.0] + - - [25, 256, 1, 1152, 25, 25, 25, 1152] + - [141, 678.0] + - - [9, 256, 1, 1152, 9, 9, 9, 1152] + - [120, 245.0] + - - [1024, 20, 1, 1024, 1024, 1024, 1024, 1024] + - [98, 1644.0] + - - [1024, 20, 1, 2, 1024, 1024, 1024, 2] + - [90, 10.0] +- null +- null +- DeviceEfficiency +... diff --git a/library/src/blas3/Tensile/Logic/asm_full/navi22_Cijk_Ailk_Bljk_HB_GB.yaml b/library/src/blas3/Tensile/Logic/asm_full/navi22_Cijk_Ailk_Bljk_HB_GB.yaml new file mode 100644 index 000000000..222fbf869 --- /dev/null +++ b/library/src/blas3/Tensile/Logic/asm_full/navi22_Cijk_Ailk_Bljk_HB_GB.yaml @@ -0,0 +1,24968 @@ +--- +- {MinimumRequiredVersion: 4.28.0} +- navi22 +- gfx1031 +- [Device 73df] +- AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] +- - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 0 + SolutionNameMin: Cijk_Ailk_Bljk_HB_GB_MT128x64x8_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 128 + LVCA: 16 + LVCB: 1 + LVPA: 1 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 1 + SolutionNameMin: Cijk_Ailk_Bljk_HB_GB_MT128x128x8_SN_SU0_SUM0_TT8_16_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 256 + LSCB: 8 + LSPA: 4 + LSPB: 128 + LVCA: 32 + LVCB: 1 + LVPA: 1 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 256 + MacroTile1: 128 + MacroTileA: 256 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 256 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 2 + SolutionNameMin: Cijk_Ailk_Bljk_HB_GB_MT256x128x8_SN_SU0_SUM0_TT16_16_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [16, 16] + ThreadTile0: 16 + ThreadTile1: 16 + ThreadTileA: 16 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 8 + LSPB: 64 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 3 + SolutionNameMin: Cijk_Ailk_Bljk_HB_GB_MT128x64x16_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 8 + LSPB: 64 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 4 + SolutionNameMin: Cijk_Ailk_Bljk_HB_GB_MT128x128x16_SN_SU0_SUM0_TT8_16_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 256 + LSCB: 16 + LSPA: 4 + LSPB: 64 + LVCA: 32 + LVCB: 2 + LVPA: 1 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 256 + MacroTile1: 128 + MacroTileA: 256 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 256 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 5 + SolutionNameMin: Cijk_Ailk_Bljk_HB_GB_MT256x128x16_SN_SU0_SUM0_TT16_16_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [16, 16] + ThreadTile0: 16 + ThreadTile1: 16 + ThreadTileA: 16 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 16 + LSPB: 128 + LVCA: 16 + LVCB: 2 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 6 + SolutionNameMin: Cijk_Ailk_Bljk_HB_GB_MT128x128x16_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 16 + LSPB: 128 + LVCA: 16 + LVCB: 2 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 256 + MacroTileA: 128 + MacroTileB: 256 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 7 + SolutionNameMin: Cijk_Ailk_Bljk_HB_GB_MT128x256x16_SN_SU0_SUM0_TT8_16_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 256 + LSCB: 8 + LSPA: 4 + LSPB: 128 + LVCA: 32 + LVCB: 1 + LVPA: 1 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 256 + MacroTile1: 128 + MacroTileA: 256 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 256 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 8 + SolutionNameMin: Cijk_Ailk_Bljk_HB_GB_MT256x128x8_SN_SU32_SUM3_TT16_16_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [16, 16] + ThreadTile0: 16 + ThreadTile1: 16 + ThreadTileA: 16 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 8 + LSPB: 64 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 9 + SolutionNameMin: Cijk_Ailk_Bljk_HB_GB_MT128x64x16_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 8 + LSPB: 64 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 10 + SolutionNameMin: Cijk_Ailk_Bljk_HB_GB_MT128x128x16_SN_SU32_SUM3_TT8_16_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 256 + LSCB: 16 + LSPA: 4 + LSPB: 64 + LVCA: 32 + LVCB: 2 + LVPA: 1 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 256 + MacroTile1: 128 + MacroTileA: 256 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 256 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 11 + SolutionNameMin: Cijk_Ailk_Bljk_HB_GB_MT256x128x16_SN_SU32_SUM3_TT16_16_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [16, 16] + ThreadTile0: 16 + ThreadTile1: 16 + ThreadTileA: 16 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 16 + LSPB: 128 + LVCA: 16 + LVCB: 2 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 12 + SolutionNameMin: Cijk_Ailk_Bljk_HB_GB_MT128x128x16_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 16 + LSPB: 128 + LVCA: 16 + LVCB: 2 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 256 + MacroTileA: 128 + MacroTileB: 256 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 13 + SolutionNameMin: Cijk_Ailk_Bljk_HB_GB_MT128x256x16_SN_SU32_SUM3_TT8_16_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 14 + SolutionNameMin: Cijk_Ailk_Bljk_HB_GB_MT128x64x8_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 128 + LVCA: 16 + LVCB: 1 + LVPA: 1 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 15 + SolutionNameMin: Cijk_Ailk_Bljk_HB_GB_MT128x128x8_SN_SU0_SUM0_TT8_16_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 256 + LSCB: 8 + LSPA: 4 + LSPB: 128 + LVCA: 32 + LVCB: 1 + LVPA: 1 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 256 + MacroTile1: 128 + MacroTileA: 256 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 256 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 16 + SolutionNameMin: Cijk_Ailk_Bljk_HB_GB_MT256x128x8_SN_SU0_SUM0_TT16_16_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [16, 16] + ThreadTile0: 16 + ThreadTile1: 16 + ThreadTileA: 16 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 8 + LSPB: 64 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 17 + SolutionNameMin: Cijk_Ailk_Bljk_HB_GB_MT128x64x16_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 8 + LSPB: 64 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 18 + SolutionNameMin: Cijk_Ailk_Bljk_HB_GB_MT128x128x16_SN_SU0_SUM0_TT8_16_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 256 + LSCB: 16 + LSPA: 4 + LSPB: 64 + LVCA: 32 + LVCB: 2 + LVPA: 1 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 256 + MacroTile1: 128 + MacroTileA: 256 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 256 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 19 + SolutionNameMin: Cijk_Ailk_Bljk_HB_GB_MT256x128x16_SN_SU0_SUM0_TT16_16_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [16, 16] + ThreadTile0: 16 + ThreadTile1: 16 + ThreadTileA: 16 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 16 + LSPB: 128 + LVCA: 16 + LVCB: 2 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 20 + SolutionNameMin: Cijk_Ailk_Bljk_HB_GB_MT128x128x16_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 16 + LSPB: 128 + LVCA: 16 + LVCB: 2 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 256 + MacroTileA: 128 + MacroTileB: 256 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 21 + SolutionNameMin: Cijk_Ailk_Bljk_HB_GB_MT128x256x16_SN_SU0_SUM0_TT8_16_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 8 + LSPB: 64 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 22 + SolutionNameMin: Cijk_Ailk_Bljk_HB_GB_MT128x64x16_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 8 + LSPB: 64 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 23 + SolutionNameMin: Cijk_Ailk_Bljk_HB_GB_MT128x128x16_SN_SU32_SUM3_TT8_16_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 16 + LSPB: 128 + LVCA: 16 + LVCB: 2 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 256 + MacroTileA: 128 + MacroTileB: 256 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 24 + SolutionNameMin: Cijk_Ailk_Bljk_HB_GB_MT128x256x16_SN_SU32_SUM3_TT8_16_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 25 + SolutionNameMin: Cijk_Ailk_Bljk_HB_GB_MT128x64x8_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 256 + LSCB: 8 + LSPA: 4 + LSPB: 128 + LVCA: 32 + LVCB: 1 + LVPA: 1 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 256 + MacroTile1: 128 + MacroTileA: 256 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 256 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 26 + SolutionNameMin: Cijk_Ailk_Bljk_HB_GB_MT256x128x8_SN_SU0_SUM0_TT16_16_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [16, 16] + ThreadTile0: 16 + ThreadTile1: 16 + ThreadTileA: 16 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 8 + LSPB: 64 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 27 + SolutionNameMin: Cijk_Ailk_Bljk_HB_GB_MT128x64x16_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 8 + LSPB: 64 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 28 + SolutionNameMin: Cijk_Ailk_Bljk_HB_GB_MT128x128x16_SN_SU0_SUM0_TT8_16_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 256 + LSCB: 16 + LSPA: 4 + LSPB: 64 + LVCA: 32 + LVCB: 2 + LVPA: 1 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 256 + MacroTile1: 128 + MacroTileA: 256 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 256 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 29 + SolutionNameMin: Cijk_Ailk_Bljk_HB_GB_MT256x128x16_SN_SU0_SUM0_TT16_16_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [16, 16] + ThreadTile0: 16 + ThreadTile1: 16 + ThreadTileA: 16 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 16 + LSPB: 128 + LVCA: 16 + LVCB: 2 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 30 + SolutionNameMin: Cijk_Ailk_Bljk_HB_GB_MT128x128x16_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 16 + LSPB: 128 + LVCA: 16 + LVCB: 2 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 256 + MacroTileA: 128 + MacroTileB: 256 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 31 + SolutionNameMin: Cijk_Ailk_Bljk_HB_GB_MT128x256x16_SN_SU0_SUM0_TT8_16_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 8 + LSPB: 32 + LVCA: 16 + LVCB: 4 + LVPA: 1 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 32 + SolutionNameMin: Cijk_Ailk_Bljk_HB_GB_MT128x128x32_SN_SU0_SUM0_TT8_16_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 33 + SolutionNameMin: Cijk_Ailk_Bljk_HB_GB_MT128x64x8_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 8 + LSPB: 64 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 34 + SolutionNameMin: Cijk_Ailk_Bljk_HB_GB_MT128x64x16_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 8 + LSPB: 64 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 35 + SolutionNameMin: Cijk_Ailk_Bljk_HB_GB_MT128x128x16_SN_SU32_SUM3_TT8_16_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 16 + LSPB: 128 + LVCA: 16 + LVCB: 2 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 256 + MacroTileA: 128 + MacroTileB: 256 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 36 + SolutionNameMin: Cijk_Ailk_Bljk_HB_GB_MT128x256x16_SN_SU32_SUM3_TT8_16_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 8 + LVCB: 1 + LVPA: 1 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 37 + SolutionNameMin: Cijk_Ailk_Bljk_HB_GB_MT64x64x8_SN_SU0_SUM0_TT8_8_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 38 + SolutionNameMin: Cijk_Ailk_Bljk_HB_GB_MT128x64x8_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 8 + LSPB: 32 + LVCA: 8 + LVCB: 2 + LVPA: 1 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 39 + SolutionNameMin: Cijk_Ailk_Bljk_HB_GB_MT64x64x16_SN_SU0_SUM0_TT8_8_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 8 + LSPB: 64 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 40 + SolutionNameMin: Cijk_Ailk_Bljk_HB_GB_MT128x64x16_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 16 + LSPB: 128 + LVCA: 16 + LVCB: 2 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 41 + SolutionNameMin: Cijk_Ailk_Bljk_HB_GB_MT128x128x16_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 8 + LSPB: 32 + LVCA: 16 + LVCB: 4 + LVPA: 1 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 42 + SolutionNameMin: Cijk_Ailk_Bljk_HB_GB_MT128x64x32_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 43 + SolutionNameMin: Cijk_Ailk_Bljk_HB_GB_MT128x64x8_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 8 + LSPB: 32 + LVCA: 8 + LVCB: 2 + LVPA: 1 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 44 + SolutionNameMin: Cijk_Ailk_Bljk_HB_GB_MT64x64x16_SN_SU32_SUM3_TT8_8_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 8 + LSPB: 64 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 45 + SolutionNameMin: Cijk_Ailk_Bljk_HB_GB_MT128x64x16_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 16 + LSPB: 128 + LVCA: 16 + LVCB: 2 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 46 + SolutionNameMin: Cijk_Ailk_Bljk_HB_GB_MT128x128x16_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 8 + LSPB: 16 + LVCA: 8 + LVCB: 4 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 47 + SolutionNameMin: Cijk_Ailk_Bljk_HB_GB_MT64x64x32_SN_SU32_SUM3_TT8_8_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 48 + SolutionNameMin: Cijk_Ailk_Bljk_HB_GB_MT128x64x8_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 8 + LSPB: 32 + LVCA: 8 + LVCB: 2 + LVPA: 1 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 49 + SolutionNameMin: Cijk_Ailk_Bljk_HB_GB_MT64x64x16_SN_SU0_SUM0_TT8_8_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 8 + LSPB: 64 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 50 + SolutionNameMin: Cijk_Ailk_Bljk_HB_GB_MT128x64x16_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 16 + LSPB: 128 + LVCA: 16 + LVCB: 2 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 51 + SolutionNameMin: Cijk_Ailk_Bljk_HB_GB_MT128x128x16_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 52 + SolutionNameMin: Cijk_Ailk_Bljk_HB_GB_MT128x64x8_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 8 + LSPB: 32 + LVCA: 8 + LVCB: 2 + LVPA: 1 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 53 + SolutionNameMin: Cijk_Ailk_Bljk_HB_GB_MT64x64x16_SN_SU32_SUM3_TT8_8_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 8 + LSPB: 64 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 54 + SolutionNameMin: Cijk_Ailk_Bljk_HB_GB_MT128x64x16_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 55 + SolutionNameMin: Cijk_Ailk_Bljk_HB_GB_MT128x64x8_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 8 + LSPB: 32 + LVCA: 8 + LVCB: 2 + LVPA: 1 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 56 + SolutionNameMin: Cijk_Ailk_Bljk_HB_GB_MT64x64x16_SN_SU0_SUM0_TT8_8_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 8 + LSPB: 64 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 57 + SolutionNameMin: Cijk_Ailk_Bljk_HB_GB_MT128x64x16_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 16 + LSPB: 128 + LVCA: 16 + LVCB: 2 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 58 + SolutionNameMin: Cijk_Ailk_Bljk_HB_GB_MT128x128x16_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 8 + LSPB: 32 + LVCA: 16 + LVCB: 4 + LVPA: 1 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 59 + SolutionNameMin: Cijk_Ailk_Bljk_HB_GB_MT128x64x32_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 60 + SolutionNameMin: Cijk_Ailk_Bljk_HB_GB_MT128x64x8_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 8 + LSPB: 32 + LVCA: 8 + LVCB: 2 + LVPA: 1 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 61 + SolutionNameMin: Cijk_Ailk_Bljk_HB_GB_MT64x64x16_SN_SU32_SUM3_TT8_8_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 8 + LSPB: 64 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 62 + SolutionNameMin: Cijk_Ailk_Bljk_HB_GB_MT128x64x16_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 8 + LSPA: 8 + LSPB: 16 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 896 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 63 + SolutionNameMin: Cijk_Ailk_Bljk_HB_GB_MT32x16x8_SN_SU0_SUM0_TT2_2_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 4 + LSPB: 32 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 64 + SolutionNameMin: Cijk_Ailk_Bljk_HB_GB_MT64x32x8_SN_SU0_SUM0_TT4_4_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 4 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 65 + SolutionNameMin: Cijk_Ailk_Bljk_HB_GB_MT64x64x8_SN_SU0_SUM0_TT4_4_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 8 + LSPB: 16 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 66 + SolutionNameMin: Cijk_Ailk_Bljk_HB_GB_MT32x16x16_SN_SU0_SUM0_TT2_2_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 4 + LSPB: 16 + LVCA: 32 + LVCB: 8 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 67 + SolutionNameMin: Cijk_Ailk_Bljk_HB_GB_MT64x32x16_SN_SU0_SUM0_TT4_4_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 32 + LSPA: 8 + LSPB: 4 + LVCA: 8 + LVCB: 16 + LVPA: 4 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 68 + SolutionNameMin: Cijk_Ailk_Bljk_HB_GB_MT16x16x32_SN_SU0_SUM0_TT2_2_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 69 + SolutionNameMin: Cijk_Ailk_Bljk_HB_GB_MT32x16x32_SN_SU0_SUM0_TT2_2_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 4 + LSPB: 32 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 70 + SolutionNameMin: Cijk_Ailk_Bljk_HB_GB_MT64x32x8_SN_SU32_SUM3_TT4_4_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 4 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 71 + SolutionNameMin: Cijk_Ailk_Bljk_HB_GB_MT64x64x8_SN_SU32_SUM3_TT4_4_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 8 + LSPB: 16 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 72 + SolutionNameMin: Cijk_Ailk_Bljk_HB_GB_MT32x16x16_SN_SU32_SUM3_TT2_2_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 4 + LSPB: 32 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 73 + SolutionNameMin: Cijk_Ailk_Bljk_HB_GB_MT64x32x8_SN_SU0_SUM0_TT4_4_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 4 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 74 + SolutionNameMin: Cijk_Ailk_Bljk_HB_GB_MT64x64x8_SN_SU0_SUM0_TT4_4_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 4 + LSPB: 16 + LVCA: 32 + LVCB: 8 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 75 + SolutionNameMin: Cijk_Ailk_Bljk_HB_GB_MT64x32x16_SN_SU0_SUM0_TT4_4_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 32 + LSPA: 8 + LSPB: 4 + LVCA: 8 + LVCB: 16 + LVPA: 4 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 76 + SolutionNameMin: Cijk_Ailk_Bljk_HB_GB_MT16x16x32_SN_SU0_SUM0_TT2_2_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 77 + SolutionNameMin: Cijk_Ailk_Bljk_HB_GB_MT32x16x32_SN_SU0_SUM0_TT2_2_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 8 + LSPA: 8 + LSPB: 16 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 896 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 78 + SolutionNameMin: Cijk_Ailk_Bljk_HB_GB_MT32x16x8_SN_SU32_SUM3_TT2_2_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 4 + LSPB: 32 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 79 + SolutionNameMin: Cijk_Ailk_Bljk_HB_GB_MT64x32x8_SN_SU32_SUM3_TT4_4_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 8 + LSPB: 16 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 80 + SolutionNameMin: Cijk_Ailk_Bljk_HB_GB_MT32x16x16_SN_SU32_SUM3_TT2_2_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 4 + LSPB: 16 + LVCA: 32 + LVCB: 8 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 81 + SolutionNameMin: Cijk_Ailk_Bljk_HB_GB_MT64x32x16_SN_SU32_SUM3_TT4_4_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 82 + SolutionNameMin: Cijk_Ailk_Bljk_HB_GB_MT32x16x32_SN_SU32_SUM3_TT2_2_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 4 + LSPB: 32 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 83 + SolutionNameMin: Cijk_Ailk_Bljk_HB_GB_MT64x32x8_SN_SU0_SUM0_TT4_4_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 4 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 84 + SolutionNameMin: Cijk_Ailk_Bljk_HB_GB_MT64x64x8_SN_SU0_SUM0_TT4_4_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 8 + LSPB: 16 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 85 + SolutionNameMin: Cijk_Ailk_Bljk_HB_GB_MT32x16x16_SN_SU0_SUM0_TT2_2_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 4 + LSPB: 16 + LVCA: 32 + LVCB: 8 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 86 + SolutionNameMin: Cijk_Ailk_Bljk_HB_GB_MT64x32x16_SN_SU0_SUM0_TT4_4_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 8 + LSPB: 32 + LVCA: 32 + LVCB: 8 + LVPA: 4 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 87 + SolutionNameMin: Cijk_Ailk_Bljk_HB_GB_MT64x64x16_SN_SU0_SUM0_TT4_4_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 4 + LSPB: 32 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 88 + SolutionNameMin: Cijk_Ailk_Bljk_HB_GB_MT64x32x8_SN_SU32_SUM3_TT4_4_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 4 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 89 + SolutionNameMin: Cijk_Ailk_Bljk_HB_GB_MT64x64x8_SN_SU32_SUM3_TT4_4_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 90 + SolutionNameMin: Cijk_Ailk_Bljk_HB_GB_MT16x16x16_SN_SU32_SUM3_TT2_2_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 4 + LSPB: 16 + LVCA: 32 + LVCB: 8 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1664 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 91 + SolutionNameMin: Cijk_Ailk_Bljk_HB_GB_MT64x16x8_SN_SU0_SUM0_TT4_2_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 4 + LSPB: 16 + LVCA: 32 + LVCB: 8 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3328 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 92 + SolutionNameMin: Cijk_Ailk_Bljk_HB_GB_MT64x16x16_SN_SU0_SUM0_TT4_2_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 6400 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 8 + MacroTileA: 64 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 8 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 93 + SolutionNameMin: Cijk_Ailk_Bljk_HB_GB_MT64x8x32_SN_SU0_SUM0_TT2_2_WG32_4_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 32 + SubGroup1: 4 + SubGroupA: 32 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 2 + LSPB: 8 + LVCA: 64 + LVCB: 16 + LVPA: 1 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 12544 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 8 + MacroTileA: 128 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 16 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 94 + SolutionNameMin: Cijk_Ailk_Bljk_HB_GB_MT128x8x32_SN_SU0_SUM0_TT4_2_WG32_4_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 32 + SubGroup1: 4 + SubGroupA: 32 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 6400 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 8 + MacroTileA: 64 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 8 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 95 + SolutionNameMin: Cijk_Ailk_Bljk_HB_GB_MT64x8x32_SN_SU32_SUM3_TT2_2_WG32_4_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 32 + SubGroup1: 4 + SubGroupA: 32 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3200 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 8 + MacroTileA: 64 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 96 + SolutionNameMin: Cijk_Ailk_Bljk_HB_GB_MT64x8x16_SN_SU0_SUM0_TT2_2_WG32_4_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 32 + SubGroup1: 4 + SubGroupA: 32 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 2 + LSPB: 8 + LVCA: 64 + LVCB: 16 + LVPA: 1 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 6272 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 8 + MacroTileA: 128 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 97 + SolutionNameMin: Cijk_Ailk_Bljk_HB_GB_MT128x8x16_SN_SU0_SUM0_TT4_2_WG32_4_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 32 + SubGroup1: 4 + SubGroupA: 32 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 6400 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 8 + MacroTileA: 64 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 8 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 98 + SolutionNameMin: Cijk_Ailk_Bljk_HB_GB_MT64x8x32_SN_SU0_SUM0_TT2_2_WG32_4_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 32 + SubGroup1: 4 + SubGroupA: 32 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 2 + LSPB: 8 + LVCA: 64 + LVCB: 16 + LVPA: 1 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 12544 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 8 + MacroTileA: 128 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 16 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 99 + SolutionNameMin: Cijk_Ailk_Bljk_HB_GB_MT128x8x32_SN_SU0_SUM0_TT4_2_WG32_4_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 32 + SubGroup1: 4 + SubGroupA: 32 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 4 + LSPB: 16 + LVCA: 32 + LVCB: 8 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3328 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 100 + SolutionNameMin: Cijk_Ailk_Bljk_HB_GB_MT64x16x16_SN_SU32_SUM3_TT4_2_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 6400 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 8 + MacroTileA: 64 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 8 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 101 + SolutionNameMin: Cijk_Ailk_Bljk_HB_GB_MT64x8x32_SN_SU32_SUM3_TT2_2_WG32_4_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 32 + SubGroup1: 4 + SubGroupA: 32 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3200 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 8 + MacroTileA: 64 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 102 + SolutionNameMin: Cijk_Ailk_Bljk_HB_GB_MT64x8x16_SN_SU0_SUM0_TT2_2_WG32_4_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 32 + SubGroup1: 4 + SubGroupA: 32 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 6400 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 8 + MacroTileA: 64 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 8 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 103 + SolutionNameMin: Cijk_Ailk_Bljk_HB_GB_MT64x8x32_SN_SU0_SUM0_TT2_2_WG32_4_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 32 + SubGroup1: 4 + SubGroupA: 32 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 +- [2, 3, 0, 1] +- - - [2944, 4288, 1, 1280, 2944, 2944, 2944, 1280] + - [28, 24222.0] + - - [2368, 5888, 1, 256, 2368, 2368, 2368, 256] + - [18, 23461.0] + - - [512, 24000, 1, 1536, 512, 512, 512, 1536] + - [23, 24683.0] + - - [5888, 1856, 1, 3328, 5888, 5888, 5888, 3328] + - [4, 24013.0] + - - [5888, 2944, 1, 3328, 5888, 5888, 5888, 3328] + - [4, 25272.0] + - - [1856, 4288, 1, 256, 1856, 1856, 1856, 256] + - [28, 21587.0] + - - [5056, 5056, 1, 3328, 5056, 5056, 5056, 3328] + - [16, 24867.0] + - - [1408, 5888, 1, 1280, 1408, 1408, 1408, 1280] + - [31, 24303.0] + - - [6144, 6000, 1, 2560, 6144, 6144, 6144, 2560] + - [18, 25298.0] + - - [1024, 3584, 1, 3328, 1024, 1024, 1024, 3328] + - [21, 23152.0] + - - [512, 48000, 1, 2048, 512, 512, 512, 2048] + - [10, 25263.0] + - - [448, 3584, 1, 3328, 448, 448, 448, 3328] + - [7, 19394.0] + - - [5888, 1408, 1, 1280, 5888, 5888, 5888, 1280] + - [4, 24327.0] + - - [1024, 2368, 1, 256, 1024, 1024, 1024, 256] + - [17, 20339.0] + - - [5056, 6784, 1, 1280, 5056, 5056, 5056, 1280] + - [2, 25059.0] + - - [5056, 5056, 1, 1280, 5056, 5056, 5056, 1280] + - [16, 24697.0] + - - [4288, 6784, 1, 256, 4288, 4288, 4288, 256] + - [18, 24035.0] + - - [6784, 448, 1, 256, 6784, 6784, 6784, 256] + - [27, 20475.0] + - - [5056, 256, 1, 1280, 5056, 5056, 5056, 1280] + - [6, 21405.0] + - - [5888, 704, 1, 1280, 5888, 5888, 5888, 1280] + - [4, 21820.0] + - - [3584, 1024, 1, 256, 3584, 3584, 3584, 256] + - [4, 21569.0] + - - [6784, 4288, 1, 3328, 6784, 6784, 6784, 3328] + - [4, 24838.0] + - - [1856, 2368, 1, 3328, 1856, 1856, 1856, 3328] + - [28, 22135.0] + - - [5888, 2944, 1, 1280, 5888, 5888, 5888, 1280] + - [4, 25231.0] + - - [5888, 1024, 1, 256, 5888, 5888, 5888, 256] + - [4, 23072.0] + - - [1408, 2944, 1, 256, 1408, 1408, 1408, 256] + - [4, 22530.0] + - - [6784, 5056, 1, 3328, 6784, 6784, 6784, 3328] + - [18, 25127.0] + - - [5056, 5056, 1, 256, 5056, 5056, 5056, 256] + - [4, 23709.0] + - - [1024, 3584, 1, 1280, 1024, 1024, 1024, 1280] + - [7, 22837.0] + - - [2368, 2944, 1, 1280, 2368, 2368, 2368, 1280] + - [4, 23919.0] + - - [6784, 6784, 1, 1280, 6784, 6784, 6784, 1280] + - [4, 25369.0] + - - [1408, 4288, 1, 1280, 1408, 1408, 1408, 1280] + - [18, 23730.0] + - - [3584, 4288, 1, 1280, 3584, 3584, 3584, 1280] + - [4, 24663.0] + - - [512, 6000, 1, 2560, 512, 512, 512, 2560] + - [24, 22665.0] + - - [2368, 704, 1, 1280, 2368, 2368, 2368, 1280] + - [25, 19714.0] + - - [5056, 4288, 1, 3328, 5056, 5056, 5056, 3328] + - [16, 24767.0] + - - [3584, 2368, 1, 3328, 3584, 3584, 3584, 3328] + - [18, 24077.0] + - - [5888, 6784, 1, 1280, 5888, 5888, 5888, 1280] + - [16, 25524.0] + - - [6784, 448, 1, 1280, 6784, 6784, 6784, 1280] + - [27, 21344.0] + - - [2944, 5888, 1, 256, 2944, 2944, 2944, 256] + - [18, 24672.0] + - - [4288, 2944, 1, 256, 4288, 4288, 4288, 256] + - [10, 23196.0] + - - [6144, 24000, 1, 2560, 6144, 6144, 6144, 2560] + - [5, 25603.0] + - - [5056, 2368, 1, 1280, 5056, 5056, 5056, 1280] + - [18, 24083.0] + - - [448, 3584, 1, 1280, 448, 448, 448, 1280] + - [28, 18779.0] + - - [6784, 5888, 1, 256, 6784, 6784, 6784, 256] + - [18, 24993.0] + - - [1024, 1408, 1, 256, 1024, 1024, 1024, 256] + - [3, 18717.0] + - - [2368, 2368, 1, 3328, 2368, 2368, 2368, 3328] + - [4, 22547.0] + - - [5056, 704, 1, 3328, 5056, 5056, 5056, 3328] + - [21, 22269.0] + - - [1408, 1856, 1, 256, 1408, 1408, 1408, 256] + - [3, 20789.0] + - - [5888, 1856, 1, 256, 5888, 5888, 5888, 256] + - [23, 22871.0] + - - [704, 5888, 1, 256, 704, 704, 704, 256] + - [28, 20174.0] + - - [3584, 704, 1, 3328, 3584, 3584, 3584, 3328] + - [3, 21288.0] + - - [1408, 1408, 1, 256, 1408, 1408, 1408, 256] + - [3, 19297.0] + - - [448, 4288, 1, 256, 448, 448, 448, 256] + - [20, 16888.0] + - - [704, 2368, 1, 1280, 704, 704, 704, 1280] + - [28, 19434.0] + - - [1856, 2368, 1, 1280, 1856, 1856, 1856, 1280] + - [18, 21886.0] + - - [1408, 1408, 1, 3328, 1408, 1408, 1408, 3328] + - [4, 20932.0] + - - [256, 193600, 1, 64, 256, 256, 256, 64] + - [0, 21806.0] + - - [1408, 1024, 1, 1280, 1408, 1408, 1408, 1280] + - [17, 20680.0] + - - [704, 6784, 1, 256, 704, 704, 704, 256] + - [4, 20136.0] + - - [6784, 704, 1, 256, 6784, 6784, 6784, 256] + - [27, 21095.0] + - - [2048, 7000, 1, 2048, 2048, 2048, 2048, 2048] + - [23, 25047.0] + - - [5056, 704, 1, 256, 5056, 5056, 5056, 256] + - [33, 20249.0] + - - [1408, 3584, 1, 256, 1408, 1408, 1408, 256] + - [28, 22712.0] + - - [3584, 4288, 1, 3328, 3584, 3584, 3584, 3328] + - [16, 24787.0] + - - [5888, 1856, 1, 1280, 5888, 5888, 5888, 1280] + - [4, 23880.0] + - - [2368, 3584, 1, 1280, 2368, 2368, 2368, 1280] + - [4, 23923.0] + - - [2944, 3584, 1, 3328, 2944, 2944, 2944, 3328] + - [18, 24695.0] + - - [6784, 2944, 1, 256, 6784, 6784, 6784, 256] + - [10, 24750.0] + - - [1024, 1500, 1, 2560, 1024, 1024, 1024, 2560] + - [18, 21410.0] + - - [1856, 2368, 1, 256, 1856, 1856, 1856, 256] + - [0, 20435.0] + - - [3584, 6784, 1, 3328, 3584, 3584, 3584, 3328] + - [28, 25193.0] + - - [5056, 4288, 1, 1280, 5056, 5056, 5056, 1280] + - [16, 24635.0] + - - [6784, 1856, 1, 3328, 6784, 6784, 6784, 3328] + - [18, 24300.0] + - - [1408, 5056, 1, 1280, 1408, 1408, 1408, 1280] + - [28, 24346.0] + - - [196, 1024, 64, 256, 196, 196, 196, 256] + - [28, 17236.0] + - - [6784, 5888, 1, 3328, 6784, 6784, 6784, 3328] + - [21, 25465.0] + - - [2368, 5056, 1, 1280, 2368, 2368, 2368, 1280] + - [28, 24069.0] + - - [1024, 5056, 1, 1280, 1024, 1024, 1024, 1280] + - [35, 24056.0] + - - [4288, 1024, 1, 256, 4288, 4288, 4288, 256] + - [4, 21493.0] + - - [2368, 1408, 1, 256, 2368, 2368, 2368, 256] + - [3, 20420.0] + - - [5888, 448, 1, 1280, 5888, 5888, 5888, 1280] + - [17, 20824.0] + - - [704, 5888, 1, 3328, 704, 704, 704, 3328] + - [31, 22403.0] + - - [1024, 6784, 1, 1280, 1024, 1024, 1024, 1280] + - [18, 24052.0] + - - [3584, 2944, 1, 1280, 3584, 3584, 3584, 1280] + - [4, 24564.0] + - - [512, 6000, 1, 2816, 512, 512, 512, 2816] + - [24, 22863.0] + - - [512, 24000, 1, 2048, 512, 512, 512, 2048] + - [35, 24793.0] + - - [1408, 5056, 1, 3328, 1408, 1408, 1408, 3328] + - [31, 24715.0] + - - [1856, 1856, 1, 3328, 1856, 1856, 1856, 3328] + - [31, 21644.0] + - - [2368, 2368, 1, 256, 2368, 2368, 2368, 256] + - [10, 20962.0] + - - [4288, 4288, 1, 1280, 4288, 4288, 4288, 1280] + - [4, 24450.0] + - - [5888, 1024, 1, 1280, 5888, 5888, 5888, 1280] + - [4, 23997.0] + - - [1024, 12544, 1, 256, 1024, 1024, 1024, 256] + - [4, 24034.0] + - - [512, 48000, 1, 2560, 512, 512, 512, 2560] + - [10, 25354.0] + - - [704, 6784, 1, 3328, 704, 704, 704, 3328] + - [4, 22534.0] + - - [5888, 5888, 1, 3328, 5888, 5888, 5888, 3328] + - [16, 25589.0] + - - [5056, 1024, 1, 1280, 5056, 5056, 5056, 1280] + - [7, 23964.0] + - - [448, 5888, 1, 3328, 448, 448, 448, 3328] + - [31, 19767.0] + - - [1024, 2944, 1, 1280, 1024, 1024, 1024, 1280] + - [28, 22274.0] + - - [5056, 5888, 1, 1280, 5056, 5056, 5056, 1280] + - [16, 25089.0] + - - [448, 6784, 1, 256, 448, 448, 448, 256] + - [20, 17788.0] + - - [3584, 5888, 1, 256, 3584, 3584, 3584, 256] + - [18, 24574.0] + - - [2944, 3584, 1, 256, 2944, 2944, 2944, 256] + - [4, 23857.0] + - - [3072, 1500, 1, 1024, 3072, 3072, 3072, 1024] + - [18, 22749.0] + - - [6784, 1024, 1, 3328, 6784, 6784, 6784, 3328] + - [21, 24257.0] + - - [6784, 2944, 1, 3328, 6784, 6784, 6784, 3328] + - [4, 25378.0] + - - [6784, 2368, 1, 1280, 6784, 6784, 6784, 1280] + - [4, 24286.0] + - - [4288, 3584, 1, 256, 4288, 4288, 4288, 256] + - [4, 23706.0] + - - [4288, 5888, 1, 1280, 4288, 4288, 4288, 1280] + - [4, 24735.0] + - - [1024, 6000, 1, 1536, 1024, 1024, 1024, 1536] + - [23, 24225.0] + - - [4288, 1856, 1, 1280, 4288, 4288, 4288, 1280] + - [4, 23298.0] + - - [1856, 2944, 1, 3328, 1856, 1856, 1856, 3328] + - [21, 23144.0] + - - [256, 6784, 1, 3328, 256, 256, 256, 3328] + - [4, 20990.0] + - - [512, 3000, 1, 1536, 512, 512, 512, 1536] + - [35, 20569.0] + - - [5056, 1024, 1, 256, 5056, 5056, 5056, 256] + - [12, 21551.0] + - - [5056, 1856, 1, 3328, 5056, 5056, 5056, 3328] + - [18, 24018.0] + - - [4096, 7000, 1, 4096, 4096, 4096, 4096, 4096] + - [19, 25424.0] + - - [5056, 256, 1, 3328, 5056, 5056, 5056, 3328] + - [4, 21866.0] + - - [1024, 5888, 1, 1280, 1024, 1024, 1024, 1280] + - [28, 24041.0] + - - [5056, 3584, 1, 256, 5056, 5056, 5056, 256] + - [10, 23875.0] + - - [1856, 1024, 1, 1280, 1856, 1856, 1856, 1280] + - [4, 21984.0] + - - [1856, 1856, 1, 1280, 1856, 1856, 1856, 1280] + - [4, 21203.0] + - - [3072, 24000, 1, 1024, 3072, 3072, 3072, 1024] + - [19, 25452.0] + - - [1856, 1024, 1, 3328, 1856, 1856, 1856, 3328] + - [7, 22698.0] + - - [6784, 1024, 1, 256, 6784, 6784, 6784, 256] + - [18, 23138.0] + - - [5056, 5888, 1, 3328, 5056, 5056, 5056, 3328] + - [16, 25247.0] + - - [1856, 1024, 1, 256, 1856, 1856, 1856, 256] + - [6, 19170.0] + - - [512, 48000, 1, 1536, 512, 512, 512, 1536] + - [23, 25294.0] + - - [5056, 1408, 1, 3328, 5056, 5056, 5056, 3328] + - [28, 24647.0] + - - [448, 5888, 1, 256, 448, 448, 448, 256] + - [22, 17531.0] + - - [1408, 6784, 1, 3328, 1408, 1408, 1408, 3328] + - [28, 24531.0] + - - [1024, 24000, 1, 2560, 1024, 1024, 1024, 2560] + - [29, 25131.0] + - - [2944, 1408, 1, 3328, 2944, 2944, 2944, 3328] + - [4, 24004.0] + - - [2944, 4288, 1, 3328, 2944, 2944, 2944, 3328] + - [4, 24400.0] + - - [5056, 2944, 1, 256, 5056, 5056, 5056, 256] + - [18, 23557.0] + - - [2368, 1856, 1, 256, 2368, 2368, 2368, 256] + - [3, 20554.0] + - - [1408, 3584, 1, 3328, 1408, 1408, 1408, 3328] + - [7, 24108.0] + - - [2368, 6784, 1, 256, 2368, 2368, 2368, 256] + - [10, 23684.0] + - - [4288, 2368, 1, 3328, 4288, 4288, 4288, 3328] + - [28, 23715.0] + - - [704, 3584, 1, 1280, 704, 704, 704, 1280] + - [28, 20276.0] + - - [1408, 5888, 1, 3328, 1408, 1408, 1408, 3328] + - [31, 24571.0] + - - [1856, 5056, 1, 256, 1856, 1856, 1856, 256] + - [24, 21859.0] + - - [6784, 6784, 1, 256, 6784, 6784, 6784, 256] + - [4, 24957.0] + - - [2368, 4288, 1, 1280, 2368, 2368, 2368, 1280] + - [4, 23493.0] + - - [3584, 1856, 1, 1280, 3584, 3584, 3584, 1280] + - [18, 23846.0] + - - [8448, 48000, 1, 2816, 8448, 8448, 8448, 2816] + - [19, 25767.0] + - - [512, 6000, 1, 2048, 512, 512, 512, 2048] + - [36, 22414.0] + - - [3584, 448, 1, 256, 3584, 3584, 3584, 256] + - [14, 19280.0] + - - [3584, 3584, 1, 1280, 3584, 3584, 3584, 1280] + - [4, 24762.0] + - - [256, 6784, 1, 256, 256, 256, 256, 256] + - [3, 19432.0] + - - [1856, 3584, 1, 3328, 1856, 1856, 1856, 3328] + - [4, 24171.0] + - - [3584, 3584, 1, 256, 3584, 3584, 3584, 256] + - [23, 23992.0] + - - [6784, 4288, 1, 1280, 6784, 6784, 6784, 1280] + - [4, 24802.0] + - - [3584, 5056, 1, 256, 3584, 3584, 3584, 256] + - [18, 23782.0] + - - [2944, 2368, 1, 1280, 2944, 2944, 2944, 1280] + - [28, 23935.0] + - - [6784, 3584, 1, 256, 6784, 6784, 6784, 256] + - [4, 24678.0] + - - [1856, 1408, 1, 256, 1856, 1856, 1856, 256] + - [9, 19092.0] + - - [2944, 2944, 1, 3328, 2944, 2944, 2944, 3328] + - [18, 24668.0] + - - [5056, 6784, 1, 256, 5056, 5056, 5056, 256] + - [18, 24266.0] + - - [1408, 4288, 1, 3328, 1408, 1408, 1408, 3328] + - [18, 24126.0] + - - [6784, 256, 1, 1280, 6784, 6784, 6784, 1280] + - [13, 20691.0] + - - [2368, 704, 1, 3328, 2368, 2368, 2368, 3328] + - [31, 20153.0] + - - [3584, 6784, 1, 256, 3584, 3584, 3584, 256] + - [4, 24631.0] + - - [5056, 1856, 1, 256, 5056, 5056, 5056, 256] + - [23, 22548.0] + - - [1024, 3000, 1, 2816, 1024, 1024, 1024, 2816] + - [21, 22847.0] + - - [704, 4288, 1, 256, 704, 704, 704, 256] + - [3, 18712.0] + - - [1408, 6784, 1, 1280, 1408, 1408, 1408, 1280] + - [4, 24435.0] + - - [7680, 24000, 1, 2560, 7680, 7680, 7680, 2560] + - [19, 25695.0] + - - [4608, 48000, 1, 1536, 4608, 4608, 4608, 1536] + - [5, 25712.0] + - - [1024, 24000, 1, 1536, 1024, 1024, 1024, 1536] + - [19, 25030.0] + - - [5056, 2368, 1, 3328, 5056, 5056, 5056, 3328] + - [16, 24274.0] + - - [2944, 4288, 1, 256, 2944, 2944, 2944, 256] + - [23, 23160.0] + - - [1408, 3584, 1, 1280, 1408, 1408, 1408, 1280] + - [21, 23890.0] + - - [1024, 1500, 1, 2816, 1024, 1024, 1024, 2816] + - [18, 21479.0] + - - [1024, 6000, 1, 2048, 1024, 1024, 1024, 2048] + - [23, 24326.0] + - - [512, 24000, 1, 2560, 512, 512, 512, 2560] + - [23, 24804.0] + - - [6144, 3000, 1, 2560, 6144, 6144, 6144, 2560] + - [2, 24627.0] + - - [2368, 6784, 1, 3328, 2368, 2368, 2368, 3328] + - [18, 24370.0] + - - [5056, 704, 1, 1280, 5056, 5056, 5056, 1280] + - [3, 21601.0] + - - [1856, 4288, 1, 3328, 1856, 1856, 1856, 3328] + - [21, 23500.0] + - - [1408, 5888, 1, 256, 1408, 1408, 1408, 256] + - [4, 23519.0] + - - [704, 2944, 1, 1280, 704, 704, 704, 1280] + - [18, 20982.0] + - - [3584, 704, 1, 1280, 3584, 3584, 3584, 1280] + - [27, 21043.0] + - - [5888, 5056, 1, 256, 5888, 5888, 5888, 256] + - [10, 24206.0] + - - [3584, 448, 1, 3328, 3584, 3584, 3584, 3328] + - [27, 20932.0] + - - [704, 2368, 1, 3328, 704, 704, 704, 3328] + - [7, 20112.0] + - - [448, 5056, 1, 3328, 448, 448, 448, 3328] + - [21, 20924.0] + - - [4288, 448, 1, 256, 4288, 4288, 4288, 256] + - [0, 19270.0] + - - [5888, 2368, 1, 256, 5888, 5888, 5888, 256] + - [18, 23473.0] + - - [6784, 704, 1, 3328, 6784, 6784, 6784, 3328] + - [21, 22617.0] + - - [1408, 2944, 1, 3328, 1408, 1408, 1408, 3328] + - [4, 24017.0] + - - [4288, 4288, 1, 256, 4288, 4288, 4288, 256] + - [4, 23616.0] + - - [2368, 704, 1, 256, 2368, 2368, 2368, 256] + - [3, 18767.0] + - - [3584, 2368, 1, 256, 3584, 3584, 3584, 256] + - [23, 22599.0] + - - [5888, 5056, 1, 1280, 5888, 5888, 5888, 1280] + - [16, 25135.0] + - - [8448, 24000, 1, 2816, 8448, 8448, 8448, 2816] + - [29, 25636.0] + - - [3584, 3584, 1, 3328, 3584, 3584, 3584, 3328] + - [16, 24859.0] + - - [3072, 1500, 1, 128, 3072, 3072, 3072, 128] + - [33, 19338.0] + - - [2048, 3136, 1, 512, 2048, 2048, 2048, 512] + - [18, 22986.0] + - - [5888, 6784, 1, 256, 5888, 5888, 5888, 256] + - [26, 25004.0] + - - [4288, 2944, 1, 3328, 4288, 4288, 4288, 3328] + - [4, 24374.0] + - - [256, 5056, 1, 1280, 256, 256, 256, 1280] + - [17, 20735.0] + - - [6784, 5888, 1, 1280, 6784, 6784, 6784, 1280] + - [7, 25418.0] + - - [5888, 4288, 1, 1280, 5888, 5888, 5888, 1280] + - [4, 24752.0] + - - [1024, 24000, 1, 2048, 1024, 1024, 1024, 2048] + - [23, 25040.0] + - - [1408, 1856, 1, 1280, 1408, 1408, 1408, 1280] + - [27, 21619.0] + - - [5888, 448, 1, 3328, 5888, 5888, 5888, 3328] + - [17, 21025.0] + - - [704, 5888, 1, 1280, 704, 704, 704, 1280] + - [7, 21932.0] + - - [5056, 2944, 1, 3328, 5056, 5056, 5056, 3328] + - [16, 25014.0] + - - [448, 4288, 1, 1280, 448, 448, 448, 1280] + - [18, 19252.0] + - - [3584, 704, 1, 256, 3584, 3584, 3584, 256] + - [17, 20135.0] + - - [3584, 1408, 1, 3328, 3584, 3584, 3584, 3328] + - [18, 24011.0] + - - [2368, 1024, 1, 1280, 2368, 2368, 2368, 1280] + - [7, 21647.0] + - - [2944, 6784, 1, 1280, 2944, 2944, 2944, 1280] + - [28, 25288.0] + - - [1856, 6784, 1, 256, 1856, 1856, 1856, 256] + - [4, 22896.0] + - - [4288, 448, 1, 3328, 4288, 4288, 4288, 3328] + - [17, 21119.0] + - - [6784, 704, 1, 1280, 6784, 6784, 6784, 1280] + - [4, 22303.0] + - - [5888, 1024, 1, 3328, 5888, 5888, 5888, 3328] + - [4, 24229.0] + - - [704, 6784, 1, 1280, 704, 704, 704, 1280] + - [29, 21314.0] + - - [512, 3000, 1, 2048, 512, 512, 512, 2048] + - [35, 21061.0] + - - [5056, 1024, 1, 3328, 5056, 5056, 5056, 3328] + - [7, 24589.0] + - - [704, 5056, 1, 1280, 704, 704, 704, 1280] + - [28, 21801.0] + - - [2944, 1856, 1, 256, 2944, 2944, 2944, 256] + - [28, 21474.0] + - - [5888, 5056, 1, 3328, 5888, 5888, 5888, 3328] + - [16, 25254.0] + - - [3584, 6784, 1, 1280, 3584, 3584, 3584, 1280] + - [4, 25128.0] + - - [1856, 5888, 1, 256, 1856, 1856, 1856, 256] + - [4, 22616.0] + - - [4288, 4288, 1, 3328, 4288, 4288, 4288, 3328] + - [16, 24609.0] + - - [4288, 1408, 1, 1280, 4288, 4288, 4288, 1280] + - [18, 23703.0] + - - [4288, 2368, 1, 256, 4288, 4288, 4288, 256] + - [10, 22525.0] + - - [2944, 5056, 1, 1280, 2944, 2944, 2944, 1280] + - [18, 24889.0] + - - [6784, 2368, 1, 3328, 6784, 6784, 6784, 3328] + - [18, 24398.0] + - - [4288, 1856, 1, 3328, 4288, 4288, 4288, 3328] + - [4, 23498.0] + - - [1856, 2944, 1, 1280, 1856, 1856, 1856, 1280] + - [28, 22812.0] + - - [4288, 6784, 1, 3328, 4288, 4288, 4288, 3328] + - [4, 24852.0] + - - [3584, 1024, 1, 1280, 3584, 3584, 3584, 1280] + - [21, 22694.0] + - - [1024, 4288, 1, 256, 1024, 1024, 1024, 256] + - [17, 21370.0] + - - [5888, 3584, 1, 3328, 5888, 5888, 5888, 3328] + - [18, 25202.0] + - - [5056, 3584, 1, 3328, 5056, 5056, 5056, 3328] + - [16, 25085.0] + - - [2368, 1408, 1, 1280, 2368, 2368, 2368, 1280] + - [4, 22145.0] + - - [5056, 2944, 1, 1280, 5056, 5056, 5056, 1280] + - [28, 24848.0] + - - [8448, 6000, 1, 2816, 8448, 8448, 8448, 2816] + - [16, 25454.0] + - - [1024, 6784, 1, 256, 1024, 1024, 1024, 256] + - [18, 22511.0] + - - [2944, 5056, 1, 3328, 2944, 2944, 2944, 3328] + - [18, 25004.0] + - - [3584, 2944, 1, 256, 3584, 3584, 3584, 256] + - [4, 23431.0] + - - [5056, 6784, 1, 3328, 5056, 5056, 5056, 3328] + - [16, 25271.0] + - - [3584, 4288, 1, 256, 3584, 3584, 3584, 256] + - [35, 23269.0] + - - [1856, 6784, 1, 3328, 1856, 1856, 1856, 3328] + - [18, 24279.0] + - - [512, 6000, 1, 1536, 512, 512, 512, 1536] + - [8, 20910.0] + - - [5056, 1408, 1, 1280, 5056, 5056, 5056, 1280] + - [18, 24238.0] + - - [5888, 5888, 1, 256, 5888, 5888, 5888, 256] + - [4, 24849.0] + - - [4288, 1024, 1, 1280, 4288, 4288, 4288, 1280] + - [7, 23465.0] + - - [448, 6784, 1, 3328, 448, 448, 448, 3328] + - [4, 20429.0] + - - [2944, 1408, 1, 1280, 2944, 2944, 2944, 1280] + - [18, 23255.0] + - - [3072, 6000, 1, 1024, 3072, 3072, 3072, 1024] + - [18, 24777.0] + - - [2944, 1856, 1, 3328, 2944, 2944, 2944, 3328] + - [4, 23093.0] + - - [448, 5056, 1, 256, 448, 448, 448, 256] + - [17, 16107.0] + - - [3584, 5888, 1, 1280, 3584, 3584, 3584, 1280] + - [4, 25090.0] + - - [6784, 1856, 1, 1280, 6784, 6784, 6784, 1280] + - [18, 24125.0] + - - [5888, 256, 1, 3328, 5888, 5888, 5888, 3328] + - [4, 21256.0] + - - [1856, 5888, 1, 3328, 1856, 1856, 1856, 3328] + - [4, 24016.0] + - - [3584, 1408, 1, 256, 3584, 3584, 3584, 256] + - [4, 22436.0] + - - [704, 3584, 1, 3328, 704, 704, 704, 3328] + - [4, 20595.0] + - - [5056, 448, 1, 1280, 5056, 5056, 5056, 1280] + - [17, 20429.0] + - - [3584, 1856, 1, 3328, 3584, 3584, 3584, 3328] + - [28, 24070.0] + - - [1024, 3000, 1, 2048, 1024, 1024, 1024, 2048] + - [24, 22004.0] + - - [2944, 1024, 1, 256, 2944, 2944, 2944, 256] + - [0, 20098.0] + - - [2368, 4288, 1, 3328, 2368, 2368, 2368, 3328] + - [18, 23679.0] + - - [1024, 1408, 1, 1280, 1024, 1024, 1024, 1280] + - [25, 20343.0] + - - [6784, 5056, 1, 256, 6784, 6784, 6784, 256] + - [10, 24326.0] + - - [4288, 5888, 1, 256, 4288, 4288, 4288, 256] + - [4, 24080.0] + - - [2944, 6784, 1, 256, 2944, 2944, 2944, 256] + - [4, 24707.0] + - - [2368, 2368, 1, 1280, 2368, 2368, 2368, 1280] + - [4, 22275.0] + - - [1856, 3584, 1, 1280, 1856, 1856, 1856, 1280] + - [18, 23682.0] + - - [3584, 1408, 1, 1280, 3584, 3584, 3584, 1280] + - [18, 23658.0] + - - [5056, 3584, 1, 1280, 5056, 5056, 5056, 1280] + - [26, 24859.0] + - - [256, 5888, 1, 256, 256, 256, 256, 256] + - [30, 18393.0] + - - [1856, 1408, 1, 3328, 1856, 1856, 1856, 3328] + - [4, 21390.0] + - - [1024, 4288, 1, 3328, 1024, 1024, 1024, 3328] + - [35, 23512.0] + - - [2944, 2368, 1, 3328, 2944, 2944, 2944, 3328] + - [28, 24062.0] + - - [704, 4288, 1, 3328, 704, 704, 704, 3328] + - [4, 20316.0] + - - [1024, 48000, 1, 2816, 1024, 1024, 1024, 2816] + - [19, 25569.0] + - - [1024, 1856, 1, 1280, 1024, 1024, 1024, 1280] + - [18, 21106.0] + - - [6784, 1856, 1, 256, 6784, 6784, 6784, 256] + - [18, 23060.0] + - - [512, 48000, 1, 2816, 512, 512, 512, 2816] + - [18, 25392.0] + - - [512, 3000, 1, 2816, 512, 512, 512, 2816] + - [28, 20987.0] + - - [1024, 5888, 1, 256, 1024, 1024, 1024, 256] + - [23, 21727.0] + - - [1408, 2368, 1, 256, 1408, 1408, 1408, 256] + - [34, 19302.0] + - - [2944, 704, 1, 3328, 2944, 2944, 2944, 3328] + - [4, 21326.0] + - - [2944, 2944, 1, 1280, 2944, 2944, 2944, 1280] + - [18, 24499.0] + - - [6784, 256, 1, 3328, 6784, 6784, 6784, 3328] + - [7, 21190.0] + - - [1408, 5056, 1, 256, 1408, 1408, 1408, 256] + - [4, 21315.0] + - - [512, 50176, 1, 128, 512, 512, 512, 128] + - [28, 23128.0] + - - [1408, 4288, 1, 256, 1408, 1408, 1408, 256] + - [3, 21557.0] + - - [5888, 2368, 1, 1280, 5888, 5888, 5888, 1280] + - [4, 24343.0] + - - [2368, 5888, 1, 1280, 2368, 2368, 2368, 1280] + - [7, 24316.0] + - - [5888, 256, 1, 1280, 5888, 5888, 5888, 1280] + - [4, 20822.0] + - - [2368, 1856, 1, 3328, 2368, 2368, 2368, 3328] + - [10, 21952.0] + - - [2944, 704, 1, 256, 2944, 2944, 2944, 256] + - [0, 18146.0] + - - [2368, 6784, 1, 1280, 2368, 2368, 2368, 1280] + - [18, 24255.0] + - - [2368, 1024, 1, 3328, 2368, 2368, 2368, 3328] + - [4, 22189.0] + - - [1856, 4288, 1, 1280, 1856, 1856, 1856, 1280] + - [18, 23195.0] + - - [704, 3584, 1, 256, 704, 704, 704, 256] + - [22, 18465.0] + - - [704, 2944, 1, 3328, 704, 704, 704, 3328] + - [4, 21429.0] + - - [1856, 5056, 1, 3328, 1856, 1856, 1856, 3328] + - [31, 24005.0] + - - [196, 256, 64, 1024, 196, 196, 196, 1024] + - [12, 16157.0] + - - [3584, 5056, 1, 1280, 3584, 3584, 3584, 1280] + - [18, 24886.0] + - - [2944, 1024, 1, 3328, 2944, 2944, 2944, 3328] + - [7, 22657.0] + - - [1408, 6784, 1, 256, 1408, 1408, 1408, 256] + - [18, 23467.0] + - - [6784, 1408, 1, 3328, 6784, 6784, 6784, 3328] + - [4, 24509.0] + - - [1024, 2368, 1, 1280, 1024, 1024, 1024, 1280] + - [7, 21506.0] + - - [6784, 2944, 1, 1280, 6784, 6784, 6784, 1280] + - [18, 25271.0] + - - [3584, 448, 1, 1280, 3584, 3584, 3584, 1280] + - [3, 20606.0] + - - [2944, 6784, 1, 3328, 2944, 2944, 2944, 3328] + - [4, 25326.0] + - - [448, 5056, 1, 1280, 448, 448, 448, 1280] + - [18, 19777.0] + - - [4288, 5056, 1, 1280, 4288, 4288, 4288, 1280] + - [16, 24590.0] + - - [4288, 704, 1, 256, 4288, 4288, 4288, 256] + - [0, 20136.0] + - - [5888, 704, 1, 256, 5888, 5888, 5888, 256] + - [3, 20807.0] + - - [256, 5888, 1, 3328, 256, 256, 256, 3328] + - [4, 21267.0] + - - [6784, 4288, 1, 256, 6784, 6784, 6784, 256] + - [28, 24001.0] + - - [5888, 256, 1, 256, 5888, 5888, 5888, 256] + - [20, 18696.0] + - - [6784, 1024, 1, 1280, 6784, 6784, 6784, 1280] + - [31, 24040.0] + - - [2944, 704, 1, 1280, 2944, 2944, 2944, 1280] + - [17, 20985.0] + - - [6784, 3584, 1, 1280, 6784, 6784, 6784, 1280] + - [4, 25101.0] + - - [1408, 2944, 1, 1280, 1408, 1408, 1408, 1280] + - [18, 23678.0] + - - [1408, 2368, 1, 3328, 1408, 1408, 1408, 3328] + - [18, 22344.0] + - - [2368, 2944, 1, 256, 2368, 2368, 2368, 256] + - [18, 21297.0] + - - [3584, 1856, 1, 256, 3584, 3584, 3584, 256] + - [18, 21207.0] + - - [4288, 3584, 1, 1280, 4288, 4288, 4288, 1280] + - [4, 24581.0] + - - [4288, 2944, 1, 1280, 4288, 4288, 4288, 1280] + - [28, 24165.0] + - - [5056, 448, 1, 3328, 5056, 5056, 5056, 3328] + - [3, 21404.0] + - - [5124, 1500, 1, 2048, 5124, 5124, 5124, 2048] + - [23, 23324.0] + - - [4288, 5056, 1, 3328, 4288, 4288, 4288, 3328] + - [26, 24716.0] + - - [256, 5056, 1, 3328, 256, 256, 256, 3328] + - [6, 21138.0] + - - [5056, 2368, 1, 256, 5056, 5056, 5056, 256] + - [35, 22819.0] + - - [4288, 704, 1, 3328, 4288, 4288, 4288, 3328] + - [3, 21361.0] + - - [448, 3584, 1, 256, 448, 448, 448, 256] + - [17, 15969.0] + - - [6144, 1500, 1, 2560, 6144, 6144, 6144, 2560] + - [18, 24066.0] + - - [1024, 1408, 1, 3328, 1024, 1024, 1024, 3328] + - [3, 20729.0] + - - [2944, 5888, 1, 1280, 2944, 2944, 2944, 1280] + - [4, 25208.0] + - - [5888, 3584, 1, 256, 5888, 5888, 5888, 256] + - [10, 24451.0] + - - [1408, 1856, 1, 3328, 1408, 1408, 1408, 3328] + - [3, 21879.0] + - - [7680, 6000, 1, 2560, 7680, 7680, 7680, 2560] + - [23, 25338.0] + - - [6784, 1408, 1, 1280, 6784, 6784, 6784, 1280] + - [10, 24194.0] + - - [512, 3000, 1, 2560, 512, 512, 512, 2560] + - [10, 20457.0] + - - [704, 2944, 1, 256, 704, 704, 704, 256] + - [0, 17604.0] + - - [2944, 5888, 1, 3328, 2944, 2944, 2944, 3328] + - [18, 25259.0] + - - [1024, 1500, 1, 1536, 1024, 1024, 1024, 1536] + - [18, 19508.0] + - - [1408, 1408, 1, 1280, 1408, 1408, 1408, 1280] + - [28, 19881.0] + - - [3072, 3000, 1, 1024, 3072, 3072, 3072, 1024] + - [18, 24050.0] + - - [448, 4288, 1, 3328, 448, 448, 448, 3328] + - [4, 19802.0] + - - [704, 2368, 1, 256, 704, 704, 704, 256] + - [34, 15220.0] + - - [5888, 2368, 1, 3328, 5888, 5888, 5888, 3328] + - [2, 24443.0] + - - [5124, 9124, 1, 1760, 5124, 5124, 5124, 1760] + - [4, 24494.0] + - - [4288, 5056, 1, 256, 4288, 4288, 4288, 256] + - [18, 23444.0] + - - [4288, 448, 1, 1280, 4288, 4288, 4288, 1280] + - [3, 20849.0] + - - [5888, 704, 1, 3328, 5888, 5888, 5888, 3328] + - [4, 22277.0] + - - [4288, 3584, 1, 3328, 4288, 4288, 4288, 3328] + - [21, 24724.0] + - - [1024, 6784, 1, 3328, 1024, 1024, 1024, 3328] + - [4, 24073.0] + - - [512, 3136, 1, 2048, 512, 512, 512, 2048] + - [22, 20166.0] + - - [1408, 1024, 1, 256, 1408, 1408, 1408, 256] + - [22, 16947.0] + - - [8448, 1500, 1, 2816, 8448, 8448, 8448, 2816] + - [26, 24350.0] + - - [2560, 7000, 1, 2560, 2560, 2560, 2560, 2560] + - [23, 25058.0] + - - [6784, 6784, 1, 3328, 6784, 6784, 6784, 3328] + - [23, 25359.0] + - - [704, 5056, 1, 3328, 704, 704, 704, 3328] + - [7, 22110.0] + - - [3584, 5056, 1, 3328, 3584, 3584, 3584, 3328] + - [16, 25015.0] + - - [2368, 2944, 1, 3328, 2368, 2368, 2368, 3328] + - [28, 24123.0] + - - [2368, 3584, 1, 256, 2368, 2368, 2368, 256] + - [4, 22297.0] + - - [4608, 3000, 1, 1536, 4608, 4608, 4608, 1536] + - [28, 24170.0] + - - [5124, 9124, 1, 4096, 5124, 5124, 5124, 4096] + - [35, 24558.0] + - - [7680, 48000, 1, 2560, 7680, 7680, 7680, 2560] + - [19, 25758.0] + - - [4608, 1500, 1, 1536, 4608, 4608, 4608, 1536] + - [28, 23717.0] + - - [3584, 2368, 1, 1280, 3584, 3584, 3584, 1280] + - [18, 23928.0] + - - [5124, 9124, 1, 2560, 5124, 5124, 5124, 2560] + - [4, 24583.0] + - - [1856, 1856, 1, 256, 1856, 1856, 1856, 256] + - [14, 19755.0] + - - [4288, 1408, 1, 3328, 4288, 4288, 4288, 3328] + - [4, 24207.0] + - - [5124, 9124, 1, 2048, 5124, 5124, 5124, 2048] + - [4, 24556.0] + - - [5124, 700, 1, 2048, 5124, 5124, 5124, 2048] + - [17, 21032.0] + - - [256, 12544, 1, 1024, 256, 256, 256, 1024] + - [24, 23525.0] + - - [5888, 1408, 1, 3328, 5888, 5888, 5888, 3328] + - [18, 24501.0] + - - [256, 5056, 1, 256, 256, 256, 256, 256] + - [22, 18999.0] + - - [2368, 5056, 1, 256, 2368, 2368, 2368, 256] + - [28, 22785.0] + - - [1024, 6000, 1, 2560, 1024, 1024, 1024, 2560] + - [35, 24437.0] + - - [1024, 5056, 1, 256, 1024, 1024, 1024, 256] + - [18, 22164.0] + - - [4224, 1500, 1, 176, 4224, 4224, 4224, 176] + - [1, 21355.0] + - - [2368, 1408, 1, 3328, 2368, 2368, 2368, 3328] + - [18, 22629.0] + - - [1024, 48000, 1, 1536, 1024, 1024, 1024, 1536] + - [19, 25587.0] + - - [5888, 448, 1, 256, 5888, 5888, 5888, 256] + - [14, 19757.0] + - - [6784, 5056, 1, 1280, 6784, 6784, 6784, 1280] + - [4, 25091.0] + - - [1024, 48000, 1, 2560, 1024, 1024, 1024, 2560] + - [19, 25654.0] + - - [4288, 6784, 1, 1280, 4288, 4288, 4288, 1280] + - [28, 24779.0] + - - [3072, 48000, 1, 1024, 3072, 3072, 3072, 1024] + - [5, 25674.0] + - - [6784, 1408, 1, 256, 6784, 6784, 6784, 256] + - [4, 23218.0] + - - [5888, 4288, 1, 256, 5888, 5888, 5888, 256] + - [28, 24072.0] + - - [5056, 5888, 1, 256, 5056, 5056, 5056, 256] + - [4, 24391.0] + - - [2368, 1024, 1, 256, 2368, 2368, 2368, 256] + - [6, 19619.0] + - - [1856, 6784, 1, 1280, 1856, 1856, 1856, 1280] + - [28, 24122.0] + - - [8448, 3000, 1, 2816, 8448, 8448, 8448, 2816] + - [2, 24712.0] + - - [6784, 448, 1, 3328, 6784, 6784, 6784, 3328] + - [27, 21614.0] + - - [5056, 1856, 1, 1280, 5056, 5056, 5056, 1280] + - [18, 23812.0] + - - [1408, 1024, 1, 3328, 1408, 1408, 1408, 3328] + - [17, 21185.0] + - - [7680, 1500, 1, 2560, 7680, 7680, 7680, 2560] + - [23, 24576.0] + - - [5888, 3584, 1, 1280, 5888, 5888, 5888, 1280] + - [18, 25121.0] + - - [1024, 2944, 1, 256, 1024, 1024, 1024, 256] + - [4, 20352.0] + - - [448, 6784, 1, 1280, 448, 448, 448, 1280] + - [18, 20255.0] + - - [704, 5056, 1, 256, 704, 704, 704, 256] + - [30, 19297.0] + - - [3584, 1024, 1, 3328, 3584, 3584, 3584, 3328] + - [7, 23193.0] + - - [2944, 1856, 1, 1280, 2944, 2944, 2944, 1280] + - [4, 22897.0] + - - [5056, 256, 1, 256, 5056, 5056, 5056, 256] + - [3, 18805.0] + - - [2368, 3584, 1, 3328, 2368, 2368, 2368, 3328] + - [28, 24120.0] + - - [3584, 5888, 1, 3328, 3584, 3584, 3584, 3328] + - [4, 25203.0] + - - [2944, 3584, 1, 1280, 2944, 2944, 2944, 1280] + - [18, 24488.0] + - - [1856, 5888, 1, 1280, 1856, 1856, 1856, 1280] + - [4, 23925.0] + - - [4608, 24000, 1, 1536, 4608, 4608, 4608, 1536] + - [5, 25522.0] + - - [4288, 1408, 1, 256, 4288, 4288, 4288, 256] + - [10, 21593.0] + - - [4288, 2368, 1, 1280, 4288, 4288, 4288, 1280] + - [4, 23589.0] + - - [2944, 5056, 1, 256, 2944, 2944, 2944, 256] + - [4, 23879.0] + - - [6784, 2368, 1, 256, 6784, 6784, 6784, 256] + - [28, 23567.0] + - - [1024, 24000, 1, 2816, 1024, 1024, 1024, 2816] + - [16, 25166.0] + - - [4288, 1856, 1, 256, 4288, 4288, 4288, 256] + - [4, 21647.0] + - - [1856, 2944, 1, 256, 1856, 1856, 1856, 256] + - [18, 21220.0] + - - [4608, 6000, 1, 1536, 4608, 4608, 4608, 1536] + - [18, 25204.0] + - - [7680, 3000, 1, 2560, 7680, 7680, 7680, 2560] + - [2, 24880.0] + - - [5124, 700, 1, 2560, 5124, 5124, 5124, 2560] + - [27, 21063.0] + - - [1856, 1408, 1, 1280, 1856, 1856, 1856, 1280] + - [10, 20768.0] + - - [1024, 4288, 1, 1280, 1024, 1024, 1024, 1280] + - [18, 23196.0] + - - [2368, 5056, 1, 3328, 2368, 2368, 2368, 3328] + - [21, 24307.0] + - - [4288, 1024, 1, 3328, 4288, 4288, 4288, 3328] + - [7, 23814.0] + - - [6144, 48000, 1, 2560, 6144, 6144, 6144, 2560] + - [19, 25755.0] + - - [1024, 5056, 1, 3328, 1024, 1024, 1024, 3328] + - [21, 24496.0] + - - [1024, 1856, 1, 3328, 1024, 1024, 1024, 3328] + - [28, 22602.0] + - - [5124, 1500, 1, 2560, 5124, 5124, 5124, 2560] + - [18, 23492.0] + - - [3584, 2944, 1, 3328, 3584, 3584, 3584, 3328] + - [28, 24655.0] + - - [5888, 2944, 1, 256, 5888, 5888, 5888, 256] + - [18, 24528.0] + - - [5056, 4288, 1, 256, 5056, 5056, 5056, 256] + - [28, 23874.0] + - - [1024, 3584, 1, 256, 1024, 1024, 1024, 256] + - [35, 21363.0] + - - [5056, 1408, 1, 256, 5056, 5056, 5056, 256] + - [4, 22718.0] + - - [5888, 5888, 1, 1280, 5888, 5888, 5888, 1280] + - [16, 25519.0] + - - [448, 5888, 1, 1280, 448, 448, 448, 1280] + - [21, 19104.0] + - - [1024, 3000, 1, 2560, 1024, 1024, 1024, 2560] + - [24, 22719.0] + - - [4288, 704, 1, 1280, 4288, 4288, 4288, 1280] + - [17, 21025.0] + - - [2944, 1408, 1, 256, 2944, 2944, 2944, 256] + - [18, 22588.0] + - - [2368, 5888, 1, 3328, 2368, 2368, 2368, 3328] + - [21, 24505.0] + - - [2368, 1856, 1, 1280, 2368, 2368, 2368, 1280] + - [28, 21861.0] + - - [1024, 6000, 1, 2816, 1024, 1024, 1024, 2816] + - [28, 24514.0] + - - [5888, 4288, 1, 3328, 5888, 5888, 5888, 3328] + - [18, 24853.0] + - - [5056, 448, 1, 256, 5056, 5056, 5056, 256] + - [0, 19550.0] + - - [1856, 5056, 1, 1280, 1856, 1856, 1856, 1280] + - [18, 23836.0] + - - [2944, 1024, 1, 1280, 2944, 2944, 2944, 1280] + - [21, 22456.0] + - - [2368, 4288, 1, 256, 2368, 2368, 2368, 256] + - [4, 22518.0] + - - [1024, 2368, 1, 3328, 1024, 1024, 1024, 3328] + - [21, 22456.0] + - - [4288, 5888, 1, 3328, 4288, 4288, 4288, 3328] + - [4, 24836.0] + - - [1024, 2944, 1, 3328, 1024, 1024, 1024, 3328] + - [28, 22497.0] + - - [256, 6784, 1, 1280, 256, 256, 256, 1280] + - [22, 20671.0] + - - [1856, 3584, 1, 256, 1856, 1856, 1856, 256] + - [10, 22115.0] + - - [1024, 1500, 1, 2048, 1024, 1024, 1024, 2048] + - [35, 21127.0] + - - [512, 24000, 1, 2816, 512, 512, 512, 2816] + - [18, 24896.0] + - - [256, 5888, 1, 1280, 256, 256, 256, 1280] + - [10, 20530.0] + - - [2944, 2368, 1, 256, 2944, 2944, 2944, 256] + - [23, 22398.0] + - - [1024, 1856, 1, 256, 1024, 1024, 1024, 256] + - [3, 19730.0] + - - [6784, 3584, 1, 3328, 6784, 6784, 6784, 3328] + - [4, 25204.0] + - - [1760, 7000, 1, 1760, 1760, 1760, 1760, 1760] + - [4, 24254.0] + - - [1024, 5888, 1, 3328, 1024, 1024, 1024, 3328] + - [4, 24222.0] + - - [1408, 2368, 1, 1280, 1408, 1408, 1408, 1280] + - [28, 22158.0] + - - [2944, 2944, 1, 256, 2944, 2944, 2944, 256] + - [4, 23705.0] + - - [6784, 256, 1, 256, 6784, 6784, 6784, 256] + - [27, 19500.0] + - - [1024, 3000, 1, 1536, 1024, 1024, 1024, 1536] + - [23, 22534.0] + - - [5888, 1408, 1, 256, 5888, 5888, 5888, 256] + - [4, 23623.0] + - - [5888, 6784, 1, 3328, 5888, 5888, 5888, 3328] + - [19, 25646.0] + - - [704, 4288, 1, 1280, 704, 704, 704, 1280] + - [28, 20033.0] + - - [128, 50176, 1, 512, 128, 128, 128, 512] + - [24, 23787.0] + - - [1024, 48000, 1, 2048, 1024, 1024, 1024, 2048] + - [11, 25592.0] + - - [784, 512, 64, 128, 784, 784, 784, 128] + - [15, 20568.0] + - - [3136, 256, 64, 64, 3136, 3136, 3136, 64] + - [0, 20836.0] + - - [12544, 1024, 1, 256, 12544, 12544, 12544, 256] + - [15, 23760.0] + - - [784, 128, 128, 512, 784, 784, 784, 512] + - [32, 19076.0] + - - [784, 512, 256, 128, 784, 784, 784, 128] + - [4, 21187.0] + - - [3136, 512, 1, 2048, 3136, 3136, 3136, 2048] + - [4, 21603.0] + - - [12544, 256, 1, 1024, 12544, 12544, 12544, 1024] + - [31, 23559.0] + - - [3136, 2048, 1, 512, 3136, 3136, 3136, 512] + - [21, 23322.0] + - - [3136, 256, 256, 64, 3136, 3136, 3136, 64] + - [0, 16139.0] + - - [784, 128, 64, 512, 784, 784, 784, 512] + - [35, 20281.0] + - - [784, 512, 128, 128, 784, 784, 784, 128] + - [28, 20889.0] + - - [784, 128, 256, 512, 784, 784, 784, 512] + - [18, 19807.0] + - - [3136, 256, 128, 64, 3136, 3136, 3136, 64] + - [0, 18150.0] + - - [1024, 1024, 1, 3328, 1024, 1024, 1024, 3328] + - [40, 19323.0] + - - [128, 6784, 1, 3328, 128, 128, 128, 3328] + - [41, 19336.0] + - - [256, 4288, 1, 3328, 256, 256, 256, 3328] + - [40, 20380.0] + - - [704, 1856, 1, 3328, 704, 704, 704, 3328] + - [40, 19188.0] + - - [448, 1024, 1, 1280, 448, 448, 448, 1280] + - [39, 16275.0] + - - [1024, 704, 1, 256, 1024, 1024, 1024, 256] + - [43, 14955.0] + - - [256, 1856, 1, 1280, 256, 256, 256, 1280] + - [38, 17762.0] + - - [256, 2944, 1, 3328, 256, 256, 256, 3328] + - [38, 18976.0] + - - [128, 3584, 1, 1280, 128, 128, 128, 1280] + - [54, 16806.0] + - - [4288, 256, 1, 256, 4288, 4288, 4288, 256] + - [38, 17011.0] + - - [5888, 64, 1, 3328, 5888, 5888, 5888, 3328] + - [40, 14838.0] + - - [2944, 256, 1, 3328, 2944, 2944, 2944, 3328] + - [57, 18981.0] + - - [1408, 448, 1, 1280, 1408, 1408, 1408, 1280] + - [38, 18655.0] + - - [1408, 256, 1, 1280, 1408, 1408, 1408, 1280] + - [38, 13715.0] + - - [3072, 128, 1, 1024, 3072, 3072, 3072, 1024] + - [38, 14631.0] + - - [6784, 64, 1, 256, 6784, 6784, 6784, 256] + - [45, 11319.0] + - - [2368, 128, 1, 3328, 2368, 2368, 2368, 3328] + - [51, 16052.0] + - - [2944, 128, 1, 256, 2944, 2944, 2944, 256] + - [38, 11114.0] + - - [448, 1408, 1, 256, 448, 448, 448, 256] + - [56, 13547.0] + - - [64, 5056, 1, 3328, 64, 64, 64, 3328] + - [53, 15851.0] + - - [512, 1500, 1, 2816, 512, 512, 512, 2816] + - [38, 18978.0] + - - [256, 3584, 1, 3328, 256, 256, 256, 3328] + - [51, 20414.0] + - - [256, 1408, 1, 256, 256, 256, 256, 256] + - [52, 10558.0] + - - [5056, 64, 1, 1280, 5056, 5056, 5056, 1280] + - [56, 15039.0] + - - [2368, 128, 1, 256, 2368, 2368, 2368, 256] + - [39, 8898.0] + - - [4288, 128, 1, 1280, 4288, 4288, 4288, 1280] + - [38, 16293.0] + - - [5888, 64, 1, 256, 5888, 5888, 5888, 256] + - [55, 10395.0] + - - [1856, 256, 1, 1280, 1856, 1856, 1856, 1280] + - [38, 17307.0] + - - [64, 5888, 1, 3328, 64, 64, 64, 3328] + - [44, 14148.0] + - - [1024, 704, 1, 1280, 1024, 1024, 1024, 1280] + - [38, 17766.0] + - - [256, 1408, 1, 3328, 256, 256, 256, 3328] + - [40, 14257.0] + - - [6784, 128, 1, 3328, 6784, 6784, 6784, 3328] + - [41, 19338.0] + - - [704, 704, 1, 3328, 704, 704, 704, 3328] + - [49, 15748.0] + - - [3584, 256, 1, 3328, 3584, 3584, 3584, 3328] + - [58, 20422.0] + - - [128, 3584, 1, 3328, 128, 128, 128, 3328] + - [50, 17873.0] + - - [128, 2944, 1, 1280, 128, 128, 128, 1280] + - [52, 14087.0] + - - [448, 1856, 1, 1280, 448, 448, 448, 1280] + - [56, 18051.0] + - - [3584, 128, 1, 256, 3584, 3584, 3584, 256] + - [38, 12849.0] + - - [448, 1408, 1, 3328, 448, 448, 448, 3328] + - [56, 17541.0] + - - [704, 1024, 1, 256, 704, 704, 704, 256] + - [43, 14262.0] + - - [256, 3584, 1, 256, 256, 256, 256, 256] + - [38, 16777.0] + - - [1408, 704, 1, 256, 1408, 1408, 1408, 256] + - [48, 16478.0] + - - [448, 2944, 1, 3328, 448, 448, 448, 3328] + - [41, 18084.0] + - - [64, 5888, 1, 256, 64, 64, 64, 256] + - [53, 9551.0] + - - [512, 1500, 1, 2048, 512, 512, 512, 2048] + - [62, 18596.0] + - - [448, 2368, 1, 1280, 448, 448, 448, 1280] + - [51, 17298.0] + - - [704, 704, 1, 256, 704, 704, 704, 256] + - [38, 11662.0] + - - [64, 193600, 1, 64, 64, 64, 64, 64] + - [49, 16694.0] + - - [128, 4288, 1, 3328, 128, 128, 128, 3328] + - [51, 17317.0] + - - [256, 2368, 1, 256, 256, 256, 256, 256] + - [48, 14585.0] + - - [1024, 448, 1, 3328, 1024, 1024, 1024, 3328] + - [50, 17924.0] + - - [1856, 704, 1, 1280, 1856, 1856, 1856, 1280] + - [45, 18830.0] + - - [1024, 1024, 1, 1280, 1024, 1024, 1024, 1280] + - [56, 19384.0] + - - [256, 2944, 1, 256, 256, 256, 256, 256] + - [38, 15789.0] + - - [1024, 700, 1, 512, 1024, 1024, 1024, 512] + - [52, 15498.0] + - - [128, 6784, 1, 1280, 128, 128, 128, 1280] + - [46, 18649.0] + - - [1408, 704, 1, 3328, 1408, 1408, 1408, 3328] + - [56, 18709.0] + - - [128, 5888, 1, 1280, 128, 128, 128, 1280] + - [45, 18278.0] + - - [704, 1408, 1, 3328, 704, 704, 704, 3328] + - [56, 18392.0] + - - [7680, 64, 1, 2560, 7680, 7680, 7680, 2560] + - [50, 18888.0] + - - [448, 704, 1, 1280, 448, 448, 448, 1280] + - [39, 14490.0] + - - [6784, 128, 1, 256, 6784, 6784, 6784, 256] + - [38, 16016.0] + - - [704, 448, 1, 256, 704, 704, 704, 256] + - [39, 9238.0] + - - [256, 1856, 1, 3328, 256, 256, 256, 3328] + - [40, 18551.0] + - - [1024, 704, 1, 3328, 1024, 1024, 1024, 3328] + - [40, 18206.0] + - - [128, 4288, 1, 256, 128, 128, 128, 256] + - [60, 13181.0] + - - [64, 6784, 1, 3328, 64, 64, 64, 3328] + - [44, 15607.0] + - - [2944, 256, 1, 1280, 2944, 2944, 2944, 1280] + - [57, 18340.0] + - - [1856, 704, 1, 256, 1856, 1856, 1856, 256] + - [39, 17207.0] + - - [704, 1856, 1, 256, 704, 704, 704, 256] + - [43, 16175.0] + - - [2944, 448, 1, 256, 2944, 2944, 2944, 256] + - [38, 17827.0] + - - [2368, 128, 1, 1280, 2368, 2368, 2368, 1280] + - [53, 14531.0] + - - [64, 6784, 1, 256, 64, 64, 64, 256] + - [44, 10876.0] + - - [64, 5056, 1, 1280, 64, 64, 64, 1280] + - [53, 14656.0] + - - [704, 448, 1, 3328, 704, 704, 704, 3328] + - [49, 15666.0] + - - [2368, 256, 1, 1280, 2368, 2368, 2368, 1280] + - [58, 17539.0] + - - [2368, 448, 1, 1280, 2368, 2368, 2368, 1280] + - [40, 18849.0] + - - [128, 3584, 1, 256, 128, 128, 128, 256] + - [43, 12906.0] + - - [1856, 448, 1, 3328, 1856, 1856, 1856, 3328] + - [51, 18232.0] + - - [128, 5056, 1, 256, 128, 128, 128, 256] + - [52, 15284.0] + - - [4288, 256, 1, 1280, 4288, 4288, 4288, 1280] + - [40, 19445.0] + - - [4288, 128, 1, 3328, 4288, 4288, 4288, 3328] + - [58, 17370.0] + - - [7680, 128, 1, 2560, 7680, 7680, 7680, 2560] + - [58, 21590.0] + - - [448, 2368, 1, 3328, 448, 448, 448, 3328] + - [41, 17836.0] + - - [256, 1408, 1, 1280, 256, 256, 256, 1280] + - [38, 13674.0] + - - [128, 2368, 1, 256, 128, 128, 128, 256] + - [43, 10318.0] + - - [6784, 64, 1, 3328, 6784, 6784, 6784, 3328] + - [50, 17007.0] + - - [128, 2944, 1, 3328, 128, 128, 128, 3328] + - [45, 14827.0] + - - [2944, 448, 1, 3328, 2944, 2944, 2944, 3328] + - [57, 19436.0] + - - [5888, 128, 1, 256, 5888, 5888, 5888, 256] + - [38, 15737.0] + - - [5056, 64, 1, 256, 5056, 5056, 5056, 256] + - [61, 9266.0] + - - [512, 1500, 1, 1536, 512, 512, 512, 1536] + - [60, 18369.0] + - - [128, 5056, 1, 3328, 128, 128, 128, 3328] + - [58, 20200.0] + - - [256, 4288, 1, 1280, 256, 256, 256, 1280] + - [40, 19857.0] + - - [4288, 128, 1, 256, 4288, 4288, 4288, 256] + - [38, 12456.0] + - - [3584, 256, 1, 256, 3584, 3584, 3584, 256] + - [38, 16564.0] + - - [128, 2944, 1, 256, 128, 128, 128, 256] + - [43, 10815.0] + - - [3584, 128, 1, 3328, 3584, 3584, 3584, 3328] + - [50, 17890.0] + - - [5888, 128, 1, 3328, 5888, 5888, 5888, 3328] + - [50, 18990.0] + - - [64, 193600, 1, 256, 64, 64, 64, 256] + - [47, 15387.0] + - - [1408, 704, 1, 1280, 1408, 1408, 1408, 1280] + - [38, 17880.0] + - - [448, 1408, 1, 1280, 448, 448, 448, 1280] + - [39, 16744.0] + - - [704, 1408, 1, 1280, 704, 704, 704, 1280] + - [48, 17905.0] + - - [448, 2944, 1, 256, 448, 448, 448, 256] + - [56, 17174.0] + - - [448, 2368, 1, 256, 448, 448, 448, 256] + - [39, 15644.0] + - - [64, 6784, 1, 1280, 64, 64, 64, 1280] + - [56, 14836.0] + - - [128, 2368, 1, 3328, 128, 128, 128, 3328] + - [57, 15991.0] + - - [5056, 64, 1, 3328, 5056, 5056, 5056, 3328] + - [50, 16376.0] + - - [5056, 128, 1, 3328, 5056, 5056, 5056, 3328] + - [58, 20307.0] + - - [448, 704, 1, 256, 448, 448, 448, 256] + - [39, 9072.0] + - - [1856, 256, 1, 3328, 1856, 1856, 1856, 3328] + - [57, 17997.0] + - - [2944, 128, 1, 3328, 2944, 2944, 2944, 3328] + - [50, 14862.0] + - - [1024, 1024, 1, 256, 1024, 1024, 1024, 256] + - [39, 17033.0] + - - [704, 1024, 1, 1280, 704, 704, 704, 1280] + - [38, 17496.0] + - - [256, 4288, 1, 256, 256, 256, 256, 256] + - [38, 17831.0] + - - [2368, 256, 1, 256, 2368, 2368, 2368, 256] + - [38, 13332.0] + - - [256, 2368, 1, 3328, 256, 256, 256, 3328] + - [58, 19029.0] + - - [704, 448, 1, 1280, 704, 704, 704, 1280] + - [56, 14439.0] + - - [256, 1856, 1, 256, 256, 256, 256, 256] + - [52, 13545.0] + - - [64, 5056, 1, 256, 64, 64, 64, 256] + - [53, 9004.0] + - - [1408, 256, 1, 3328, 1408, 1408, 1408, 3328] + - [40, 14257.0] + - - [2368, 448, 1, 256, 2368, 2368, 2368, 256] + - [38, 16459.0] + - - [4288, 256, 1, 3328, 4288, 4288, 4288, 3328] + - [50, 20097.0] + - - [2944, 256, 1, 256, 2944, 2944, 2944, 256] + - [38, 15763.0] + - - [6784, 64, 1, 1280, 6784, 6784, 6784, 1280] + - [40, 15942.0] + - - [704, 1856, 1, 1280, 704, 704, 704, 1280] + - [40, 18758.0] + - - [448, 1024, 1, 3328, 448, 448, 448, 3328] + - [53, 16881.0] + - - [2944, 448, 1, 1280, 2944, 2944, 2944, 1280] + - [57, 19136.0] + - - [448, 1024, 1, 256, 448, 448, 448, 256] + - [56, 11628.0] + - - [1024, 448, 1, 1280, 1024, 1024, 1024, 1280] + - [38, 17170.0] + - - [256, 2368, 1, 1280, 256, 256, 256, 1280] + - [38, 18037.0] + - - [128, 5056, 1, 1280, 128, 128, 128, 1280] + - [45, 18844.0] + - - [1408, 256, 1, 256, 1408, 1408, 1408, 256] + - [52, 10415.0] + - - [128, 5888, 1, 3328, 128, 128, 128, 3328] + - [57, 18927.0] + - - [2368, 448, 1, 3328, 2368, 2368, 2368, 3328] + - [50, 19448.0] + - - [3584, 128, 1, 1280, 3584, 3584, 3584, 1280] + - [52, 16883.0] + - - [1408, 448, 1, 256, 1408, 1408, 1408, 256] + - [52, 15148.0] + - - [2368, 256, 1, 3328, 2368, 2368, 2368, 3328] + - [51, 19069.0] + - - [5888, 128, 1, 1280, 5888, 5888, 5888, 1280] + - [57, 18298.0] + - - [256, 3584, 1, 1280, 256, 256, 256, 1280] + - [51, 19758.0] + - - [128, 5888, 1, 256, 128, 128, 128, 256] + - [43, 15460.0] + - - [1024, 1024, 1, 1024, 1024, 1024, 1024, 1024] + - [45, 18838.0] + - - [1408, 448, 1, 3328, 1408, 1408, 1408, 3328] + - [40, 19327.0] + - - [64, 5888, 1, 1280, 64, 64, 64, 1280] + - [53, 13186.0] + - - [704, 704, 1, 1280, 704, 704, 704, 1280] + - [39, 15177.0] + - - [128, 2368, 1, 1280, 128, 128, 128, 1280] + - [45, 14640.0] + - - [3584, 256, 1, 1280, 3584, 3584, 3584, 1280] + - [58, 19791.0] + - - [5888, 64, 1, 1280, 5888, 5888, 5888, 1280] + - [45, 13957.0] + - - [5056, 128, 1, 1280, 5056, 5056, 5056, 1280] + - [38, 19008.0] + - - [448, 1856, 1, 3328, 448, 448, 448, 3328] + - [41, 18215.0] + - - [1024, 448, 1, 256, 1024, 1024, 1024, 256] + - [48, 13020.0] + - - [2944, 128, 1, 1280, 2944, 2944, 2944, 1280] + - [38, 14347.0] + - - [256, 2944, 1, 1280, 256, 256, 256, 1280] + - [38, 18466.0] + - - [2560, 128, 1, 2560, 2560, 2560, 2560, 2560] + - [40, 16926.0] + - - [704, 1024, 1, 3328, 704, 704, 704, 3328] + - [40, 18022.0] + - - [1856, 448, 1, 1280, 1856, 1856, 1856, 1280] + - [39, 18051.0] + - - [128, 6784, 1, 256, 128, 128, 128, 256] + - [52, 15480.0] + - - [704, 1408, 1, 256, 704, 704, 704, 256] + - [39, 16040.0] + - - [4096, 128, 1, 4096, 4096, 4096, 4096, 4096] + - [41, 17084.0] + - - [448, 2944, 1, 1280, 448, 448, 448, 1280] + - [56, 18613.0] + - - [1856, 256, 1, 256, 1856, 1856, 1856, 256] + - [38, 12212.0] + - - [5056, 128, 1, 256, 5056, 5056, 5056, 256] + - [38, 14953.0] + - - [6784, 128, 1, 1280, 6784, 6784, 6784, 1280] + - [41, 18782.0] + - - [1856, 448, 1, 256, 1856, 1856, 1856, 256] + - [38, 14823.0] + - - [128, 4288, 1, 1280, 128, 128, 128, 1280] + - [60, 16255.0] + - - [448, 704, 1, 3328, 448, 448, 448, 3328] + - [49, 15680.0] + - - [448, 1856, 1, 256, 448, 448, 448, 256] + - [56, 15161.0] + - - [1856, 704, 1, 3328, 1856, 1856, 1856, 3328] + - [40, 19212.0] + - - [512, 1500, 1, 2560, 512, 512, 512, 2560] + - [57, 18850.0] + - - [3136, 64, 128, 64, 3136, 3136, 3136, 64] + - [50, 18334.0] + - - [3136, 64, 64, 256, 3136, 3136, 3136, 256] + - [42, 16397.0] + - - [3136, 64, 128, 256, 3136, 3136, 3136, 256] + - [59, 16232.0] + - - [3136, 64, 256, 64, 3136, 3136, 3136, 64] + - [37, 11742.0] + - - [3136, 64, 64, 64, 3136, 3136, 3136, 64] + - [55, 19217.0] + - - [3136, 64, 256, 256, 3136, 3136, 3136, 256] + - [42, 16413.0] + - - [2368, 64, 1, 3328, 2368, 2368, 2368, 3328] + - [83, 8750.0] + - - [256, 704, 1, 1280, 256, 256, 256, 1280] + - [83, 9183.0] + - - [1408, 64, 1, 1280, 1408, 1408, 1408, 1280] + - [67, 5952.0] + - - [4096, 32, 1, 4096, 4096, 4096, 4096, 4096] + - [67, 7371.0] + - - [3072, 64, 1, 1024, 3072, 3072, 3072, 1024] + - [73, 9569.0] + - - [1024, 256, 1, 3328, 1024, 1024, 1024, 3328] + - [73, 9259.0] + - - [6144, 32, 1, 2560, 6144, 6144, 6144, 2560] + - [73, 9979.0] + - - [704, 128, 1, 1280, 704, 704, 704, 1280] + - [67, 5946.0] + - - [64, 3584, 1, 3328, 64, 64, 64, 3328] + - [89, 9987.0] + - - [1024, 256, 1, 256, 1024, 1024, 1024, 256] + - [73, 8085.0] + - - [448, 448, 1, 256, 448, 448, 448, 256] + - [83, 8260.0] + - - [7680, 32, 1, 2560, 7680, 7680, 7680, 2560] + - [83, 10531.0] + - - [128, 1024, 1, 3328, 128, 128, 128, 3328] + - [70, 7899.0] + - - [64, 1856, 1, 1280, 64, 64, 64, 1280] + - [79, 7797.0] + - - [448, 256, 1, 256, 448, 448, 448, 256] + - [79, 5825.0] + - - [256, 1024, 1, 256, 256, 256, 256, 256] + - [83, 8164.0] + - - [1024, 128, 1, 1280, 1024, 1024, 1024, 1280] + - [73, 7263.0] + - - [3072, 32, 1, 1024, 3072, 3072, 3072, 1024] + - [75, 6229.0] + - - [448, 256, 1, 3328, 448, 448, 448, 3328] + - [86, 7948.0] + - - [128, 704, 1, 1280, 128, 128, 128, 1280] + - [79, 6317.0] + - - [1856, 128, 1, 3328, 1856, 1856, 1856, 3328] + - [64, 10297.0] + - - [256, 448, 1, 256, 256, 256, 256, 256] + - [75, 5300.0] + - - [8448, 32, 1, 2816, 8448, 8448, 8448, 2816] + - [79, 9445.0] + - - [448, 448, 1, 3328, 448, 448, 448, 3328] + - [73, 10559.0] + - - [1408, 128, 1, 1280, 1408, 1408, 1408, 1280] + - [73, 9213.0] + - - [128, 1856, 1, 1280, 128, 128, 128, 1280] + - [74, 10076.0] + - - [2048, 128, 1, 2048, 2048, 2048, 2048, 2048] + - [79, 9158.0] + - - [64, 1408, 1, 3328, 64, 64, 64, 3328] + - [67, 6277.0] + - - [256, 704, 1, 256, 256, 256, 256, 256] + - [83, 7300.0] + - - [128, 1408, 1, 256, 128, 128, 128, 256] + - [73, 7254.0] + - - [256, 448, 1, 3328, 256, 256, 256, 3328] + - [67, 7978.0] + - - [64, 2368, 1, 1280, 64, 64, 64, 1280] + - [88, 8874.0] + - - [2368, 64, 1, 256, 2368, 2368, 2368, 256] + - [79, 6621.0] + - - [704, 128, 1, 3328, 704, 704, 704, 3328] + - [86, 6332.0] + - - [4288, 64, 1, 1280, 4288, 4288, 4288, 1280] + - [70, 9377.0] + - - [2560, 64, 1, 2560, 2560, 2560, 2560, 2560] + - [83, 9051.0] + - - [128, 1024, 1, 1280, 128, 128, 128, 1280] + - [88, 8020.0] + - - [128, 1024, 1, 256, 128, 128, 128, 256] + - [88, 5928.0] + - - [1856, 64, 1, 256, 1856, 1856, 1856, 256] + - [67, 5529.0] + - - [704, 128, 1, 256, 704, 704, 704, 256] + - [79, 4689.0] + - - [448, 256, 1, 1280, 448, 448, 448, 1280] + - [70, 7670.0] + - - [1856, 128, 1, 1280, 1856, 1856, 1856, 1280] + - [64, 10157.0] + - - [64, 3584, 1, 256, 64, 64, 64, 256] + - [71, 8066.0] + - - [64, 1856, 1, 256, 64, 64, 64, 256] + - [79, 5695.0] + - - [256, 1024, 1, 1280, 256, 256, 256, 1280] + - [83, 9394.0] + - - [3584, 64, 1, 1280, 3584, 3584, 3584, 1280] + - [64, 9754.0] + - - [1408, 128, 1, 3328, 1408, 1408, 1408, 3328] + - [73, 9560.0] + - - [64, 2944, 1, 3328, 64, 64, 64, 3328] + - [79, 9713.0] + - - [64, 4288, 1, 3328, 64, 64, 64, 3328] + - [70, 9884.0] + - - [128, 1500, 1, 1280, 128, 128, 128, 1280] + - [64, 9416.0] + - - [64, 2944, 1, 256, 64, 64, 64, 256] + - [79, 7375.0] + - - [64, 1408, 1, 1280, 64, 64, 64, 1280] + - [88, 6289.0] + - - [64, 2944, 1, 1280, 64, 64, 64, 1280] + - [70, 9233.0] + - - [704, 256, 1, 256, 704, 704, 704, 256] + - [73, 7442.0] + - - [256, 448, 1, 1280, 256, 256, 256, 1280] + - [67, 7497.0] + - - [704, 256, 1, 1280, 704, 704, 704, 1280] + - [64, 9140.0] + - - [64, 2368, 1, 3328, 64, 64, 64, 3328] + - [79, 8741.0] + - - [256, 704, 1, 3328, 256, 256, 256, 3328] + - [83, 9545.0] + - - [4096, 64, 1, 4096, 4096, 4096, 4096, 4096] + - [79, 9266.0] + - - [1760, 128, 1, 1760, 1760, 1760, 1760, 1760] + - [65, 9858.0] + - - [2944, 64, 1, 1280, 2944, 2944, 2944, 1280] + - [83, 9586.0] + - - [128, 1408, 1, 3328, 128, 128, 128, 3328] + - [64, 9472.0] + - - [1408, 64, 1, 256, 1408, 1408, 1408, 256] + - [63, 4436.0] + - - [64, 2368, 1, 256, 64, 64, 64, 256] + - [71, 6736.0] + - - [1024, 128, 1, 3328, 1024, 1024, 1024, 3328] + - [65, 7523.0] + - - [2368, 64, 1, 1280, 2368, 2368, 2368, 1280] + - [83, 8398.0] + - - [4288, 64, 1, 256, 4288, 4288, 4288, 256] + - [64, 8526.0] + - - [64, 4288, 1, 1280, 64, 64, 64, 1280] + - [70, 9709.0] + - - [1408, 64, 1, 3328, 1408, 1408, 1408, 3328] + - [86, 6324.0] + - - [2944, 64, 1, 256, 2944, 2944, 2944, 256] + - [73, 7656.0] + - - [448, 448, 1, 1280, 448, 448, 448, 1280] + - [73, 10154.0] + - - [1024, 256, 1, 1280, 1024, 1024, 1024, 1280] + - [73, 9383.0] + - - [3584, 64, 1, 3328, 3584, 3584, 3584, 3328] + - [84, 9857.0] + - - [256, 1024, 1, 3328, 256, 256, 256, 3328] + - [70, 9535.0] + - - [1856, 64, 1, 3328, 1856, 1856, 1856, 3328] + - [75, 8243.0] + - - [1856, 64, 1, 1280, 1856, 1856, 1856, 1280] + - [75, 7718.0] + - - [4608, 32, 1, 1536, 4608, 4608, 4608, 1536] + - [67, 7919.0] + - - [1024, 128, 1, 256, 1024, 1024, 1024, 256] + - [83, 5649.0] + - - [64, 3584, 1, 1280, 64, 64, 64, 1280] + - [71, 9671.0] + - - [3584, 64, 1, 256, 3584, 3584, 3584, 256] + - [70, 8022.0] + - - [64, 1856, 1, 3328, 64, 64, 64, 3328] + - [75, 8185.0] + - - [1408, 128, 1, 256, 1408, 1408, 1408, 256] + - [73, 7418.0] + - - [128, 704, 1, 256, 128, 128, 128, 256] + - [78, 4453.0] + - - [128, 704, 1, 3328, 128, 128, 128, 3328] + - [67, 6300.0] + - - [128, 1856, 1, 256, 128, 128, 128, 256] + - [71, 8400.0] + - - [64, 4288, 1, 256, 64, 64, 64, 256] + - [79, 8265.0] + - - [2560, 32, 1, 2560, 2560, 2560, 2560, 2560] + - [81, 6275.0] + - - [704, 256, 1, 3328, 704, 704, 704, 3328] + - [73, 9560.0] + - - [176, 1500, 1, 1408, 176, 176, 176, 1408] + - [65, 8681.0] + - - [1856, 128, 1, 256, 1856, 1856, 1856, 256] + - [64, 8354.0] + - - [4288, 64, 1, 3328, 4288, 4288, 4288, 3328] + - [83, 9474.0] + - - [2048, 64, 1, 2048, 2048, 2048, 2048, 2048] + - [87, 7228.0] + - - [64, 1408, 1, 256, 64, 64, 64, 256] + - [79, 4402.0] + - - [2944, 64, 1, 3328, 2944, 2944, 2944, 3328] + - [83, 9900.0] + - - [128, 1408, 1, 1280, 128, 128, 128, 1280] + - [73, 9104.0] + - - [128, 1856, 1, 3328, 128, 128, 128, 3328] + - [74, 10392.0] + - - [1760, 64, 1, 1760, 1760, 1760, 1760, 1760] + - [67, 7442.0] + - - [1760, 32, 1, 1760, 1760, 1760, 1760, 1760] + - [94, 5387.0] + - - [2560, 16, 1, 2560, 2560, 2560, 2560, 2560] + - [93, 4749.0] + - - [1760, 16, 1, 1760, 1760, 1760, 1760, 1760] + - [93, 3210.0] + - - [8448, 4, 1, 2816, 8448, 8448, 8448, 2816] + - [94, 3021.0] + - - [7680, 16, 1, 2560, 7680, 7680, 7680, 2560] + - [97, 7508.0] + - - [4608, 1, 1, 1536, 4608, 4608, 4608, 1536] + - [98, 506.0] + - - [7680, 4, 1, 2560, 7680, 7680, 7680, 2560] + - [103, 2823.0] + - - [8448, 16, 1, 2816, 8448, 8448, 8448, 2816] + - [91, 7499.0] + - - [3072, 2, 1, 1024, 3072, 3072, 3072, 1024] + - [103, 655.0] + - - [6144, 16, 1, 2560, 6144, 6144, 6144, 2560] + - [92, 7236.0] + - - [7680, 1, 1, 2560, 7680, 7680, 7680, 2560] + - [94, 723.0] + - - [4608, 4, 1, 1536, 4608, 4608, 4608, 1536] + - [103, 2025.0] + - - [3072, 1, 1, 128, 3072, 3072, 3072, 128] + - [102, 141.0] + - - [2048, 32, 1, 2048, 2048, 2048, 2048, 2048] + - [92, 5971.0] + - - [2048, 16, 1, 2048, 2048, 2048, 2048, 2048] + - [103, 3753.0] + - - [8448, 1, 1, 2816, 8448, 8448, 8448, 2816] + - [94, 758.0] + - - [6144, 4, 1, 2560, 6144, 6144, 6144, 2560] + - [103, 2587.0] + - - [3072, 1, 1, 1024, 3072, 3072, 3072, 1024] + - [98, 326.0] + - - [3072, 16, 1, 1024, 3072, 3072, 3072, 1024] + - [66, 4686.0] + - - [4096, 16, 1, 4096, 4096, 4096, 4096, 4096] + - [92, 6092.0] + - - [6144, 1, 1, 2560, 6144, 6144, 6144, 2560] + - [101, 646.0] + - - [3072, 4, 1, 1024, 3072, 3072, 3072, 1024] + - [103, 1287.0] + - - [7680, 2, 1, 2560, 7680, 7680, 7680, 2560] + - [99, 1403.0] + - - [4224, 1, 1, 128, 4224, 4224, 4224, 128] + - [96, 183.0] + - - [8448, 2, 1, 2816, 8448, 8448, 8448, 2816] + - [94, 1515.0] + - - [4608, 2, 1, 1536, 4608, 4608, 4608, 1536] + - [98, 1005.0] + - - [4608, 16, 1, 1536, 4608, 4608, 4608, 1536] + - [100, 6391.0] + - - [6144, 2, 1, 2560, 6144, 6144, 6144, 2560] + - [95, 1290.0] + - - [1024, 1, 1, 500000, 1024, 1024, 1024, 500000] + - [69, 136.0] + - - [1024, 16, 1, 500000, 1024, 1024, 1024, 500000] + - [77, 2172.0] + - - [1024, 2, 1, 500000, 1024, 1024, 1024, 500000] + - [69, 272.0] + - - [512, 1, 1, 500000, 512, 512, 512, 500000] + - [69, 73.0] + - - [1024, 8, 1, 500000, 1024, 1024, 1024, 500000] + - [69, 1086.0] + - - [1024, 4, 1, 500000, 1024, 1024, 1024, 500000] + - [69, 543.0] + - - [512, 16, 1, 500000, 512, 512, 512, 500000] + - [82, 1164.0] + - - [512, 2, 1, 500000, 512, 512, 512, 500000] + - [69, 147.0] + - - [512, 8, 1, 500000, 512, 512, 512, 500000] + - [69, 587.0] + - - [512, 4, 1, 500000, 512, 512, 512, 500000] + - [69, 294.0] + - - [512, 4, 1, 512, 512, 512, 512, 512] + - [72, 189.0] + - - [448, 64, 1, 1280, 448, 448, 448, 1280] + - [66, 3180.0] + - - [64, 1024, 1, 1280, 64, 64, 64, 1280] + - [75, 4987.0] + - - [64, 704, 1, 1280, 64, 64, 64, 1280] + - [66, 4450.0] + - - [256, 128, 1, 256, 256, 256, 256, 256] + - [66, 2356.0] + - - [64, 1024, 1, 3328, 64, 64, 64, 3328] + - [67, 5367.0] + - - [128, 1, 1, 1408, 128, 128, 128, 1408] + - [66, 15.0] + - - [1024, 64, 1, 1280, 1024, 1024, 1024, 1280] + - [85, 4946.0] + - - [256, 256, 1, 3328, 256, 256, 256, 3328] + - [75, 5388.0] + - - [64, 448, 1, 1280, 64, 64, 64, 1280] + - [80, 3186.0] + - - [512, 32, 1, 512, 512, 512, 512, 512] + - [66, 1520.0] + - - [64, 64, 1, 3328, 64, 64, 64, 3328] + - [68, 547.0] + - - [512, 1, 1, 512, 512, 512, 512, 512] + - [66, 47.0] + - - [512, 2, 1, 512, 512, 512, 512, 512] + - [66, 94.0] + - - [704, 64, 1, 3328, 704, 704, 704, 3328] + - [85, 4689.0] + - - [64, 128, 1, 256, 64, 64, 64, 256] + - [72, 617.0] + - - [704, 64, 1, 1280, 704, 704, 704, 1280] + - [66, 4464.0] + - - [128, 448, 1, 256, 128, 128, 128, 256] + - [66, 3688.0] + - - [448, 64, 1, 3328, 448, 448, 448, 3328] + - [66, 3389.0] + - - [64, 128, 1, 3328, 64, 64, 64, 3328] + - [76, 1082.0] + - - [128, 128, 1, 3328, 128, 128, 128, 3328] + - [69, 2022.0] + - - [64, 1, 1, 1216, 64, 64, 64, 1216] + - [66, 7.0] + - - [256, 256, 1, 256, 256, 256, 256, 256] + - [66, 3796.0] + - - [128, 64, 1, 1280, 128, 128, 128, 1280] + - [69, 953.0] + - - [64, 1024, 1, 256, 64, 64, 64, 256] + - [66, 3762.0] + - - [64, 704, 1, 256, 64, 64, 64, 256] + - [66, 3051.0] + - - [1024, 2, 1, 512, 1024, 1024, 1024, 512] + - [72, 183.0] + - - [256, 64, 1, 3328, 256, 256, 256, 3328] + - [69, 2019.0] + - - [448, 128, 1, 256, 448, 448, 448, 256] + - [66, 3634.0] + - - [64, 704, 1, 3328, 64, 64, 64, 3328] + - [66, 4772.0] + - - [64, 448, 1, 3328, 64, 64, 64, 3328] + - [66, 3437.0] + - - [448, 128, 1, 3328, 448, 448, 448, 3328] + - [85, 5249.0] + - - [128, 256, 1, 1280, 128, 128, 128, 1280] + - [85, 3537.0] + - - [64, 448, 1, 256, 64, 64, 64, 256] + - [66, 2073.0] + - - [64, 256, 1, 1280, 64, 64, 64, 1280] + - [72, 1872.0] + - - [64, 128, 1, 1280, 64, 64, 64, 1280] + - [69, 953.0] + - - [1024, 32, 1, 512, 1024, 1024, 1024, 512] + - [72, 2913.0] + - - [64, 64, 1, 256, 64, 64, 64, 256] + - [72, 301.0] + - - [256, 128, 1, 1280, 256, 256, 256, 1280] + - [85, 3542.0] + - - [128, 256, 1, 3328, 128, 128, 128, 3328] + - [66, 3829.0] + - - [256, 64, 1, 256, 256, 256, 256, 256] + - [72, 1198.0] + - - [128, 128, 1, 1280, 128, 128, 128, 1280] + - [80, 1872.0] + - - [128, 256, 1, 256, 128, 128, 128, 256] + - [66, 2343.0] + - - [256, 64, 1, 1280, 256, 256, 256, 1280] + - [72, 1846.0] + - - [704, 64, 1, 256, 704, 704, 704, 256] + - [66, 3101.0] + - - [128, 448, 1, 1280, 128, 128, 128, 1280] + - [66, 4959.0] + - - [64, 64, 1, 1280, 64, 64, 64, 1280] + - [68, 481.0] + - - [128, 64, 1, 3328, 128, 128, 128, 3328] + - [69, 1076.0] + - - [448, 64, 1, 256, 448, 448, 448, 256] + - [72, 2062.0] + - - [1024, 16, 1, 512, 1024, 1024, 1024, 512] + - [72, 1503.0] + - - [512, 16, 1, 512, 512, 512, 512, 512] + - [66, 765.0] + - - [1024, 64, 1, 256, 1024, 1024, 1024, 256] + - [66, 3796.0] + - - [128, 1, 1, 1024, 128, 128, 128, 1024] + - [66, 14.0] + - - [448, 128, 1, 1280, 448, 448, 448, 1280] + - [66, 4939.0] + - - [1024, 64, 1, 3328, 1024, 1024, 1024, 3328] + - [67, 5271.0] + - - [128, 64, 1, 256, 128, 128, 128, 256] + - [72, 603.0] + - - [64, 256, 1, 3328, 64, 64, 64, 3328] + - [69, 2022.0] + - - [256, 256, 1, 1280, 256, 256, 256, 1280] + - [75, 4975.0] + - - [256, 128, 1, 3328, 256, 256, 256, 3328] + - [66, 3824.0] + - - [64, 256, 1, 256, 64, 64, 64, 256] + - [72, 1205.0] + - - [1024, 4, 1, 512, 1024, 1024, 1024, 512] + - [72, 368.0] + - - [128, 448, 1, 3328, 128, 128, 128, 3328] + - [66, 5249.0] + - - [1024, 1, 1, 512, 1024, 1024, 1024, 512] + - [90, 93.0] + - - [128, 128, 1, 256, 128, 128, 128, 256] + - [72, 1205.0] +- null +- null +- DeviceEfficiency +... diff --git a/library/src/blas3/Tensile/Logic/asm_full/navi22_Cijk_Ailk_Bljk_SB.yaml b/library/src/blas3/Tensile/Logic/asm_full/navi22_Cijk_Ailk_Bljk_SB.yaml new file mode 100644 index 000000000..79264a493 --- /dev/null +++ b/library/src/blas3/Tensile/Logic/asm_full/navi22_Cijk_Ailk_Bljk_SB.yaml @@ -0,0 +1,70006 @@ +--- +- {MinimumRequiredVersion: 4.28.0} +- navi22 +- gfx1031 +- [Device 73df] +- AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] +- - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 4 + LSPB: 64 + LVCA: 32 + LVCB: 2 + LVPA: 1 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 0 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 4 + LSPB: 64 + LVCA: 32 + LVCB: 2 + LVPA: 1 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 1 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x8_SN_SU0_SUM0_TT8_16_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 128 + LVCA: 32 + LVCB: 2 + LVPA: 2 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 2 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x8_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 4 + LSPB: 32 + LVCA: 32 + LVCB: 4 + LVPA: 1 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 3 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x16_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 4 + LSPB: 32 + LVCA: 32 + LVCB: 4 + LVPA: 1 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 4 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x16_SN_SU0_SUM0_TT8_16_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 5 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x16_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 256 + MacroTileA: 128 + MacroTileB: 256 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 6 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x256x16_SN_SU0_SUM0_TT8_16_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 4 + LSPB: 64 + LVCA: 32 + LVCB: 2 + LVPA: 1 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 7 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 4 + LSPB: 64 + LVCA: 32 + LVCB: 2 + LVPA: 1 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 8 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x8_SN_SU32_SUM3_TT8_16_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 128 + LVCA: 32 + LVCB: 2 + LVPA: 2 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 9 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x8_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 4 + LSPB: 32 + LVCA: 32 + LVCB: 4 + LVPA: 1 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 10 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x16_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 4 + LSPB: 32 + LVCA: 32 + LVCB: 4 + LVPA: 1 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 11 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x16_SN_SU32_SUM3_TT8_16_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 256 + MacroTileA: 128 + MacroTileB: 256 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 12 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x256x16_SN_SU32_SUM3_TT8_16_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 4 + LSPB: 16 + LVCA: 32 + LVCB: 8 + LVPA: 1 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 13 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x32_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 8 + LSPB: 32 + LVCA: 32 + LVCB: 8 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 14 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x32_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 4 + LSPB: 64 + LVCA: 32 + LVCB: 2 + LVPA: 1 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 15 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 4 + LSPB: 64 + LVCA: 32 + LVCB: 2 + LVPA: 1 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 16 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x8_SN_SU0_SUM0_TT8_16_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 128 + LVCA: 32 + LVCB: 2 + LVPA: 2 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 17 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x8_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 4 + LSPB: 32 + LVCA: 32 + LVCB: 4 + LVPA: 1 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 18 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x16_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 4 + LSPB: 32 + LVCA: 32 + LVCB: 4 + LVPA: 1 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 19 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x16_SN_SU0_SUM0_TT8_16_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 256 + MacroTileA: 128 + MacroTileB: 256 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 20 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x256x16_SN_SU0_SUM0_TT8_16_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 4 + LSPB: 64 + LVCA: 32 + LVCB: 2 + LVPA: 1 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 21 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 4 + LSPB: 64 + LVCA: 32 + LVCB: 2 + LVPA: 1 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 22 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x8_SN_SU32_SUM3_TT8_16_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 128 + LVCA: 32 + LVCB: 2 + LVPA: 2 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 23 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x8_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 4 + LSPB: 32 + LVCA: 32 + LVCB: 4 + LVPA: 1 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 24 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x16_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 4 + LSPB: 32 + LVCA: 32 + LVCB: 4 + LVPA: 1 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 25 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x16_SN_SU32_SUM3_TT8_16_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 26 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x16_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 256 + MacroTileA: 128 + MacroTileB: 256 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 27 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x256x16_SN_SU32_SUM3_TT8_16_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 4 + LSPB: 64 + LVCA: 32 + LVCB: 2 + LVPA: 1 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 28 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 4 + LSPB: 64 + LVCA: 32 + LVCB: 2 + LVPA: 1 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 29 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x8_SN_SU0_SUM0_TT8_16_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 128 + LVCA: 32 + LVCB: 2 + LVPA: 2 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 30 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x8_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 4 + LSPB: 32 + LVCA: 32 + LVCB: 4 + LVPA: 1 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 31 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x16_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 4 + LSPB: 32 + LVCA: 32 + LVCB: 4 + LVPA: 1 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 32 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x16_SN_SU0_SUM0_TT8_16_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 33 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x16_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 256 + MacroTileA: 128 + MacroTileB: 256 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 34 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x256x16_SN_SU0_SUM0_TT8_16_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 4 + LSPB: 64 + LVCA: 32 + LVCB: 2 + LVPA: 1 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 35 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 4 + LSPB: 64 + LVCA: 32 + LVCB: 2 + LVPA: 1 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 36 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x8_SN_SU32_SUM3_TT8_16_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 128 + LVCA: 32 + LVCB: 2 + LVPA: 2 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 37 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x8_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 4 + LSPB: 32 + LVCA: 32 + LVCB: 4 + LVPA: 1 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 38 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x16_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 4 + LSPB: 32 + LVCA: 32 + LVCB: 4 + LVPA: 1 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 39 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x16_SN_SU32_SUM3_TT8_16_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 40 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x16_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 256 + MacroTileA: 128 + MacroTileB: 256 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 41 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x256x16_SN_SU32_SUM3_TT8_16_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 4 + LSPB: 64 + LVCA: 32 + LVCB: 2 + LVPA: 1 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 42 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 4 + LSPB: 64 + LVCA: 32 + LVCB: 2 + LVPA: 1 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 43 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x8_SN_SU0_SUM0_TT8_16_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 4 + LSPB: 64 + LVCA: 32 + LVCB: 2 + LVPA: 1 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 44 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 4 + LSPB: 64 + LVCA: 32 + LVCB: 2 + LVPA: 1 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 45 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 4 + LSPB: 64 + LVCA: 32 + LVCB: 2 + LVPA: 1 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 46 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 8 + LSPA: 8 + LSPB: 32 + LVCA: 8 + LVCB: 2 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 47 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x8_SN_SU0_SUM0_TT4_4_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 4 + LSPB: 32 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 48 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SN_SU0_SUM0_TT8_8_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 16 + LVCB: 2 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 49 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SN_SU0_SUM0_TT4_8_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 4 + LSPB: 64 + LVCA: 32 + LVCB: 2 + LVPA: 1 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 50 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 128 + LVCA: 32 + LVCB: 2 + LVPA: 2 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 51 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x8_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 4 + LSPB: 16 + LVCA: 16 + LVCB: 4 + LVPA: 1 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 52 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SN_SU0_SUM0_TT8_8_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 8 + LSPB: 32 + LVCA: 16 + LVCB: 4 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 53 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SN_SU0_SUM0_TT4_8_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 4 + LSPB: 32 + LVCA: 32 + LVCB: 4 + LVPA: 1 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 54 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x16_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 55 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x16_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 4 + LSPB: 8 + LVCA: 16 + LVCB: 8 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 56 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x32_SN_SU0_SUM0_TT8_8_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 8 + LSPA: 8 + LSPB: 32 + LVCA: 8 + LVCB: 2 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 57 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x8_SN_SU32_SUM3_TT4_4_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 8 + LSPA: 8 + LSPB: 32 + LVCA: 8 + LVCB: 2 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 256 + LdsOffsetB_Blk: 1280 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 58 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x64x8_SN_SU32_SUM3_TT4_8_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 4 + LSPB: 32 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 59 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SN_SU32_SUM3_TT8_8_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 16 + LVCB: 2 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 60 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SN_SU32_SUM3_TT4_8_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 4 + LSPB: 64 + LVCA: 32 + LVCB: 2 + LVPA: 1 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 61 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 128 + LVCA: 32 + LVCB: 2 + LVPA: 2 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 62 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x8_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 4 + LSPB: 16 + LVCA: 16 + LVCB: 4 + LVPA: 1 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 63 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SN_SU32_SUM3_TT8_8_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 8 + LSPB: 32 + LVCA: 16 + LVCB: 4 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 64 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SN_SU32_SUM3_TT4_8_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 4 + LSPB: 32 + LVCA: 32 + LVCB: 4 + LVPA: 1 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 65 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x16_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 16 + LSPB: 64 + LVCA: 16 + LVCB: 4 + LVPA: 4 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 66 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x128x16_SN_SU32_SUM3_TT4_8_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 67 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x16_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 4 + LSPB: 8 + LVCA: 16 + LVCB: 8 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 68 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x32_SN_SU32_SUM3_TT8_8_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 8 + LSPB: 16 + LVCA: 16 + LVCB: 8 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 69 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x32_SN_SU32_SUM3_TT4_8_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 8 + LSPA: 8 + LSPB: 32 + LVCA: 8 + LVCB: 2 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 70 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x8_SN_SU0_SUM0_TT4_4_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 8 + LSPA: 8 + LSPB: 32 + LVCA: 8 + LVCB: 2 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 256 + LdsOffsetB_Blk: 1280 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 71 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x64x8_SN_SU0_SUM0_TT4_8_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 4 + LSPB: 32 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 72 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SN_SU0_SUM0_TT8_8_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 4 + LSPB: 64 + LVCA: 32 + LVCB: 2 + LVPA: 1 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 73 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 128 + LVCA: 32 + LVCB: 2 + LVPA: 2 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 74 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x8_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 4 + LSPB: 16 + LVCA: 16 + LVCB: 4 + LVPA: 1 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 75 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SN_SU0_SUM0_TT8_8_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 8 + LSPB: 32 + LVCA: 16 + LVCB: 4 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 76 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SN_SU0_SUM0_TT4_8_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 4 + LSPB: 32 + LVCA: 32 + LVCB: 4 + LVPA: 1 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 77 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x16_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 16 + LSPB: 64 + LVCA: 16 + LVCB: 4 + LVPA: 4 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 78 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x128x16_SN_SU0_SUM0_TT4_8_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 4 + LSPB: 8 + LVCA: 16 + LVCB: 8 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 79 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x32_SN_SU0_SUM0_TT8_8_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 8 + LSPA: 8 + LSPB: 32 + LVCA: 8 + LVCB: 2 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 80 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x8_SN_SU32_SUM3_TT4_4_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 4 + LSPB: 32 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 81 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SN_SU32_SUM3_TT8_8_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 16 + LVCB: 2 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 82 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SN_SU32_SUM3_TT4_8_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 4 + LSPB: 64 + LVCA: 32 + LVCB: 2 + LVPA: 1 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 83 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 128 + LVCA: 32 + LVCB: 2 + LVPA: 2 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 84 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x8_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 4 + LSPB: 16 + LVCA: 16 + LVCB: 4 + LVPA: 1 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 85 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SN_SU32_SUM3_TT8_8_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 8 + LSPB: 32 + LVCA: 16 + LVCB: 4 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 86 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SN_SU32_SUM3_TT4_8_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 4 + LSPB: 32 + LVCA: 32 + LVCB: 4 + LVPA: 1 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 87 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x16_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 16 + LSPB: 64 + LVCA: 16 + LVCB: 4 + LVPA: 4 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 88 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SN_SU32_SUM3_TT4_4_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 16 + LSPB: 64 + LVCA: 16 + LVCB: 4 + LVPA: 4 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 89 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x128x16_SN_SU32_SUM3_TT4_8_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 90 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x16_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 4 + LSPB: 8 + LVCA: 16 + LVCB: 8 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 91 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x32_SN_SU32_SUM3_TT8_8_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 8 + LSPB: 16 + LVCA: 16 + LVCB: 8 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 92 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x32_SN_SU32_SUM3_TT4_8_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 93 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x32_SN_SU32_SUM3_TT4_4_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 4 + LSPB: 32 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 94 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SN_SU0_SUM0_TT8_8_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 8 + LSPB: 32 + LVCA: 16 + LVCB: 4 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 95 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x8_SN_SU0_SUM0_TT4_4_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 4 + LSPB: 64 + LVCA: 32 + LVCB: 2 + LVPA: 1 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 96 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 128 + LVCA: 32 + LVCB: 2 + LVPA: 2 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 97 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x8_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 4 + LSPB: 16 + LVCA: 16 + LVCB: 4 + LVPA: 1 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 98 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SN_SU0_SUM0_TT8_8_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 4 + LSPB: 32 + LVCA: 32 + LVCB: 4 + LVPA: 1 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 99 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x16_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 8 + LSPA: 8 + LSPB: 32 + LVCA: 8 + LVCB: 2 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 100 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x8_SN_SU32_SUM3_TT4_4_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 4 + LSPB: 32 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 101 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SN_SU32_SUM3_TT8_8_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 8 + LSPB: 32 + LVCA: 16 + LVCB: 4 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 102 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x8_SN_SU32_SUM3_TT4_4_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 16 + LVCB: 2 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 103 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SN_SU32_SUM3_TT4_8_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 4 + LSPB: 64 + LVCA: 32 + LVCB: 2 + LVPA: 1 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 104 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 128 + LVCA: 32 + LVCB: 2 + LVPA: 2 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 105 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x8_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 4 + LSPB: 16 + LVCA: 16 + LVCB: 4 + LVPA: 1 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 106 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SN_SU32_SUM3_TT8_8_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 8 + LSPB: 32 + LVCA: 16 + LVCB: 4 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 107 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SN_SU32_SUM3_TT4_8_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 4 + LSPB: 32 + LVCA: 32 + LVCB: 4 + LVPA: 1 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 108 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x16_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 16 + LSPB: 64 + LVCA: 16 + LVCB: 4 + LVPA: 4 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 109 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x128x16_SN_SU32_SUM3_TT4_8_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 110 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x16_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 8 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 111 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x64x32_SN_SU32_SUM3_TT4_8_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 4 + LSPB: 8 + LVCA: 16 + LVCB: 8 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 112 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x32_SN_SU32_SUM3_TT8_8_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 8 + LSPB: 16 + LVCA: 16 + LVCB: 8 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 113 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x32_SN_SU32_SUM3_TT4_8_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 114 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x32_SN_SU32_SUM3_TT4_4_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 115 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x128x32_SN_SU32_SUM3_TT4_8_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 4 + LSPB: 64 + LVCA: 32 + LVCB: 2 + LVPA: 1 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 116 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 128 + LVCA: 32 + LVCB: 2 + LVPA: 2 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 117 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x8_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 4 + LSPB: 64 + LVCA: 32 + LVCB: 2 + LVPA: 1 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 118 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 128 + LVCA: 32 + LVCB: 2 + LVPA: 2 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 119 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x8_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 128 + LVCA: 32 + LVCB: 2 + LVPA: 2 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 120 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x8_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 128 + LVCA: 32 + LVCB: 2 + LVPA: 2 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 121 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x8_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 128 + LVCA: 32 + LVCB: 2 + LVPA: 2 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 122 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x8_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 8 + LSPA: 4 + LSPB: 8 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 512 + LdsNumElementsAlignedA: 128 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 256 + LdsOffsetB: 128 + LdsOffsetB_Blk: 384 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 123 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x8_SN_SU0_SUM0_TT2_2_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 8 + LSPA: 2 + LSPB: 8 + LVCA: 32 + LVCB: 8 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 124 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x8_SN_SU0_SUM0_TT4_4_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 8 + LSPA: 4 + LSPB: 16 + LVCA: 32 + LVCB: 8 + LVPA: 4 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 896 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 125 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x8_SN_SU0_SUM0_TT2_2_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 2 + LSPB: 16 + LVCA: 64 + LVCB: 8 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 126 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x8_SN_SU0_SUM0_TT4_4_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 8 + LSPA: 8 + LSPB: 32 + LVCA: 32 + LVCB: 8 + LVPA: 8 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 127 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x8_SN_SU0_SUM0_TT2_2_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 4 + LSPB: 32 + LVCA: 64 + LVCB: 8 + LVPA: 4 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 128 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SN_SU0_SUM0_TT4_4_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 129 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x16_SN_SU0_SUM0_TT2_2_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 130 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x16_SN_SU0_SUM0_TT2_2_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 2 + LSPB: 8 + LVCA: 64 + LVCB: 16 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 131 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_SN_SU0_SUM0_TT4_4_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 8 + LSPB: 16 + LVCA: 32 + LVCB: 16 + LVPA: 8 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 132 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x16_SN_SU0_SUM0_TT2_2_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 4 + LSPB: 16 + LVCA: 64 + LVCB: 16 + LVPA: 4 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 133 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SN_SU0_SUM0_TT4_4_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 32 + LSPA: 4 + LSPB: 2 + LVCA: 16 + LVCB: 32 + LVPA: 4 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 134 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x32_SN_SU0_SUM0_TT2_2_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 4 + LSPB: 4 + LVCA: 32 + LVCB: 32 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 135 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x32_SN_SU0_SUM0_TT2_2_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 2 + LSPB: 4 + LVCA: 64 + LVCB: 32 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 16 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 8 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 136 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_SN_SU0_SUM0_TT4_4_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 8 + LSPA: 4 + LSPB: 8 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 512 + LdsNumElementsAlignedA: 128 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 256 + LdsOffsetB: 128 + LdsOffsetB_Blk: 384 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 137 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x8_SN_SU32_SUM3_TT2_2_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 8 + LSPA: 2 + LSPB: 8 + LVCA: 32 + LVCB: 8 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 138 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x8_SN_SU32_SUM3_TT4_4_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 8 + LSPA: 4 + LSPB: 16 + LVCA: 32 + LVCB: 8 + LVPA: 4 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 896 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 139 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x8_SN_SU32_SUM3_TT2_2_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 2 + LSPB: 16 + LVCA: 64 + LVCB: 8 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 140 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x8_SN_SU32_SUM3_TT4_4_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 4 + LSPB: 32 + LVCA: 64 + LVCB: 8 + LVPA: 4 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 141 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SN_SU32_SUM3_TT4_4_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 142 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x16_SN_SU32_SUM3_TT2_2_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 143 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x16_SN_SU32_SUM3_TT2_2_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 2 + LSPB: 8 + LVCA: 64 + LVCB: 16 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 144 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_SN_SU32_SUM3_TT4_4_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 8 + LSPB: 16 + LVCA: 32 + LVCB: 16 + LVPA: 8 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 145 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x16_SN_SU32_SUM3_TT2_2_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 4 + LSPB: 16 + LVCA: 64 + LVCB: 16 + LVPA: 4 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 146 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SN_SU32_SUM3_TT4_4_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 32 + LSPA: 4 + LSPB: 2 + LVCA: 16 + LVCB: 32 + LVPA: 4 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 147 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x32_SN_SU32_SUM3_TT2_2_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 4 + LSPB: 4 + LVCA: 32 + LVCB: 32 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 148 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x32_SN_SU32_SUM3_TT2_2_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 149 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x32_SN_SU32_SUM3_TT2_2_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 8 + LSPA: 4 + LSPB: 8 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 512 + LdsNumElementsAlignedA: 128 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 256 + LdsOffsetB: 128 + LdsOffsetB_Blk: 384 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 150 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x8_SN_SU0_SUM0_TT2_2_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 8 + LSPA: 2 + LSPB: 8 + LVCA: 32 + LVCB: 8 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 151 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x8_SN_SU0_SUM0_TT4_4_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 8 + LSPA: 4 + LSPB: 16 + LVCA: 32 + LVCB: 8 + LVPA: 4 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 896 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 152 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x8_SN_SU0_SUM0_TT2_2_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 2 + LSPB: 16 + LVCA: 64 + LVCB: 8 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 153 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x8_SN_SU0_SUM0_TT4_4_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 8 + LSPA: 8 + LSPB: 32 + LVCA: 32 + LVCB: 8 + LVPA: 8 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 154 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x8_SN_SU0_SUM0_TT2_2_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 4 + LSPB: 32 + LVCA: 64 + LVCB: 8 + LVPA: 4 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 155 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SN_SU0_SUM0_TT4_4_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 156 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x16_SN_SU0_SUM0_TT2_2_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 157 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x16_SN_SU0_SUM0_TT2_2_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 2 + LSPB: 8 + LVCA: 64 + LVCB: 16 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 158 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_SN_SU0_SUM0_TT4_4_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 8 + LSPB: 16 + LVCA: 32 + LVCB: 16 + LVPA: 8 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 159 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x16_SN_SU0_SUM0_TT2_2_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 32 + LSPA: 4 + LSPB: 2 + LVCA: 16 + LVCB: 32 + LVPA: 4 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 160 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x32_SN_SU0_SUM0_TT2_2_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 4 + LSPB: 4 + LVCA: 32 + LVCB: 32 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 161 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x32_SN_SU0_SUM0_TT2_2_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 162 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x32_SN_SU0_SUM0_TT2_2_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 8 + LSPA: 4 + LSPB: 8 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 512 + LdsNumElementsAlignedA: 128 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 256 + LdsOffsetB: 128 + LdsOffsetB_Blk: 384 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 163 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x8_SN_SU32_SUM3_TT2_2_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 8 + LSPA: 2 + LSPB: 8 + LVCA: 32 + LVCB: 8 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 164 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x8_SN_SU32_SUM3_TT4_4_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 8 + LSPA: 4 + LSPB: 16 + LVCA: 32 + LVCB: 8 + LVPA: 4 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 896 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 165 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x8_SN_SU32_SUM3_TT2_2_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 2 + LSPB: 16 + LVCA: 64 + LVCB: 8 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 166 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x8_SN_SU32_SUM3_TT4_4_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 8 + LSPA: 8 + LSPB: 32 + LVCA: 32 + LVCB: 8 + LVPA: 8 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 167 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x8_SN_SU32_SUM3_TT2_2_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 4 + LSPB: 32 + LVCA: 64 + LVCB: 8 + LVPA: 4 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 168 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SN_SU32_SUM3_TT4_4_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 169 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x16_SN_SU32_SUM3_TT2_2_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 2 + LSPB: 4 + LVCA: 32 + LVCB: 16 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 170 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x16_SN_SU32_SUM3_TT4_4_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 171 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x16_SN_SU32_SUM3_TT2_2_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 2 + LSPB: 8 + LVCA: 64 + LVCB: 16 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 172 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_SN_SU32_SUM3_TT4_4_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 8 + LSPB: 16 + LVCA: 32 + LVCB: 16 + LVPA: 8 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 173 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x16_SN_SU32_SUM3_TT2_2_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 32 + LSPA: 4 + LSPB: 2 + LVCA: 16 + LVCB: 32 + LVPA: 4 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 174 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x32_SN_SU32_SUM3_TT2_2_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 4 + LSPB: 4 + LVCA: 32 + LVCB: 32 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 175 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x32_SN_SU32_SUM3_TT2_2_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 8 + LSPA: 4 + LSPB: 8 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 512 + LdsNumElementsAlignedA: 128 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 256 + LdsOffsetB: 128 + LdsOffsetB_Blk: 384 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 176 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x8_SN_SU0_SUM0_TT2_2_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 8 + LSPA: 2 + LSPB: 8 + LVCA: 32 + LVCB: 8 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 177 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x8_SN_SU0_SUM0_TT4_4_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 8 + LSPA: 4 + LSPB: 16 + LVCA: 32 + LVCB: 8 + LVPA: 4 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 896 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 178 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x8_SN_SU0_SUM0_TT2_2_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 2 + LSPB: 16 + LVCA: 64 + LVCB: 8 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 179 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x8_SN_SU0_SUM0_TT4_4_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 4 + LSPB: 32 + LVCA: 64 + LVCB: 8 + LVPA: 4 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 180 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SN_SU0_SUM0_TT4_4_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 181 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x16_SN_SU0_SUM0_TT2_2_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 2 + LSPB: 4 + LVCA: 32 + LVCB: 16 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 182 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x16_SN_SU0_SUM0_TT4_4_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 183 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x16_SN_SU0_SUM0_TT2_2_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 2 + LSPB: 8 + LVCA: 64 + LVCB: 16 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 184 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_SN_SU0_SUM0_TT4_4_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 8 + LSPB: 16 + LVCA: 32 + LVCB: 16 + LVPA: 8 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 185 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x16_SN_SU0_SUM0_TT2_2_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 4 + LSPB: 16 + LVCA: 64 + LVCB: 16 + LVPA: 4 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 186 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SN_SU0_SUM0_TT4_4_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 32 + LSPA: 4 + LSPB: 2 + LVCA: 16 + LVCB: 32 + LVPA: 4 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 187 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x32_SN_SU0_SUM0_TT2_2_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 4 + LSPB: 4 + LVCA: 32 + LVCB: 32 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 188 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x32_SN_SU0_SUM0_TT2_2_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 8 + LSPA: 4 + LSPB: 8 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 512 + LdsNumElementsAlignedA: 128 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 256 + LdsOffsetB: 128 + LdsOffsetB_Blk: 384 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 189 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x8_SN_SU32_SUM3_TT2_2_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 8 + LSPA: 2 + LSPB: 8 + LVCA: 32 + LVCB: 8 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 190 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x8_SN_SU32_SUM3_TT4_4_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 8 + LSPA: 4 + LSPB: 16 + LVCA: 32 + LVCB: 8 + LVPA: 4 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 896 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 191 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x8_SN_SU32_SUM3_TT2_2_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 2 + LSPB: 16 + LVCA: 64 + LVCB: 8 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 192 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x8_SN_SU32_SUM3_TT4_4_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 4 + LSPB: 32 + LVCA: 64 + LVCB: 8 + LVPA: 4 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 193 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SN_SU32_SUM3_TT4_4_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 194 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x16_SN_SU32_SUM3_TT2_2_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 195 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x16_SN_SU32_SUM3_TT2_2_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 2 + LSPB: 8 + LVCA: 64 + LVCB: 16 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 196 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_SN_SU32_SUM3_TT4_4_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 8 + LSPB: 16 + LVCA: 32 + LVCB: 16 + LVPA: 8 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 197 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x16_SN_SU32_SUM3_TT2_2_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 4 + LSPB: 16 + LVCA: 64 + LVCB: 16 + LVPA: 4 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 198 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SN_SU32_SUM3_TT4_4_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 4 + LSPB: 4 + LVCA: 32 + LVCB: 32 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 199 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x32_SN_SU32_SUM3_TT2_2_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 200 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x32_SN_SU32_SUM3_TT2_2_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 8 + LSPA: 2 + LSPB: 8 + LVCA: 32 + LVCB: 8 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 832 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 64 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 201 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x8x8_SN_SU0_SUM0_TT2_2_WG16_4_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 1 + LSPB: 8 + LVCA: 64 + LVCB: 8 + LVPA: 1 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1600 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 64 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 8 + MacroTileA: 64 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 8 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 202 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x8x8_SN_SU0_SUM0_TT4_2_WG16_4_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 2 + LSPB: 16 + LVCA: 64 + LVCB: 8 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1664 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 203 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x8_SN_SU0_SUM0_TT4_2_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 2 + LSPB: 4 + LVCA: 32 + LVCB: 16 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1664 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 204 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x8x16_SN_SU0_SUM0_TT2_2_WG16_4_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 1 + LSPB: 4 + LVCA: 64 + LVCB: 16 + LVPA: 1 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3136 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 64 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 4 + MacroTileA: 64 + MacroTileB: 4 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 16 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 205 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x4x16_SN_SU0_SUM0_TT4_1_WG16_4_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 1] + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 1 + LSPB: 4 + LVCA: 64 + LVCB: 16 + LVPA: 1 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3200 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 8 + MacroTileA: 64 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 16 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 206 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x8x16_SN_SU0_SUM0_TT4_2_WG16_4_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 2 + LSPB: 8 + LVCA: 64 + LVCB: 16 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3328 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 8 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 207 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x16_SN_SU0_SUM0_TT4_2_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 2 + LSPB: 8 + LVCA: 64 + LVCB: 16 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3200 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 8 + MacroTileA: 64 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 208 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x8x16_SN_SU0_SUM0_TT2_2_WG32_4_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 32 + SubGroup1: 4 + SubGroupA: 32 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 1 + LSPB: 8 + LVCA: 128 + LVCB: 16 + LVPA: 1 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 6272 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 8 + MacroTileA: 128 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 16 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 209 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x8x16_SN_SU0_SUM0_TT4_2_WG32_4_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 32 + SubGroup1: 4 + SubGroupA: 32 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 1 + LSPB: 2 + LVCA: 64 + LVCB: 32 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 6272 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 4 + MacroTileA: 64 + MacroTileB: 4 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 32 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 32 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 210 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x4x32_SN_SU0_SUM0_TT4_1_WG16_4_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 1] + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 2 + LSPB: 4 + LVCA: 64 + LVCB: 32 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 6400 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 8 + MacroTileA: 64 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 16 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 211 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x8x32_SN_SU0_SUM0_TT4_1_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 1] + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 2 + LSPB: 4 + LVCA: 64 + LVCB: 32 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 6400 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 8 + MacroTileA: 64 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 16 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 212 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x8x32_SN_SU0_SUM0_TT2_2_WG32_4_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 32 + SubGroup1: 4 + SubGroupA: 32 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 1 + LSPB: 4 + LVCA: 128 + LVCB: 32 + LVPA: 1 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 12416 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 4 + MacroTileA: 128 + MacroTileB: 4 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 32 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 32 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 213 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x4x32_SN_SU0_SUM0_TT4_1_WG32_4_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 32 + SubGroup1: 4 + SubGroupA: 32 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 1] + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 8 + LSPA: 2 + LSPB: 8 + LVCA: 32 + LVCB: 8 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 832 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 64 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 214 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x8x8_SN_SU32_SUM3_TT2_2_WG16_4_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 1 + LSPB: 8 + LVCA: 64 + LVCB: 8 + LVPA: 1 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1600 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 64 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 8 + MacroTileA: 64 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 8 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 215 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x8x8_SN_SU32_SUM3_TT4_2_WG16_4_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 1 + LSPB: 4 + LVCA: 64 + LVCB: 16 + LVPA: 1 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3136 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 64 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 4 + MacroTileA: 64 + MacroTileB: 4 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 16 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 216 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x4x16_SN_SU32_SUM3_TT4_1_WG16_4_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 1] + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 2 + LSPB: 8 + LVCA: 64 + LVCB: 16 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3328 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 8 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 217 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x16_SN_SU32_SUM3_TT4_2_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 1 + LSPB: 8 + LVCA: 128 + LVCB: 16 + LVPA: 1 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 6272 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 8 + MacroTileA: 128 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 16 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 218 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x8x16_SN_SU32_SUM3_TT4_2_WG32_4_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 32 + SubGroup1: 4 + SubGroupA: 32 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 2 + LSPB: 4 + LVCA: 64 + LVCB: 32 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 6656 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 16 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 219 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x32_SN_SU32_SUM3_TT4_2_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 2 + LSPB: 4 + LVCA: 64 + LVCB: 32 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 6400 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 8 + MacroTileA: 64 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 16 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 220 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x8x32_SN_SU32_SUM3_TT2_2_WG32_4_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 32 + SubGroup1: 4 + SubGroupA: 32 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 1 + LSPB: 8 + LVCA: 64 + LVCB: 8 + LVPA: 1 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1600 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 64 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 8 + MacroTileA: 64 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 8 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 221 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x8x8_SN_SU0_SUM0_TT4_2_WG16_4_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 2 + LSPB: 8 + LVCA: 64 + LVCB: 16 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3328 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 8 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 222 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x16_SN_SU0_SUM0_TT4_2_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 1 + LSPB: 8 + LVCA: 128 + LVCB: 16 + LVPA: 1 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 6272 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 8 + MacroTileA: 128 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 16 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 223 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x8x16_SN_SU0_SUM0_TT4_2_WG32_4_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 32 + SubGroup1: 4 + SubGroupA: 32 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 2 + LSPB: 4 + LVCA: 64 + LVCB: 32 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 6400 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 8 + MacroTileA: 64 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 16 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 224 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x8x32_SN_SU0_SUM0_TT2_2_WG32_4_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 32 + SubGroup1: 4 + SubGroupA: 32 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 1 + LSPB: 4 + LVCA: 128 + LVCB: 32 + LVPA: 1 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 12416 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 4 + MacroTileA: 128 + MacroTileB: 4 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 32 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 32 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 225 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x4x32_SN_SU0_SUM0_TT4_1_WG32_4_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 32 + SubGroup1: 4 + SubGroupA: 32 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 1] + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 8 + LSPA: 2 + LSPB: 8 + LVCA: 32 + LVCB: 8 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 832 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 64 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 226 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x8x8_SN_SU32_SUM3_TT2_2_WG16_4_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 1 + LSPB: 4 + LVCA: 64 + LVCB: 16 + LVPA: 1 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3136 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 64 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 4 + MacroTileA: 64 + MacroTileB: 4 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 16 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 227 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x4x16_SN_SU32_SUM3_TT4_1_WG16_4_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 1] + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 2 + LSPB: 8 + LVCA: 64 + LVCB: 16 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3200 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 8 + MacroTileA: 64 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 228 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x8x16_SN_SU32_SUM3_TT4_1_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 1] + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 2 + LSPB: 8 + LVCA: 64 + LVCB: 16 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3328 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 8 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 229 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x16_SN_SU32_SUM3_TT4_2_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 2 + LSPB: 4 + LVCA: 64 + LVCB: 32 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 6400 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 8 + MacroTileA: 64 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 16 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 230 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x8x32_SN_SU32_SUM3_TT4_1_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 1] + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 8 + LSPA: 2 + LSPB: 8 + LVCA: 32 + LVCB: 8 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 832 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 64 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 231 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x8x8_SN_SU0_SUM0_TT2_2_WG16_4_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 2 + LSPB: 16 + LVCA: 64 + LVCB: 8 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1664 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 232 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x8_SN_SU0_SUM0_TT4_2_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 1 + LSPB: 4 + LVCA: 64 + LVCB: 16 + LVPA: 1 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3136 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 64 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 4 + MacroTileA: 64 + MacroTileB: 4 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 16 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 233 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x4x16_SN_SU0_SUM0_TT4_1_WG16_4_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 1] + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 1 + LSPB: 4 + LVCA: 64 + LVCB: 16 + LVPA: 1 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3200 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 8 + MacroTileA: 64 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 16 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 234 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x8x16_SN_SU0_SUM0_TT4_2_WG16_4_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 1 + LSPB: 8 + LVCA: 128 + LVCB: 16 + LVPA: 1 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 6272 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 8 + MacroTileA: 128 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 16 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 235 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x8x16_SN_SU0_SUM0_TT4_2_WG32_4_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 32 + SubGroup1: 4 + SubGroupA: 32 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 1 + LSPB: 2 + LVCA: 64 + LVCB: 32 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 6272 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 4 + MacroTileA: 64 + MacroTileB: 4 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 32 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 32 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 236 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x4x32_SN_SU0_SUM0_TT4_1_WG16_4_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 1] + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 2 + LSPB: 4 + LVCA: 64 + LVCB: 32 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 6400 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 8 + MacroTileA: 64 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 16 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 237 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x8x32_SN_SU0_SUM0_TT4_1_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 1] + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 2 + LSPB: 4 + LVCA: 64 + LVCB: 32 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 6400 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 8 + MacroTileA: 64 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 16 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 238 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x8x32_SN_SU0_SUM0_TT2_2_WG32_4_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 32 + SubGroup1: 4 + SubGroupA: 32 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 1 + LSPB: 8 + LVCA: 64 + LVCB: 8 + LVPA: 1 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1600 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 64 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 8 + MacroTileA: 64 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 8 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 239 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x8x8_SN_SU32_SUM3_TT4_2_WG16_4_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 2 + LSPB: 16 + LVCA: 64 + LVCB: 8 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1664 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 240 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x8_SN_SU32_SUM3_TT4_2_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 1 + LSPB: 4 + LVCA: 64 + LVCB: 16 + LVPA: 1 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3136 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 64 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 4 + MacroTileA: 64 + MacroTileB: 4 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 16 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 241 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x4x16_SN_SU32_SUM3_TT4_1_WG16_4_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 1] + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 1 + LSPB: 8 + LVCA: 128 + LVCB: 16 + LVPA: 1 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 6272 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 8 + MacroTileA: 128 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 16 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 242 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x8x16_SN_SU32_SUM3_TT4_2_WG32_4_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 32 + SubGroup1: 4 + SubGroupA: 32 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 2 + LSPB: 4 + LVCA: 64 + LVCB: 32 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 6400 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 8 + MacroTileA: 64 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 16 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 243 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x8x32_SN_SU32_SUM3_TT4_1_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 1] + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 2 + LSPB: 4 + LVCA: 64 + LVCB: 32 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 6400 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 8 + MacroTileA: 64 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 16 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 244 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x8x32_SN_SU32_SUM3_TT2_2_WG32_4_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 32 + SubGroup1: 4 + SubGroupA: 32 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 832 + LdsNumElementsAlignedA: 64 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 64 + LdsOffsetB_Blk: 576 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 8 + MacroTile1: 32 + MacroTileA: 8 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 245 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT8x32x8_SN_SU0_SUM0_TT1_4_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [1, 4] + ThreadTile0: 1 + ThreadTile1: 4 + ThreadTileA: 1 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 8 + LSPA: 8 + LSPB: 32 + LVCA: 32 + LVCB: 8 + LVPA: 8 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 256 + LdsOffsetB_Blk: 1280 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 246 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x64x8_SN_SU0_SUM0_TT2_4_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 832 + LdsNumElementsAlignedA: 64 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 64 + LdsOffsetB_Blk: 576 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 8 + MacroTile1: 32 + MacroTileA: 8 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 247 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT8x32x8_SN_SU32_SUM3_TT1_4_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [1, 4] + ThreadTile0: 1 + ThreadTile1: 4 + ThreadTileA: 1 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 8 + LSPA: 4 + LSPB: 8 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 896 + LdsNumElementsAlignedA: 128 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 128 + LdsOffsetB_Blk: 640 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 32 + MacroTileA: 16 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 248 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x8_SN_SU32_SUM3_TT2_4_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 8 + LSPA: 4 + LSPB: 16 + LVCA: 32 + LVCB: 8 + LVPA: 4 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 249 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x8_SN_SU32_SUM3_TT2_4_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 8 + LSPA: 8 + LSPB: 32 + LVCA: 32 + LVCB: 8 + LVPA: 8 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 256 + LdsOffsetB_Blk: 1280 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 250 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x64x8_SN_SU32_SUM3_TT2_4_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 832 + LdsNumElementsAlignedA: 64 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 64 + LdsOffsetB_Blk: 576 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 8 + MacroTile1: 32 + MacroTileA: 8 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 251 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT8x32x8_SN_SU0_SUM0_TT1_4_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [1, 4] + ThreadTile0: 1 + ThreadTile1: 4 + ThreadTileA: 1 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 832 + LdsNumElementsAlignedA: 64 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 64 + LdsOffsetB_Blk: 576 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 8 + MacroTile1: 32 + MacroTileA: 8 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 252 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT8x32x8_SN_SU32_SUM3_TT1_4_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [1, 4] + ThreadTile0: 1 + ThreadTile1: 4 + ThreadTileA: 1 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 8 + LSPA: 4 + LSPB: 8 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 896 + LdsNumElementsAlignedA: 128 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 128 + LdsOffsetB_Blk: 640 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 32 + MacroTileA: 16 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 253 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x8_SN_SU32_SUM3_TT2_4_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 8 + LSPA: 4 + LSPB: 16 + LVCA: 32 + LVCB: 8 + LVPA: 4 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 254 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x8_SN_SU32_SUM3_TT2_4_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 8 + LSPA: 8 + LSPB: 32 + LVCA: 32 + LVCB: 8 + LVPA: 8 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 256 + LdsOffsetB_Blk: 1280 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 255 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x64x8_SN_SU32_SUM3_TT2_4_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 8 + LSPA: 8 + LSPB: 32 + LVCA: 32 + LVCB: 8 + LVPA: 8 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 256 + LdsOffsetB_Blk: 1280 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 256 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x64x8_SN_SU0_SUM0_TT2_4_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 16 + LSPA: 8 + LSPB: 4 + LVCA: 8 + LVCB: 16 + LVPA: 8 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1664 + LdsNumElementsAlignedA: 128 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 128 + LdsOffsetB_Blk: 1152 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 8 + MacroTile1: 32 + MacroTileA: 8 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 8 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 257 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT8x32x16_SN_SU0_SUM0_TT1_4_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [1, 4] + ThreadTile0: 1 + ThreadTile1: 4 + ThreadTileA: 1 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 256 + LdsOffsetB_Blk: 1280 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 32 + MacroTileA: 16 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 258 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x16_SN_SU0_SUM0_TT1_4_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [1, 4] + ThreadTile0: 1 + ThreadTile1: 4 + ThreadTileA: 1 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 832 + LdsNumElementsAlignedA: 64 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 64 + LdsOffsetB_Blk: 576 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 8 + MacroTile1: 32 + MacroTileA: 8 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 259 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT8x32x8_SN_SU32_SUM3_TT1_4_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [1, 4] + ThreadTile0: 1 + ThreadTile1: 4 + ThreadTileA: 1 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 8 + LSPA: 4 + LSPB: 8 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 896 + LdsNumElementsAlignedA: 128 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 128 + LdsOffsetB_Blk: 640 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 32 + MacroTileA: 16 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 260 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x8_SN_SU32_SUM3_TT2_4_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 8 + LSPA: 4 + LSPB: 16 + LVCA: 32 + LVCB: 8 + LVPA: 4 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 261 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x8_SN_SU32_SUM3_TT2_4_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 8 + LSPA: 4 + LSPB: 16 + LVCA: 32 + LVCB: 8 + LVPA: 4 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 896 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 262 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x8_SN_SU0_SUM0_TT2_2_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 263 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x16_SN_SU0_SUM0_TT2_2_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 8 + LSPA: 4 + LSPB: 16 + LVCA: 32 + LVCB: 8 + LVPA: 4 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 896 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 264 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x8_SN_SU32_SUM3_TT2_2_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 265 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x16_SN_SU32_SUM3_TT2_2_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 2 + LSPB: 16 + LVCA: 64 + LVCB: 8 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 266 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x8_SN_SU0_SUM0_TT4_4_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 2 + LSPB: 16 + LVCA: 64 + LVCB: 8 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 267 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x8_SN_SU0_SUM0_TT4_4_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 8 + LSPA: 4 + LSPB: 16 + LVCA: 32 + LVCB: 8 + LVPA: 4 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 896 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 268 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x8_SN_SU32_SUM3_TT2_2_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 269 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x16_SN_SU32_SUM3_TT2_2_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 +- [2, 3, 0, 1] +- - - [1024, 4096, 1, 1024, 1024, 1024, 1024, 1024] + - [8, 11750.0] + - - [4096, 4096, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12336.0] + - - [1024, 2048, 1, 1024, 1024, 1024, 1024, 1024] + - [21, 11075.0] + - - [4096, 2048, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12211.0] + - - [768, 4096, 1, 2, 768, 768, 768, 2] + - [30, 316.0] + - - [768, 4096, 1, 768, 768, 768, 768, 768] + - [1, 11507.0] + - - [3072, 4096, 1, 768, 3072, 3072, 3072, 768] + - [1, 12260.0] + - - [768, 2048, 1, 2, 768, 768, 768, 2] + - [15, 342.0] + - - [768, 2048, 1, 768, 768, 768, 768, 768] + - [1, 11006.0] + - - [3072, 2048, 1, 768, 3072, 3072, 3072, 768] + - [1, 11834.0] + - - [3072, 1024, 1, 768, 3072, 3072, 3072, 768] + - [1, 11534.0] + - - [3072, 512, 1, 768, 3072, 3072, 3072, 768] + - [1, 11010.0] + - - [1024, 3072, 1, 1024, 1024, 1024, 1024, 1024] + - [4, 11193.0] + - - [3072, 2048, 1, 1024, 3072, 3072, 3072, 1024] + - [1, 11851.0] + - - [3072, 3072, 1, 1024, 3072, 3072, 3072, 1024] + - [1, 12349.0] + - - [3072, 512, 1, 1024, 3072, 3072, 3072, 1024] + - [1, 10909.0] + - - [3072, 4096, 1, 1024, 3072, 3072, 3072, 1024] + - [1, 12304.0] + - - [1024, 2048, 1, 2, 1024, 1024, 1024, 2] + - [10, 335.0] + - - [1024, 3072, 1, 2, 1024, 1024, 1024, 2] + - [7, 371.0] + - - [1024, 4096, 1, 2, 1024, 1024, 1024, 2] + - [10, 368.0] + - - [128, 128, 512, 64, 128, 128, 128, 64] + - [40, 10261.0] + - - [512, 512, 64, 64, 512, 512, 512, 64] + - [0, 10761.0] + - - [2944, 4288, 1, 1280, 2944, 2944, 2944, 1280] + - [36, 12101.0] + - - [2368, 5888, 1, 256, 2368, 2368, 2368, 256] + - [8, 11697.0] + - - [5888, 1856, 1, 256, 5888, 5888, 5888, 256] + - [15, 11582.0] + - - [512, 24000, 1, 1536, 512, 512, 512, 1536] + - [22, 12318.0] + - - [5888, 1408, 1, 256, 5888, 5888, 5888, 256] + - [1, 11724.0] + - - [5888, 1856, 1, 3328, 5888, 5888, 5888, 3328] + - [1, 12033.0] + - - [1856, 4288, 1, 256, 1856, 1856, 1856, 256] + - [1, 11099.0] + - - [1024, 5056, 1, 128, 1024, 1024, 1024, 128] + - [0, 11030.0] + - - [5056, 5056, 1, 3328, 5056, 5056, 5056, 3328] + - [1, 12332.0] + - - [1408, 5888, 1, 1280, 1408, 1408, 1408, 1280] + - [1, 12146.0] + - - [6144, 6000, 1, 2560, 6144, 6144, 6144, 2560] + - [8, 12547.0] + - - [2368, 6784, 1, 128, 2368, 2368, 2368, 128] + - [8, 11234.0] + - - [1024, 3584, 1, 3328, 1024, 1024, 1024, 3328] + - [1, 11580.0] + - - [512, 48000, 1, 2048, 512, 512, 512, 2048] + - [8, 12513.0] + - - [5888, 1408, 1, 1280, 5888, 5888, 5888, 1280] + - [1, 12201.0] + - - [1408, 4288, 1, 256, 1408, 1408, 1408, 256] + - [0, 11486.0] + - - [1024, 2368, 1, 256, 1024, 1024, 1024, 256] + - [15, 10952.0] + - - [1408, 1856, 1, 1280, 1408, 1408, 1408, 1280] + - [28, 11497.0] + - - [5056, 5056, 1, 1280, 5056, 5056, 5056, 1280] + - [1, 12245.0] + - - [448, 5056, 1, 256, 448, 448, 448, 256] + - [15, 9419.0] + - - [1856, 1408, 1, 128, 1856, 1856, 1856, 128] + - [0, 10051.0] + - - [6784, 256, 1, 3328, 6784, 6784, 6784, 3328] + - [15, 11016.0] + - - [1408, 3584, 1, 256, 1408, 1408, 1408, 256] + - [28, 11350.0] + - - [4288, 448, 1, 256, 4288, 4288, 4288, 256] + - [7, 10622.0] + - - [1024, 1856, 1, 128, 1024, 1024, 1024, 128] + - [0, 10069.0] + - - [4288, 2944, 1, 1280, 4288, 4288, 4288, 1280] + - [1, 12115.0] + - - [704, 5056, 1, 1280, 704, 704, 704, 1280] + - [8, 11023.0] + - - [2368, 704, 1, 3328, 2368, 2368, 2368, 3328] + - [7, 10654.0] + - - [256, 5888, 1, 256, 256, 256, 256, 256] + - [40, 9509.0] + - - [1856, 4288, 1, 3328, 1856, 1856, 1856, 3328] + - [36, 11684.0] + - - [5888, 1024, 1, 256, 5888, 5888, 5888, 256] + - [15, 11391.0] + - - [1408, 2944, 1, 256, 1408, 1408, 1408, 256] + - [1, 11428.0] + - - [6784, 5056, 1, 3328, 6784, 6784, 6784, 3328] + - [8, 12450.0] + - - [5056, 5056, 1, 256, 5056, 5056, 5056, 256] + - [22, 11883.0] + - - [704, 5056, 1, 128, 704, 704, 704, 128] + - [15, 9904.0] + - - [2368, 2944, 1, 1280, 2368, 2368, 2368, 1280] + - [1, 12057.0] + - - [6784, 6784, 1, 1280, 6784, 6784, 6784, 1280] + - [1, 12621.0] + - - [1408, 4288, 1, 1280, 1408, 1408, 1408, 1280] + - [16, 11978.0] + - - [3584, 4288, 1, 1280, 3584, 3584, 3584, 1280] + - [1, 12272.0] + - - [512, 6000, 1, 2560, 512, 512, 512, 2560] + - [35, 11496.0] + - - [2368, 704, 1, 1280, 2368, 2368, 2368, 1280] + - [7, 10456.0] + - - [5056, 4288, 1, 3328, 5056, 5056, 5056, 3328] + - [1, 12297.0] + - - [3584, 2368, 1, 3328, 3584, 3584, 3584, 3328] + - [1, 12042.0] + - - [5888, 6784, 1, 1280, 5888, 5888, 5888, 1280] + - [1, 12608.0] + - - [6784, 448, 1, 1280, 6784, 6784, 6784, 1280] + - [15, 11339.0] + - - [2944, 5888, 1, 256, 2944, 2944, 2944, 256] + - [16, 12253.0] + - - [4288, 2944, 1, 256, 4288, 4288, 4288, 256] + - [36, 11805.0] + - - [5888, 704, 1, 1280, 5888, 5888, 5888, 1280] + - [7, 11411.0] + - - [448, 5888, 1, 128, 448, 448, 448, 128] + - [15, 9135.0] + - - [5056, 2368, 1, 1280, 5056, 5056, 5056, 1280] + - [16, 12093.0] + - - [448, 3584, 1, 1280, 448, 448, 448, 1280] + - [22, 9633.0] + - - [6784, 5888, 1, 256, 6784, 6784, 6784, 256] + - [16, 12420.0] + - - [1024, 1408, 1, 256, 1024, 1024, 1024, 256] + - [15, 10310.0] + - - [2368, 2368, 1, 3328, 2368, 2368, 2368, 3328] + - [1, 11331.0] + - - [1856, 6784, 1, 128, 1856, 1856, 1856, 128] + - [15, 11049.0] + - - [5056, 704, 1, 3328, 5056, 5056, 5056, 3328] + - [15, 11565.0] + - - [1408, 1856, 1, 256, 1408, 1408, 1408, 256] + - [0, 10985.0] + - - [2368, 5056, 1, 256, 2368, 2368, 2368, 256] + - [16, 11591.0] + - - [3584, 2368, 1, 1280, 3584, 3584, 3584, 1280] + - [1, 11988.0] + - - [704, 5888, 1, 256, 704, 704, 704, 256] + - [1, 10367.0] + - - [6784, 2944, 1, 128, 6784, 6784, 6784, 128] + - [22, 11964.0] + - - [2944, 6784, 1, 3328, 2944, 2944, 2944, 3328] + - [1, 12658.0] + - - [3584, 704, 1, 3328, 3584, 3584, 3584, 3328] + - [7, 11211.0] + - - [448, 4288, 1, 256, 448, 448, 448, 256] + - [26, 8631.0] + - - [704, 2368, 1, 1280, 704, 704, 704, 1280] + - [6, 9737.0] + - - [1856, 2368, 1, 1280, 1856, 1856, 1856, 1280] + - [28, 11229.0] + - - [1856, 4288, 1, 1280, 1856, 1856, 1856, 1280] + - [16, 11676.0] + - - [256, 193600, 1, 64, 256, 256, 256, 64] + - [13, 7894.0] + - - [704, 2944, 1, 128, 704, 704, 704, 128] + - [23, 9104.0] + - - [1408, 1024, 1, 1280, 1408, 1408, 1408, 1280] + - [15, 11009.0] + - - [704, 6784, 1, 256, 704, 704, 704, 256] + - [16, 10626.0] + - - [6784, 704, 1, 256, 6784, 6784, 6784, 256] + - [15, 11205.0] + - - [5056, 1408, 1, 128, 5056, 5056, 5056, 128] + - [7, 11137.0] + - - [2048, 7000, 1, 2048, 2048, 2048, 2048, 2048] + - [1, 12376.0] + - - [5056, 704, 1, 256, 5056, 5056, 5056, 256] + - [0, 10830.0] + - - [3584, 4288, 1, 3328, 3584, 3584, 3584, 3328] + - [1, 12335.0] + - - [5888, 1856, 1, 1280, 5888, 5888, 5888, 1280] + - [1, 11962.0] + - - [2368, 3584, 1, 1280, 2368, 2368, 2368, 1280] + - [1, 11981.0] + - - [2944, 3584, 1, 3328, 2944, 2944, 2944, 3328] + - [1, 12294.0] + - - [6784, 2944, 1, 256, 6784, 6784, 6784, 256] + - [16, 12312.0] + - - [1024, 1500, 1, 2560, 1024, 1024, 1024, 2560] + - [8, 10989.0] + - - [1856, 2368, 1, 256, 1856, 1856, 1856, 256] + - [0, 10620.0] + - - [3584, 6784, 1, 3328, 3584, 3584, 3584, 3328] + - [1, 12526.0] + - - [1024, 5888, 1, 3328, 1024, 1024, 1024, 3328] + - [22, 12053.0] + - - [6144, 24000, 1, 2560, 6144, 6144, 6144, 2560] + - [1, 12426.0] + - - [5056, 4288, 1, 1280, 5056, 5056, 5056, 1280] + - [1, 12204.0] + - - [2368, 2368, 1, 1280, 2368, 2368, 2368, 1280] + - [28, 11219.0] + - - [2944, 5888, 1, 128, 2944, 2944, 2944, 128] + - [36, 11969.0] + - - [704, 5888, 1, 1280, 704, 704, 704, 1280] + - [1, 11126.0] + - - [2368, 3584, 1, 128, 2368, 2368, 2368, 128] + - [28, 11044.0] + - - [1856, 5056, 1, 128, 1856, 1856, 1856, 128] + - [0, 11008.0] + - - [2944, 6784, 1, 1280, 2944, 2944, 2944, 1280] + - [1, 12619.0] + - - [1024, 5056, 1, 1280, 1024, 1024, 1024, 1280] + - [1, 12089.0] + - - [4288, 1024, 1, 256, 4288, 4288, 4288, 256] + - [8, 11241.0] + - - [2944, 2368, 1, 128, 2944, 2944, 2944, 128] + - [28, 11267.0] + - - [5888, 448, 1, 1280, 5888, 5888, 5888, 1280] + - [28, 11017.0] + - - [704, 5888, 1, 3328, 704, 704, 704, 3328] + - [27, 11087.0] + - - [3584, 2944, 1, 256, 3584, 3584, 3584, 256] + - [8, 11769.0] + - - [1856, 2368, 1, 3328, 1856, 1856, 1856, 3328] + - [28, 11279.0] + - - [512, 6000, 1, 2816, 512, 512, 512, 2816] + - [35, 11502.0] + - - [512, 24000, 1, 2048, 512, 512, 512, 2048] + - [22, 12306.0] + - - [1408, 5056, 1, 3328, 1408, 1408, 1408, 3328] + - [16, 12368.0] + - - [1856, 1856, 1, 3328, 1856, 1856, 1856, 3328] + - [28, 11166.0] + - - [2368, 2368, 1, 256, 2368, 2368, 2368, 256] + - [16, 10627.0] + - - [4288, 4288, 1, 1280, 4288, 4288, 4288, 1280] + - [36, 12145.0] + - - [5888, 1024, 1, 1280, 5888, 5888, 5888, 1280] + - [1, 12013.0] + - - [1024, 12544, 1, 256, 1024, 1024, 1024, 256] + - [4, 11872.0] + - - [5888, 448, 1, 128, 5888, 5888, 5888, 128] + - [15, 10313.0] + - - [512, 48000, 1, 2560, 512, 512, 512, 2560] + - [22, 12544.0] + - - [704, 6784, 1, 3328, 704, 704, 704, 3328] + - [1, 11260.0] + - - [5888, 5888, 1, 1280, 5888, 5888, 5888, 1280] + - [1, 12646.0] + - - [5056, 1024, 1, 1280, 5056, 5056, 5056, 1280] + - [1, 12198.0] + - - [448, 5888, 1, 3328, 448, 448, 448, 3328] + - [16, 9903.0] + - - [1024, 2944, 1, 1280, 1024, 1024, 1024, 1280] + - [15, 11264.0] + - - [5056, 5888, 1, 1280, 5056, 5056, 5056, 1280] + - [1, 12410.0] + - - [4288, 5888, 1, 128, 4288, 4288, 4288, 128] + - [36, 11722.0] + - - [1408, 3584, 1, 128, 1408, 1408, 1408, 128] + - [28, 11175.0] + - - [448, 3584, 1, 128, 448, 448, 448, 128] + - [28, 8542.0] + - - [5888, 2944, 1, 1280, 5888, 5888, 5888, 1280] + - [16, 12593.0] + - - [2368, 5888, 1, 128, 2368, 2368, 2368, 128] + - [1, 11357.0] + - - [3584, 5888, 1, 256, 3584, 3584, 3584, 256] + - [8, 12234.0] + - - [2368, 1024, 1, 128, 2368, 2368, 2368, 128] + - [7, 9993.0] + - - [2368, 704, 1, 128, 2368, 2368, 2368, 128] + - [7, 9664.0] + - - [3584, 2368, 1, 128, 3584, 3584, 3584, 128] + - [28, 11445.0] + - - [5056, 704, 1, 128, 5056, 5056, 5056, 128] + - [21, 10791.0] + - - [5056, 1408, 1, 3328, 5056, 5056, 5056, 3328] + - [1, 12385.0] + - - [6784, 1024, 1, 3328, 6784, 6784, 6784, 3328] + - [36, 12096.0] + - - [6784, 2944, 1, 3328, 6784, 6784, 6784, 3328] + - [1, 12646.0] + - - [1856, 1856, 1, 256, 1856, 1856, 1856, 256] + - [15, 10804.0] + - - [6784, 2368, 1, 1280, 6784, 6784, 6784, 1280] + - [1, 12157.0] + - - [4288, 3584, 1, 256, 4288, 4288, 4288, 256] + - [8, 11965.0] + - - [4288, 5888, 1, 1280, 4288, 4288, 4288, 1280] + - [1, 12360.0] + - - [1024, 6000, 1, 1536, 1024, 1024, 1024, 1536] + - [22, 12221.0] + - - [4288, 1856, 1, 1280, 4288, 4288, 4288, 1280] + - [16, 11736.0] + - - [1856, 2944, 1, 3328, 1856, 1856, 1856, 3328] + - [1, 11646.0] + - - [256, 6784, 1, 3328, 256, 256, 256, 3328] + - [35, 11086.0] + - - [512, 3000, 1, 1536, 512, 512, 512, 1536] + - [36, 10863.0] + - - [256, 5056, 1, 128, 256, 256, 256, 128] + - [21, 9757.0] + - - [5056, 1024, 1, 256, 5056, 5056, 5056, 256] + - [22, 11680.0] + - - [5056, 1856, 1, 3328, 5056, 5056, 5056, 3328] + - [1, 12038.0] + - - [4288, 1408, 1, 128, 4288, 4288, 4288, 128] + - [15, 10974.0] + - - [1856, 5888, 1, 3328, 1856, 1856, 1856, 3328] + - [1, 12044.0] + - - [4288, 5056, 1, 256, 4288, 4288, 4288, 256] + - [1, 11962.0] + - - [4096, 7000, 1, 4096, 4096, 4096, 4096, 4096] + - [1, 12619.0] + - - [5056, 256, 1, 3328, 5056, 5056, 5056, 3328] + - [1, 11602.0] + - - [1024, 3000, 1, 2560, 1024, 1024, 1024, 2560] + - [22, 11516.0] + - - [1024, 5888, 1, 1280, 1024, 1024, 1024, 1280] + - [1, 12082.0] + - - [6784, 2368, 1, 128, 6784, 6784, 6784, 128] + - [28, 11625.0] + - - [1856, 1024, 1, 1280, 1856, 1856, 1856, 1280] + - [16, 11344.0] + - - [6784, 4288, 1, 1280, 6784, 6784, 6784, 1280] + - [1, 12390.0] + - - [1856, 1856, 1, 1280, 1856, 1856, 1856, 1280] + - [0, 11130.0] + - - [3072, 24000, 1, 1024, 3072, 3072, 3072, 1024] + - [1, 12610.0] + - - [1408, 5056, 1, 1280, 1408, 1408, 1408, 1280] + - [16, 12317.0] + - - [5888, 1856, 1, 128, 5888, 5888, 5888, 128] + - [35, 11503.0] + - - [448, 6784, 1, 128, 448, 448, 448, 128] + - [15, 9293.0] + - - [5056, 3584, 1, 128, 5056, 5056, 5056, 128] + - [22, 11691.0] + - - [5888, 5888, 1, 3328, 5888, 5888, 5888, 3328] + - [1, 12681.0] + - - [6784, 1024, 1, 256, 6784, 6784, 6784, 256] + - [29, 11636.0] + - - [2944, 2368, 1, 256, 2944, 2944, 2944, 256] + - [36, 11611.0] + - - [5056, 5888, 1, 3328, 5056, 5056, 5056, 3328] + - [1, 12541.0] + - - [1856, 1024, 1, 256, 1856, 1856, 1856, 256] + - [15, 10531.0] + - - [512, 48000, 1, 1536, 512, 512, 512, 1536] + - [8, 12550.0] + - - [3584, 448, 1, 1280, 3584, 3584, 3584, 1280] + - [28, 11112.0] + - - [448, 5888, 1, 256, 448, 448, 448, 256] + - [15, 9546.0] + - - [1408, 6784, 1, 3328, 1408, 1408, 1408, 3328] + - [22, 12275.0] + - - [4288, 704, 1, 128, 4288, 4288, 4288, 128] + - [7, 10382.0] + - - [5056, 2944, 1, 256, 5056, 5056, 5056, 256] + - [29, 12065.0] + - - [6784, 5888, 1, 128, 6784, 6784, 6784, 128] + - [1, 12104.0] + - - [2944, 704, 1, 128, 2944, 2944, 2944, 128] + - [15, 10339.0] + - - [1408, 3584, 1, 3328, 1408, 1408, 1408, 3328] + - [1, 12050.0] + - - [2368, 6784, 1, 256, 2368, 2368, 2368, 256] + - [16, 11828.0] + - - [5056, 1408, 1, 1280, 5056, 5056, 5056, 1280] + - [1, 12329.0] + - - [5056, 4288, 1, 128, 5056, 5056, 5056, 128] + - [0, 11551.0] + - - [4288, 2368, 1, 3328, 4288, 4288, 4288, 3328] + - [1, 11882.0] + - - [1408, 1856, 1, 128, 1408, 1408, 1408, 128] + - [15, 10680.0] + - - [1408, 5888, 1, 3328, 1408, 1408, 1408, 3328] + - [1, 12260.0] + - - [6784, 6784, 1, 256, 6784, 6784, 6784, 256] + - [36, 12455.0] + - - [5888, 5056, 1, 128, 5888, 5888, 5888, 128] + - [8, 11816.0] + - - [4288, 2368, 1, 128, 4288, 4288, 4288, 128] + - [0, 11189.0] + - - [2368, 2944, 1, 256, 2368, 2368, 2368, 256] + - [8, 11654.0] + - - [3584, 1856, 1, 1280, 3584, 3584, 3584, 1280] + - [1, 12060.0] + - - [6784, 6784, 1, 128, 6784, 6784, 6784, 128] + - [8, 12121.0] + - - [5888, 5056, 1, 256, 5888, 5888, 5888, 256] + - [8, 12179.0] + - - [8448, 48000, 1, 2816, 8448, 8448, 8448, 2816] + - [16, 12569.0] + - - [512, 6000, 1, 2048, 512, 512, 512, 2048] + - [35, 11529.0] + - - [3584, 448, 1, 256, 3584, 3584, 3584, 256] + - [0, 10654.0] + - - [448, 4288, 1, 128, 448, 448, 448, 128] + - [35, 8877.0] + - - [256, 6784, 1, 256, 256, 256, 256, 256] + - [21, 10550.0] + - - [1408, 4288, 1, 128, 1408, 1408, 1408, 128] + - [28, 11368.0] + - - [2944, 704, 1, 3328, 2944, 2944, 2944, 3328] + - [15, 11322.0] + - - [3584, 3584, 1, 256, 3584, 3584, 3584, 256] + - [16, 12002.0] + - - [3584, 5056, 1, 256, 3584, 3584, 3584, 256] + - [8, 12091.0] + - - [2944, 2368, 1, 1280, 2944, 2944, 2944, 1280] + - [1, 12072.0] + - - [704, 6784, 1, 128, 704, 704, 704, 128] + - [15, 10240.0] + - - [6784, 3584, 1, 256, 6784, 6784, 6784, 256] + - [8, 12295.0] + - - [1856, 1408, 1, 256, 1856, 1856, 1856, 256] + - [0, 10506.0] + - - [5056, 2368, 1, 128, 5056, 5056, 5056, 128] + - [7, 11433.0] + - - [2944, 2944, 1, 3328, 2944, 2944, 2944, 3328] + - [1, 12346.0] + - - [5056, 6784, 1, 256, 5056, 5056, 5056, 256] + - [16, 12203.0] + - - [1856, 3584, 1, 128, 1856, 1856, 1856, 128] + - [28, 10936.0] + - - [3584, 6784, 1, 128, 3584, 3584, 3584, 128] + - [8, 11887.0] + - - [2368, 6784, 1, 1280, 2368, 2368, 2368, 1280] + - [16, 12134.0] + - - [5056, 1856, 1, 256, 5056, 5056, 5056, 256] + - [15, 11502.0] + - - [1024, 3000, 1, 2816, 1024, 1024, 1024, 2816] + - [16, 11537.0] + - - [1024, 1856, 1, 256, 1024, 1024, 1024, 256] + - [15, 10703.0] + - - [1408, 6784, 1, 1280, 1408, 1408, 1408, 1280] + - [1, 12230.0] + - - [3584, 3584, 1, 1280, 3584, 3584, 3584, 1280] + - [1, 12350.0] + - - [7680, 24000, 1, 2560, 7680, 7680, 7680, 2560] + - [1, 12667.0] + - - [4608, 48000, 1, 1536, 4608, 4608, 4608, 1536] + - [1, 12707.0] + - - [5888, 5888, 1, 128, 5888, 5888, 5888, 128] + - [8, 12065.0] + - - [5056, 2368, 1, 3328, 5056, 5056, 5056, 3328] + - [1, 12138.0] + - - [2944, 4288, 1, 256, 2944, 2944, 2944, 256] + - [16, 11783.0] + - - [1408, 3584, 1, 1280, 1408, 1408, 1408, 1280] + - [1, 11982.0] + - - [1024, 1500, 1, 2816, 1024, 1024, 1024, 2816] + - [8, 11110.0] + - - [1024, 6000, 1, 2048, 1024, 1024, 1024, 2048] + - [22, 12264.0] + - - [512, 24000, 1, 2560, 512, 512, 512, 2560] + - [22, 12373.0] + - - [6144, 3000, 1, 2560, 6144, 6144, 6144, 2560] + - [1, 12269.0] + - - [2368, 6784, 1, 3328, 2368, 2368, 2368, 3328] + - [1, 12176.0] + - - [1856, 1408, 1, 1280, 1856, 1856, 1856, 1280] + - [15, 10879.0] + - - [6784, 704, 1, 128, 6784, 6784, 6784, 128] + - [15, 11063.0] + - - [5056, 2944, 1, 128, 5056, 5056, 5056, 128] + - [29, 11666.0] + - - [1408, 5888, 1, 256, 1408, 1408, 1408, 256] + - [1, 11867.0] + - - [704, 2944, 1, 1280, 704, 704, 704, 1280] + - [36, 10774.0] + - - [3584, 704, 1, 1280, 3584, 3584, 3584, 1280] + - [15, 11186.0] + - - [5888, 2368, 1, 256, 5888, 5888, 5888, 256] + - [36, 11799.0] + - - [2944, 6784, 1, 128, 2944, 2944, 2944, 128] + - [8, 12002.0] + - - [3584, 448, 1, 3328, 3584, 3584, 3584, 3328] + - [28, 11254.0] + - - [704, 2368, 1, 3328, 704, 704, 704, 3328] + - [6, 10264.0] + - - [4608, 6000, 1, 1536, 4608, 4608, 4608, 1536] + - [1, 12562.0] + - - [256, 5888, 1, 128, 256, 256, 256, 128] + - [28, 9330.0] + - - [2944, 2944, 1, 1280, 2944, 2944, 2944, 1280] + - [1, 12317.0] + - - [5056, 448, 1, 3328, 5056, 5056, 5056, 3328] + - [7, 11436.0] + - - [6784, 704, 1, 3328, 6784, 6784, 6784, 3328] + - [28, 11499.0] + - - [5888, 4288, 1, 128, 5888, 5888, 5888, 128] + - [8, 11744.0] + - - [1408, 2944, 1, 3328, 1408, 1408, 1408, 3328] + - [1, 12106.0] + - - [3584, 704, 1, 128, 3584, 3584, 3584, 128] + - [15, 10561.0] + - - [448, 5056, 1, 128, 448, 448, 448, 128] + - [15, 8926.0] + - - [5056, 3584, 1, 256, 5056, 5056, 5056, 256] + - [16, 12118.0] + - - [4288, 4288, 1, 256, 4288, 4288, 4288, 256] + - [16, 11890.0] + - - [1408, 5056, 1, 128, 1408, 1408, 1408, 128] + - [0, 11376.0] + - - [2944, 3584, 1, 128, 2944, 2944, 2944, 128] + - [22, 11643.0] + - - [3584, 2368, 1, 256, 3584, 3584, 3584, 256] + - [7, 11628.0] + - - [5888, 5056, 1, 1280, 5888, 5888, 5888, 1280] + - [1, 12493.0] + - - [8448, 24000, 1, 2816, 8448, 8448, 8448, 2816] + - [16, 12656.0] + - - [3584, 3584, 1, 3328, 3584, 3584, 3584, 3328] + - [1, 12399.0] + - - [3072, 1500, 1, 128, 3072, 3072, 3072, 128] + - [15, 10650.0] + - - [2048, 3136, 1, 512, 2048, 2048, 2048, 512] + - [1, 11893.0] + - - [3025, 256, 64, 64, 3025, 3025, 3025, 64] + - [0, 9253.0] + - - [5888, 6784, 1, 256, 5888, 5888, 5888, 256] + - [16, 12389.0] + - - [4288, 2944, 1, 3328, 4288, 4288, 4288, 3328] + - [1, 12177.0] + - - [256, 5056, 1, 1280, 256, 256, 256, 1280] + - [7, 11086.0] + - - [2944, 5888, 1, 3328, 2944, 2944, 2944, 3328] + - [16, 12631.0] + - - [6784, 5888, 1, 1280, 6784, 6784, 6784, 1280] + - [1, 12670.0] + - - [5888, 4288, 1, 1280, 5888, 5888, 5888, 1280] + - [1, 12348.0] + - - [1024, 24000, 1, 2048, 1024, 1024, 1024, 2048] + - [8, 12471.0] + - - [5888, 3584, 1, 128, 5888, 5888, 5888, 128] + - [8, 11880.0] + - - [6784, 6784, 1, 3328, 6784, 6784, 6784, 3328] + - [22, 12642.0] + - - [704, 3584, 1, 128, 704, 704, 704, 128] + - [35, 9460.0] + - - [5888, 448, 1, 3328, 5888, 5888, 5888, 3328] + - [15, 11121.0] + - - [2368, 4288, 1, 1280, 2368, 2368, 2368, 1280] + - [1, 11810.0] + - - [4288, 2944, 1, 128, 4288, 4288, 4288, 128] + - [0, 11365.0] + - - [5056, 2944, 1, 3328, 5056, 5056, 5056, 3328] + - [1, 12492.0] + - - [2944, 3584, 1, 256, 2944, 2944, 2944, 256] + - [1, 11950.0] + - - [1408, 1408, 1, 3328, 1408, 1408, 1408, 3328] + - [15, 10854.0] + - - [3584, 3584, 1, 128, 3584, 3584, 3584, 128] + - [36, 11626.0] + - - [3584, 704, 1, 256, 3584, 3584, 3584, 256] + - [15, 10871.0] + - - [3584, 1408, 1, 3328, 3584, 3584, 3584, 3328] + - [1, 12027.0] + - - [704, 3584, 1, 1280, 704, 704, 704, 1280] + - [15, 10493.0] + - - [1024, 1408, 1, 128, 1024, 1024, 1024, 128] + - [7, 9796.0] + - - [1856, 6784, 1, 256, 1856, 1856, 1856, 256] + - [16, 11728.0] + - - [4288, 448, 1, 3328, 4288, 4288, 4288, 3328] + - [0, 11301.0] + - - [6784, 4288, 1, 128, 6784, 6784, 6784, 128] + - [16, 11772.0] + - - [6784, 704, 1, 1280, 6784, 6784, 6784, 1280] + - [15, 11455.0] + - - [3584, 6784, 1, 256, 3584, 3584, 3584, 256] + - [22, 12241.0] + - - [5888, 1024, 1, 3328, 5888, 5888, 5888, 3328] + - [1, 12136.0] + - - [704, 6784, 1, 1280, 704, 704, 704, 1280] + - [8, 11267.0] + - - [1856, 5056, 1, 3328, 1856, 1856, 1856, 3328] + - [16, 12014.0] + - - [1024, 3584, 1, 128, 1024, 1024, 1024, 128] + - [7, 10784.0] + - - [2368, 2944, 1, 128, 2368, 2368, 2368, 128] + - [8, 11104.0] + - - [5888, 2944, 1, 3328, 5888, 5888, 5888, 3328] + - [1, 12624.0] + - - [1408, 2368, 1, 128, 1408, 1408, 1408, 128] + - [28, 10734.0] + - - [5888, 2368, 1, 128, 5888, 5888, 5888, 128] + - [15, 11598.0] + - - [3584, 6784, 1, 1280, 3584, 3584, 3584, 1280] + - [1, 12526.0] + - - [4288, 1856, 1, 256, 4288, 4288, 4288, 256] + - [7, 11333.0] + - - [1856, 5888, 1, 256, 1856, 1856, 1856, 256] + - [22, 11656.0] + - - [4288, 4288, 1, 3328, 4288, 4288, 4288, 3328] + - [16, 12259.0] + - - [4288, 1408, 1, 1280, 4288, 4288, 4288, 1280] + - [16, 12041.0] + - - [3584, 5056, 1, 128, 3584, 3584, 3584, 128] + - [0, 11693.0] + - - [4288, 2368, 1, 256, 4288, 4288, 4288, 256] + - [35, 11497.0] + - - [2944, 5056, 1, 1280, 2944, 2944, 2944, 1280] + - [16, 12441.0] + - - [448, 6784, 1, 256, 448, 448, 448, 256] + - [15, 9544.0] + - - [1856, 2368, 1, 128, 1856, 1856, 1856, 128] + - [7, 10732.0] + - - [6784, 2368, 1, 3328, 6784, 6784, 6784, 3328] + - [16, 12157.0] + - - [1408, 6784, 1, 128, 1408, 1408, 1408, 128] + - [22, 11427.0] + - - [4288, 1856, 1, 3328, 4288, 4288, 4288, 3328] + - [1, 11773.0] + - - [3584, 448, 1, 128, 3584, 3584, 3584, 128] + - [15, 9900.0] + - - [3584, 1024, 1, 1280, 3584, 3584, 3584, 1280] + - [1, 11534.0] + - - [1856, 5056, 1, 256, 1856, 1856, 1856, 256] + - [16, 11538.0] + - - [6784, 4288, 1, 3328, 6784, 6784, 6784, 3328] + - [22, 12379.0] + - - [1024, 4288, 1, 256, 1024, 1024, 1024, 256] + - [28, 11377.0] + - - [5888, 3584, 1, 3328, 5888, 5888, 5888, 3328] + - [1, 12567.0] + - - [5056, 3584, 1, 3328, 5056, 5056, 5056, 3328] + - [1, 12514.0] + - - [2368, 1408, 1, 1280, 2368, 2368, 2368, 1280] + - [29, 11293.0] + - - [5056, 2944, 1, 1280, 5056, 5056, 5056, 1280] + - [1, 12436.0] + - - [8448, 6000, 1, 2816, 8448, 8448, 8448, 2816] + - [22, 12603.0] + - - [3584, 2944, 1, 1280, 3584, 3584, 3584, 1280] + - [1, 12295.0] + - - [1024, 6784, 1, 256, 1024, 1024, 1024, 256] + - [1, 11662.0] + - - [6784, 448, 1, 256, 6784, 6784, 6784, 256] + - [15, 11090.0] + - - [5124, 9124, 1, 2048, 5124, 5124, 5124, 2048] + - [1, 12232.0] + - - [2944, 5056, 1, 3328, 2944, 2944, 2944, 3328] + - [16, 12477.0] + - - [2944, 1408, 1, 128, 2944, 2944, 2944, 128] + - [28, 10877.0] + - - [5056, 6784, 1, 3328, 5056, 5056, 5056, 3328] + - [1, 12538.0] + - - [704, 2368, 1, 128, 704, 704, 704, 128] + - [15, 8774.0] + - - [3072, 1500, 1, 1024, 3072, 3072, 3072, 1024] + - [8, 11528.0] + - - [3584, 4288, 1, 256, 3584, 3584, 3584, 256] + - [22, 11948.0] + - - [1856, 6784, 1, 3328, 1856, 1856, 1856, 3328] + - [1, 12114.0] + - - [5888, 4288, 1, 256, 5888, 5888, 5888, 256] + - [1, 12063.0] + - - [5056, 1408, 1, 256, 5056, 5056, 5056, 256] + - [8, 11718.0] + - - [3584, 1024, 1, 256, 3584, 3584, 3584, 256] + - [15, 11161.0] + - - [512, 6000, 1, 1536, 512, 512, 512, 1536] + - [7, 11464.0] + - - [5888, 5888, 1, 256, 5888, 5888, 5888, 256] + - [1, 12369.0] + - - [4288, 1024, 1, 1280, 4288, 4288, 4288, 1280] + - [16, 11768.0] + - - [448, 6784, 1, 3328, 448, 448, 448, 3328] + - [36, 10364.0] + - - [2944, 1408, 1, 1280, 2944, 2944, 2944, 1280] + - [1, 12007.0] + - - [3072, 6000, 1, 1024, 3072, 3072, 3072, 1024] + - [1, 12413.0] + - - [2944, 1856, 1, 3328, 2944, 2944, 2944, 3328] + - [1, 11626.0] + - - [3584, 5888, 1, 1280, 3584, 3584, 3584, 1280] + - [1, 12533.0] + - - [6784, 1856, 1, 1280, 6784, 6784, 6784, 1280] + - [1, 12099.0] + - - [2944, 5056, 1, 256, 2944, 2944, 2944, 256] + - [1, 12053.0] + - - [5888, 256, 1, 3328, 5888, 5888, 5888, 3328] + - [1, 10956.0] + - - [2944, 4288, 1, 128, 2944, 2944, 2944, 128] + - [0, 11577.0] + - - [3584, 1408, 1, 256, 3584, 3584, 3584, 256] + - [1, 11504.0] + - - [704, 3584, 1, 3328, 704, 704, 704, 3328] + - [28, 10568.0] + - - [5056, 448, 1, 1280, 5056, 5056, 5056, 1280] + - [0, 11388.0] + - - [3584, 1856, 1, 3328, 3584, 3584, 3584, 3328] + - [1, 12110.0] + - - [4288, 6784, 1, 1280, 4288, 4288, 4288, 1280] + - [1, 12382.0] + - - [1024, 3000, 1, 2048, 1024, 1024, 1024, 2048] + - [21, 11497.0] + - - [2944, 1024, 1, 256, 2944, 2944, 2944, 256] + - [0, 11034.0] + - - [2368, 4288, 1, 3328, 2368, 2368, 2368, 3328] + - [1, 11863.0] + - - [1024, 1408, 1, 1280, 1024, 1024, 1024, 1280] + - [28, 10967.0] + - - [6784, 5056, 1, 256, 6784, 6784, 6784, 256] + - [1, 12247.0] + - - [1856, 1856, 1, 128, 1856, 1856, 1856, 128] + - [0, 10404.0] + - - [4288, 5888, 1, 256, 4288, 4288, 4288, 256] + - [16, 12091.0] + - - [2944, 6784, 1, 256, 2944, 2944, 2944, 256] + - [29, 12331.0] + - - [2944, 2944, 1, 128, 2944, 2944, 2944, 128] + - [28, 11458.0] + - - [1856, 3584, 1, 1280, 1856, 1856, 1856, 1280] + - [16, 12063.0] + - - [3584, 1408, 1, 1280, 3584, 3584, 3584, 1280] + - [1, 11980.0] + - - [4288, 448, 1, 128, 4288, 4288, 4288, 128] + - [0, 9796.0] + - - [5056, 256, 1, 1280, 5056, 5056, 5056, 1280] + - [1, 11454.0] + - - [1856, 1408, 1, 3328, 1856, 1856, 1856, 3328] + - [28, 10972.0] + - - [1024, 4288, 1, 3328, 1024, 1024, 1024, 3328] + - [22, 11893.0] + - - [5056, 448, 1, 256, 5056, 5056, 5056, 256] + - [7, 10875.0] + - - [2944, 2368, 1, 3328, 2944, 2944, 2944, 3328] + - [16, 12136.0] + - - [704, 4288, 1, 3328, 704, 704, 704, 3328] + - [22, 10381.0] + - - [1024, 1856, 1, 1280, 1024, 1024, 1024, 1280] + - [1, 11323.0] + - - [6784, 1856, 1, 256, 6784, 6784, 6784, 256] + - [16, 11734.0] + - - [512, 48000, 1, 2816, 512, 512, 512, 2816] + - [8, 12589.0] + - - [512, 3000, 1, 2816, 512, 512, 512, 2816] + - [36, 11045.0] + - - [1024, 5888, 1, 256, 1024, 1024, 1024, 256] + - [1, 11600.0] + - - [6784, 1408, 1, 256, 6784, 6784, 6784, 256] + - [1, 11943.0] + - - [1408, 2368, 1, 256, 1408, 1408, 1408, 256] + - [15, 11085.0] + - - [1408, 1408, 1, 256, 1408, 1408, 1408, 256] + - [28, 10421.0] + - - [2368, 2368, 1, 128, 2368, 2368, 2368, 128] + - [15, 10859.0] + - - [6784, 1408, 1, 128, 6784, 6784, 6784, 128] + - [36, 11545.0] + - - [1408, 5056, 1, 256, 1408, 1408, 1408, 256] + - [16, 11802.0] + - - [512, 50176, 1, 128, 512, 512, 512, 128] + - [0, 11708.0] + - - [4288, 3584, 1, 128, 4288, 4288, 4288, 128] + - [8, 11574.0] + - - [3584, 5056, 1, 1280, 3584, 3584, 3584, 1280] + - [16, 12451.0] + - - [1856, 1024, 1, 128, 1856, 1856, 1856, 128] + - [7, 9444.0] + - - [1024, 24000, 1, 1536, 1024, 1024, 1024, 1536] + - [8, 12478.0] + - - [704, 4288, 1, 256, 704, 704, 704, 256] + - [15, 9933.0] + - - [5888, 2368, 1, 1280, 5888, 5888, 5888, 1280] + - [1, 12186.0] + - - [6784, 1856, 1, 3328, 6784, 6784, 6784, 3328] + - [1, 12126.0] + - - [2368, 5888, 1, 1280, 2368, 2368, 2368, 1280] + - [1, 12186.0] + - - [5888, 256, 1, 1280, 5888, 5888, 5888, 1280] + - [1, 10804.0] + - - [2368, 1856, 1, 3328, 2368, 2368, 2368, 3328] + - [15, 11299.0] + - - [2944, 704, 1, 256, 2944, 2944, 2944, 256] + - [0, 10855.0] + - - [2368, 1024, 1, 3328, 2368, 2368, 2368, 3328] + - [20, 11339.0] + - - [704, 3584, 1, 256, 704, 704, 704, 256] + - [15, 10137.0] + - - [704, 2944, 1, 3328, 704, 704, 704, 3328] + - [22, 10964.0] + - - [6784, 1024, 1, 128, 6784, 6784, 6784, 128] + - [28, 11333.0] + - - [2944, 1024, 1, 3328, 2944, 2944, 2944, 3328] + - [1, 11406.0] + - - [2944, 5056, 1, 128, 2944, 2944, 2944, 128] + - [35, 11622.0] + - - [1408, 6784, 1, 256, 1408, 1408, 1408, 256] + - [1, 11922.0] + - - [6784, 1408, 1, 3328, 6784, 6784, 6784, 3328] + - [1, 12231.0] + - - [4288, 6784, 1, 128, 4288, 4288, 4288, 128] + - [16, 11812.0] + - - [1408, 2944, 1, 128, 1408, 1408, 1408, 128] + - [15, 10873.0] + - - [6784, 2944, 1, 1280, 6784, 6784, 6784, 1280] + - [1, 12626.0] + - - [4288, 1856, 1, 128, 4288, 4288, 4288, 128] + - [7, 11104.0] + - - [1856, 2944, 1, 128, 1856, 1856, 1856, 128] + - [0, 10827.0] + - - [6784, 448, 1, 128, 6784, 6784, 6784, 128] + - [28, 10836.0] + - - [448, 5056, 1, 1280, 448, 448, 448, 1280] + - [22, 10381.0] + - - [4288, 5056, 1, 1280, 4288, 4288, 4288, 1280] + - [16, 12294.0] + - - [2368, 1856, 1, 128, 2368, 2368, 2368, 128] + - [15, 10675.0] + - - [4288, 704, 1, 256, 4288, 4288, 4288, 256] + - [7, 10934.0] + - - [5888, 704, 1, 256, 5888, 5888, 5888, 256] + - [15, 11208.0] + - - [3584, 1024, 1, 128, 3584, 3584, 3584, 128] + - [0, 10925.0] + - - [256, 5888, 1, 3328, 256, 256, 256, 3328] + - [4, 10792.0] + - - [1408, 4288, 1, 3328, 1408, 1408, 1408, 3328] + - [16, 12133.0] + - - [6784, 4288, 1, 256, 6784, 6784, 6784, 256] + - [8, 12122.0] + - - [5888, 256, 1, 256, 5888, 5888, 5888, 256] + - [28, 10038.0] + - - [6784, 1024, 1, 1280, 6784, 6784, 6784, 1280] + - [1, 12092.0] + - - [5888, 1024, 1, 128, 5888, 5888, 5888, 128] + - [15, 11353.0] + - - [6784, 3584, 1, 1280, 6784, 6784, 6784, 1280] + - [1, 12538.0] + - - [1024, 6784, 1, 1280, 1024, 1024, 1024, 1280] + - [1, 12032.0] + - - [1408, 2944, 1, 1280, 1408, 1408, 1408, 1280] + - [16, 12017.0] + - - [1408, 2368, 1, 3328, 1408, 1408, 1408, 3328] + - [16, 11468.0] + - - [2944, 1856, 1, 128, 2944, 2944, 2944, 128] + - [0, 11180.0] + - - [256, 6784, 1, 128, 256, 256, 256, 128] + - [15, 10050.0] + - - [5056, 6784, 1, 128, 5056, 5056, 5056, 128] + - [16, 11786.0] + - - [4288, 5056, 1, 128, 4288, 4288, 4288, 128] + - [8, 11585.0] + - - [1856, 5888, 1, 128, 1856, 1856, 1856, 128] + - [29, 11117.0] + - - [3584, 1856, 1, 256, 3584, 3584, 3584, 256] + - [0, 11504.0] + - - [4288, 3584, 1, 1280, 4288, 4288, 4288, 1280] + - [1, 12318.0] + - - [704, 5888, 1, 128, 704, 704, 704, 128] + - [15, 10007.0] + - - [6784, 3584, 1, 128, 6784, 6784, 6784, 128] + - [29, 11982.0] + - - [5124, 1500, 1, 2048, 5124, 5124, 5124, 2048] + - [1, 11758.0] + - - [4288, 5056, 1, 3328, 4288, 4288, 4288, 3328] + - [1, 12330.0] + - - [1408, 1408, 1, 128, 1408, 1408, 1408, 128] + - [28, 9943.0] + - - [5056, 2368, 1, 256, 5056, 5056, 5056, 256] + - [1, 11667.0] + - - [4288, 704, 1, 3328, 4288, 4288, 4288, 3328] + - [28, 11353.0] + - - [448, 3584, 1, 256, 448, 448, 448, 256] + - [15, 8983.0] + - - [2368, 1024, 1, 1280, 2368, 2368, 2368, 1280] + - [1, 11226.0] + - - [2944, 1408, 1, 3328, 2944, 2944, 2944, 3328] + - [1, 12130.0] + - - [6144, 1500, 1, 2560, 6144, 6144, 6144, 2560] + - [1, 12183.0] + - - [1024, 1408, 1, 3328, 1024, 1024, 1024, 3328] + - [28, 11131.0] + - - [2944, 5888, 1, 1280, 2944, 2944, 2944, 1280] + - [1, 12592.0] + - - [5888, 3584, 1, 256, 5888, 5888, 5888, 256] + - [1, 12226.0] + - - [2368, 5056, 1, 128, 2368, 2368, 2368, 128] + - [28, 11207.0] + - - [1408, 1856, 1, 3328, 1408, 1408, 1408, 3328] + - [28, 11632.0] + - - [5888, 5056, 1, 3328, 5888, 5888, 5888, 3328] + - [1, 12525.0] + - - [7680, 6000, 1, 2560, 7680, 7680, 7680, 2560] + - [1, 12648.0] + - - [6784, 1408, 1, 1280, 6784, 6784, 6784, 1280] + - [1, 12240.0] + - - [512, 3000, 1, 2560, 512, 512, 512, 2560] + - [36, 11047.0] + - - [704, 2944, 1, 256, 704, 704, 704, 256] + - [15, 10000.0] + - - [6784, 5888, 1, 3328, 6784, 6784, 6784, 3328] + - [16, 12666.0] + - - [2368, 4288, 1, 128, 2368, 2368, 2368, 128] + - [15, 11154.0] + - - [1024, 6784, 1, 128, 1024, 1024, 1024, 128] + - [0, 11307.0] + - - [1024, 1500, 1, 1536, 1024, 1024, 1024, 1536] + - [8, 10956.0] + - - [1408, 1408, 1, 1280, 1408, 1408, 1408, 1280] + - [35, 10760.0] + - - [3072, 3000, 1, 1024, 3072, 3072, 3072, 1024] + - [1, 12095.0] + - - [448, 4288, 1, 3328, 448, 448, 448, 3328] + - [16, 10178.0] + - - [2368, 1408, 1, 256, 2368, 2368, 2368, 256] + - [0, 10979.0] + - - [704, 2368, 1, 256, 704, 704, 704, 256] + - [15, 9446.0] + - - [1024, 24000, 1, 2560, 1024, 1024, 1024, 2560] + - [8, 12494.0] + - - [5888, 2368, 1, 3328, 5888, 5888, 5888, 3328] + - [1, 12233.0] + - - [5124, 9124, 1, 1760, 5124, 5124, 5124, 1760] + - [1, 12251.0] + - - [4288, 448, 1, 1280, 4288, 4288, 4288, 1280] + - [0, 11177.0] + - - [5888, 704, 1, 3328, 5888, 5888, 5888, 3328] + - [15, 11470.0] + - - [5056, 256, 1, 128, 5056, 5056, 5056, 128] + - [7, 9184.0] + - - [1024, 6784, 1, 3328, 1024, 1024, 1024, 3328] + - [8, 12093.0] + - - [1408, 5888, 1, 128, 1408, 1408, 1408, 128] + - [15, 11391.0] + - - [512, 3136, 1, 2048, 512, 512, 512, 2048] + - [8, 11389.0] + - - [1408, 1024, 1, 256, 1408, 1408, 1408, 256] + - [0, 10540.0] + - - [8448, 1500, 1, 2816, 8448, 8448, 8448, 2816] + - [16, 12184.0] + - - [2560, 7000, 1, 2560, 2560, 2560, 2560, 2560] + - [22, 12550.0] + - - [5056, 6784, 1, 1280, 5056, 5056, 5056, 1280] + - [16, 12506.0] + - - [704, 5056, 1, 3328, 704, 704, 704, 3328] + - [22, 11171.0] + - - [3584, 5056, 1, 3328, 3584, 3584, 3584, 3328] + - [1, 12498.0] + - - [2368, 2944, 1, 3328, 2368, 2368, 2368, 3328] + - [1, 12134.0] + - - [2368, 3584, 1, 256, 2368, 2368, 2368, 256] + - [1, 11583.0] + - - [4608, 3000, 1, 1536, 4608, 4608, 4608, 1536] + - [1, 12103.0] + - - [5056, 3584, 1, 1280, 5056, 5056, 5056, 1280] + - [16, 12444.0] + - - [5124, 9124, 1, 4096, 5124, 5124, 5124, 4096] + - [1, 12225.0] + - - [7680, 48000, 1, 2560, 7680, 7680, 7680, 2560] + - [1, 12684.0] + - - [1856, 2944, 1, 1280, 1856, 1856, 1856, 1280] + - [16, 11529.0] + - - [4608, 1500, 1, 1536, 4608, 4608, 4608, 1536] + - [1, 11997.0] + - - [1024, 48000, 1, 2816, 1024, 1024, 1024, 2816] + - [16, 12659.0] + - - [5124, 9124, 1, 2560, 5124, 5124, 5124, 2560] + - [1, 12234.0] + - - [2944, 1408, 1, 256, 2944, 2944, 2944, 256] + - [1, 11418.0] + - - [4288, 1408, 1, 3328, 4288, 4288, 4288, 3328] + - [1, 12148.0] + - - [5888, 2944, 1, 128, 5888, 5888, 5888, 128] + - [8, 11861.0] + - - [2944, 1024, 1, 128, 2944, 2944, 2944, 128] + - [28, 10660.0] + - - [5124, 700, 1, 2048, 5124, 5124, 5124, 2048] + - [15, 11114.0] + - - [6784, 5056, 1, 128, 6784, 6784, 6784, 128] + - [36, 11859.0] + - - [256, 12544, 1, 1024, 256, 256, 256, 1024] + - [22, 11770.0] + - - [5888, 1408, 1, 3328, 5888, 5888, 5888, 3328] + - [1, 12255.0] + - - [2368, 1856, 1, 256, 2368, 2368, 2368, 256] + - [15, 10930.0] + - - [256, 5056, 1, 256, 256, 256, 256, 256] + - [21, 10440.0] + - - [5056, 5056, 1, 128, 5056, 5056, 5056, 128] + - [36, 11572.0] + - - [448, 3584, 1, 3328, 448, 448, 448, 3328] + - [20, 9873.0] + - - [5888, 256, 1, 128, 5888, 5888, 5888, 128] + - [15, 9657.0] + - - [3584, 1856, 1, 128, 3584, 3584, 3584, 128] + - [28, 11298.0] + - - [4288, 4288, 1, 128, 4288, 4288, 4288, 128] + - [36, 11506.0] + - - [1856, 1024, 1, 3328, 1856, 1856, 1856, 3328] + - [6, 11642.0] + - - [1856, 4288, 1, 128, 1856, 1856, 1856, 128] + - [28, 10921.0] + - - [1024, 6000, 1, 2560, 1024, 1024, 1024, 2560] + - [22, 12236.0] + - - [1024, 5056, 1, 256, 1024, 1024, 1024, 256] + - [16, 11361.0] + - - [5056, 5888, 1, 128, 5056, 5056, 5056, 128] + - [1, 11672.0] + - - [2368, 1408, 1, 3328, 2368, 2368, 2368, 3328] + - [8, 11451.0] + - - [1024, 48000, 1, 1536, 1024, 1024, 1024, 1536] + - [8, 12618.0] + - - [5888, 448, 1, 256, 5888, 5888, 5888, 256] + - [28, 10638.0] + - - [5888, 6784, 1, 128, 5888, 5888, 5888, 128] + - [8, 12088.0] + - - [6784, 5056, 1, 1280, 6784, 6784, 6784, 1280] + - [1, 12506.0] + - - [5056, 704, 1, 1280, 5056, 5056, 5056, 1280] + - [0, 11465.0] + - - [1024, 48000, 1, 2560, 1024, 1024, 1024, 2560] + - [8, 12649.0] + - - [1024, 2368, 1, 128, 1024, 1024, 1024, 128] + - [15, 7713.0] + - - [3072, 48000, 1, 1024, 3072, 3072, 3072, 1024] + - [11, 12515.0] + - - [1024, 5888, 1, 128, 1024, 1024, 1024, 128] + - [7, 10959.0] + - - [3584, 5888, 1, 128, 3584, 3584, 3584, 128] + - [39, 11827.0] + - - [5056, 5888, 1, 256, 5056, 5056, 5056, 256] + - [29, 12179.0] + - - [2368, 1024, 1, 256, 2368, 2368, 2368, 256] + - [0, 10332.0] + - - [2944, 1856, 1, 256, 2944, 2944, 2944, 256] + - [28, 11380.0] + - - [1856, 6784, 1, 1280, 1856, 1856, 1856, 1280] + - [16, 12073.0] + - - [8448, 3000, 1, 2816, 8448, 8448, 8448, 2816] + - [1, 12244.0] + - - [6784, 448, 1, 3328, 6784, 6784, 6784, 3328] + - [15, 11408.0] + - - [5056, 1856, 1, 1280, 5056, 5056, 5056, 1280] + - [1, 11960.0] + - - [1408, 1024, 1, 3328, 1408, 1408, 1408, 3328] + - [15, 11070.0] + - - [7680, 1500, 1, 2560, 7680, 7680, 7680, 2560] + - [1, 12218.0] + - - [5888, 3584, 1, 1280, 5888, 5888, 5888, 1280] + - [1, 12490.0] + - - [1856, 3584, 1, 3328, 1856, 1856, 1856, 3328] + - [16, 12042.0] + - - [1024, 2944, 1, 256, 1024, 1024, 1024, 256] + - [7, 10815.0] + - - [448, 6784, 1, 1280, 448, 448, 448, 1280] + - [22, 10285.0] + - - [704, 5056, 1, 256, 704, 704, 704, 256] + - [0, 10292.0] + - - [3584, 1024, 1, 3328, 3584, 3584, 3584, 3328] + - [1, 11589.0] + - - [2944, 1856, 1, 1280, 2944, 2944, 2944, 1280] + - [28, 11557.0] + - - [5056, 256, 1, 256, 5056, 5056, 5056, 256] + - [5, 10342.0] + - - [2944, 4288, 1, 3328, 2944, 2944, 2944, 3328] + - [8, 12140.0] + - - [2368, 3584, 1, 3328, 2368, 2368, 2368, 3328] + - [22, 12005.0] + - - [2944, 704, 1, 1280, 2944, 2944, 2944, 1280] + - [28, 11145.0] + - - [2944, 3584, 1, 1280, 2944, 2944, 2944, 1280] + - [1, 12246.0] + - - [1856, 5888, 1, 1280, 1856, 1856, 1856, 1280] + - [16, 11946.0] + - - [4608, 24000, 1, 1536, 4608, 4608, 4608, 1536] + - [1, 12618.0] + - - [4288, 1408, 1, 256, 4288, 4288, 4288, 256] + - [7, 11145.0] + - - [5888, 1408, 1, 128, 5888, 5888, 5888, 128] + - [28, 11362.0] + - - [4288, 2368, 1, 1280, 4288, 4288, 4288, 1280] + - [1, 11813.0] + - - [6784, 2368, 1, 256, 6784, 6784, 6784, 256] + - [1, 11844.0] + - - [1024, 24000, 1, 2816, 1024, 1024, 1024, 2816] + - [8, 12488.0] + - - [1856, 2944, 1, 256, 1856, 1856, 1856, 256] + - [1, 11007.0] + - - [5056, 1024, 1, 128, 5056, 5056, 5056, 128] + - [8, 11172.0] + - - [7680, 3000, 1, 2560, 7680, 7680, 7680, 2560] + - [1, 12350.0] + - - [4224, 1500, 1, 176, 4224, 4224, 4224, 176] + - [28, 11085.0] + - - [5124, 700, 1, 2560, 5124, 5124, 5124, 2560] + - [15, 11143.0] + - - [6784, 256, 1, 128, 6784, 6784, 6784, 128] + - [28, 9915.0] + - - [5888, 704, 1, 128, 5888, 5888, 5888, 128] + - [7, 10841.0] + - - [1024, 4288, 1, 1280, 1024, 1024, 1024, 1280] + - [1, 11680.0] + - - [2368, 5056, 1, 3328, 2368, 2368, 2368, 3328] + - [16, 12032.0] + - - [4288, 1024, 1, 3328, 4288, 4288, 4288, 3328] + - [1, 11886.0] + - - [6144, 48000, 1, 2560, 6144, 6144, 6144, 2560] + - [1, 12551.0] + - - [1024, 5056, 1, 3328, 1024, 1024, 1024, 3328] + - [22, 12181.0] + - - [1024, 1856, 1, 3328, 1024, 1024, 1024, 3328] + - [1, 11504.0] + - - [5124, 1500, 1, 2560, 5124, 5124, 5124, 2560] + - [1, 11731.0] + - - [4288, 6784, 1, 256, 4288, 4288, 4288, 256] + - [36, 12121.0] + - - [3584, 2944, 1, 3328, 3584, 3584, 3584, 3328] + - [1, 12335.0] + - - [5888, 2944, 1, 256, 5888, 5888, 5888, 256] + - [16, 12242.0] + - - [448, 4288, 1, 1280, 448, 448, 448, 1280] + - [16, 9979.0] + - - [1024, 4288, 1, 128, 1024, 1024, 1024, 128] + - [7, 10918.0] + - - [5056, 4288, 1, 256, 5056, 5056, 5056, 256] + - [16, 11871.0] + - - [1024, 3584, 1, 256, 1024, 1024, 1024, 256] + - [15, 11129.0] + - - [448, 5888, 1, 1280, 448, 448, 448, 1280] + - [7, 9859.0] + - - [512, 3000, 1, 2048, 512, 512, 512, 2048] + - [8, 10998.0] + - - [5056, 448, 1, 128, 5056, 5056, 5056, 128] + - [35, 8899.0] + - - [4288, 704, 1, 1280, 4288, 4288, 4288, 1280] + - [28, 11272.0] + - - [3584, 2944, 1, 128, 3584, 3584, 3584, 128] + - [28, 11361.0] + - - [6784, 256, 1, 1280, 6784, 6784, 6784, 1280] + - [28, 11025.0] + - - [2368, 5888, 1, 3328, 2368, 2368, 2368, 3328] + - [1, 12156.0] + - - [2368, 1856, 1, 1280, 2368, 2368, 2368, 1280] + - [28, 11229.0] + - - [448, 5056, 1, 3328, 448, 448, 448, 3328] + - [6, 10514.0] + - - [3584, 4288, 1, 128, 3584, 3584, 3584, 128] + - [15, 11528.0] + - - [1024, 6000, 1, 2816, 1024, 1024, 1024, 2816] + - [19, 12141.0] + - - [5888, 4288, 1, 3328, 5888, 5888, 5888, 3328] + - [8, 12346.0] + - - [2368, 704, 1, 256, 2368, 2368, 2368, 256] + - [0, 9861.0] + - - [3584, 1408, 1, 128, 3584, 3584, 3584, 128] + - [7, 11087.0] + - - [1856, 5056, 1, 1280, 1856, 1856, 1856, 1280] + - [16, 11948.0] + - - [2944, 1024, 1, 1280, 2944, 2944, 2944, 1280] + - [28, 11270.0] + - - [3584, 5888, 1, 3328, 3584, 3584, 3584, 3328] + - [1, 12511.0] + - - [2368, 4288, 1, 256, 2368, 2368, 2368, 256] + - [28, 11305.0] + - - [1024, 2368, 1, 3328, 1024, 1024, 1024, 3328] + - [7, 11447.0] + - - [1024, 2944, 1, 128, 1024, 1024, 1024, 128] + - [28, 9754.0] + - - [1024, 3584, 1, 1280, 1024, 1024, 1024, 1280] + - [15, 11233.0] + - - [4288, 5888, 1, 3328, 4288, 4288, 4288, 3328] + - [1, 12364.0] + - - [1024, 2944, 1, 3328, 1024, 1024, 1024, 3328] + - [1, 11368.0] + - - [256, 6784, 1, 1280, 256, 256, 256, 1280] + - [7, 10745.0] + - - [1856, 3584, 1, 256, 1856, 1856, 1856, 256] + - [29, 11354.0] + - - [6784, 1856, 1, 128, 6784, 6784, 6784, 128] + - [28, 11570.0] + - - [1024, 1500, 1, 2048, 1024, 1024, 1024, 2048] + - [22, 10920.0] + - - [512, 24000, 1, 2816, 512, 512, 512, 2816] + - [36, 12362.0] + - - [256, 5888, 1, 1280, 256, 256, 256, 1280] + - [22, 10728.0] + - - [4288, 6784, 1, 3328, 4288, 4288, 4288, 3328] + - [1, 12380.0] + - - [2368, 1408, 1, 128, 2368, 2368, 2368, 128] + - [21, 10065.0] + - - [1408, 1024, 1, 128, 1408, 1408, 1408, 128] + - [21, 9642.0] + - - [6784, 3584, 1, 3328, 6784, 6784, 6784, 3328] + - [1, 12534.0] + - - [1760, 7000, 1, 1760, 1760, 1760, 1760, 1760] + - [16, 12164.0] + - - [2368, 5056, 1, 1280, 2368, 2368, 2368, 1280] + - [16, 12079.0] + - - [1408, 2368, 1, 1280, 1408, 1408, 1408, 1280] + - [28, 11318.0] + - - [704, 4288, 1, 128, 704, 704, 704, 128] + - [28, 9675.0] + - - [2944, 2944, 1, 256, 2944, 2944, 2944, 256] + - [16, 11854.0] + - - [6784, 256, 1, 256, 6784, 6784, 6784, 256] + - [28, 10591.0] + - - [256, 5056, 1, 3328, 256, 256, 256, 3328] + - [12, 11336.0] + - - [5056, 1856, 1, 128, 5056, 5056, 5056, 128] + - [15, 10356.0] + - - [1024, 3000, 1, 1536, 1024, 1024, 1024, 1536] + - [7, 11447.0] + - - [5056, 1024, 1, 3328, 5056, 5056, 5056, 3328] + - [8, 12226.0] + - - [4288, 3584, 1, 3328, 4288, 4288, 4288, 3328] + - [1, 12332.0] + - - [1024, 2368, 1, 1280, 1024, 1024, 1024, 1280] + - [15, 11349.0] + - - [5888, 6784, 1, 3328, 5888, 5888, 5888, 3328] + - [1, 12619.0] + - - [704, 4288, 1, 1280, 704, 704, 704, 1280] + - [7, 10244.0] + - - [128, 50176, 1, 512, 128, 128, 128, 512] + - [11, 11636.0] + - - [1024, 48000, 1, 2048, 1024, 1024, 1024, 2048] + - [4, 12555.0] + - - [4288, 1024, 1, 128, 4288, 4288, 4288, 128] + - [7, 10313.0] + - - [784, 128, 128, 512, 784, 784, 784, 512] + - [32, 10578.0] + - - [784, 512, 256, 128, 784, 784, 784, 128] + - [4, 10486.0] + - - [3136, 256, 256, 64, 3136, 3136, 3136, 64] + - [0, 7705.0] + - - [784, 512, 128, 128, 784, 784, 784, 128] + - [4, 10339.0] + - - [784, 128, 256, 512, 784, 784, 784, 512] + - [32, 10741.0] + - - [3136, 256, 128, 64, 3136, 3136, 3136, 64] + - [0, 8591.0] + - - [4096, 512, 1, 1024, 4096, 4096, 4096, 1024] + - [15, 11249.0] + - - [2048, 768, 1, 512, 2048, 2048, 2048, 512] + - [8, 10939.0] + - - [4096, 512, 1, 2048, 4096, 4096, 4096, 2048] + - [15, 11324.0] + - - [4096, 1024, 1, 2048, 4096, 4096, 4096, 2048] + - [8, 11976.0] + - - [2048, 1024, 1, 2048, 2048, 2048, 2048, 2048] + - [15, 11133.0] + - - [2048, 1024, 1, 4096, 2048, 2048, 2048, 4096] + - [7, 11270.0] + - - [4096, 1024, 1, 1024, 4096, 4096, 4096, 1024] + - [25, 11830.0] + - - [2048, 1024, 1, 512, 2048, 2048, 2048, 512] + - [0, 11138.0] + - - [4096, 1024, 1, 4096, 4096, 4096, 4096, 4096] + - [1, 12184.0] + - - [2048, 1024, 1, 1024, 2048, 2048, 2048, 1024] + - [15, 11262.0] + - - [4096, 384, 1, 2048, 4096, 4096, 4096, 2048] + - [1, 11300.0] + - - [1225, 192, 64, 384, 1225, 1225, 1225, 384] + - [28, 11249.0] + - - [289, 128, 64, 1024, 289, 289, 289, 1024] + - [11, 8480.0] + - - [4096, 384, 1, 1536, 4096, 4096, 4096, 1536] + - [1, 11315.0] + - - [289, 192, 64, 1024, 289, 289, 289, 1024] + - [24, 8318.0] + - - [4096, 384, 1, 1280, 4096, 4096, 4096, 1280] + - [1, 11250.0] + - - [4096, 448, 1, 1280, 4096, 4096, 4096, 1280] + - [21, 10690.0] + - - [289, 256, 64, 1024, 289, 289, 289, 1024] + - [11, 8772.0] + - - [4096, 448, 1, 2048, 4096, 4096, 4096, 2048] + - [0, 10791.0] + - - [289, 384, 64, 1024, 289, 289, 289, 1024] + - [11, 9116.0] + - - [1024, 3594, 1, 4096, 1024, 1024, 1024, 4096] + - [8, 11563.0] + - - [4096, 3103, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12076.0] + - - [4096, 3136, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12287.0] + - - [1024, 3141, 1, 4096, 1024, 1024, 1024, 4096] + - [4, 11901.0] + - - [4096, 3559, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12299.0] + - - [4096, 3368, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 11991.0] + - - [1024, 3335, 1, 4096, 1024, 1024, 1024, 4096] + - [4, 11515.0] + - - [1024, 3510, 1, 4096, 1024, 1024, 1024, 4096] + - [39, 10710.0] + - - [4096, 3209, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 11951.0] + - - [4096, 3322, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12355.0] + - - [1024, 3400, 1, 4096, 1024, 1024, 1024, 4096] + - [8, 11823.0] + - - [1024, 3995, 1, 4096, 1024, 1024, 1024, 4096] + - [22, 11757.0] + - - [1024, 3503, 1, 4096, 1024, 1024, 1024, 4096] + - [7, 11580.0] + - - [4096, 3594, 1, 1024, 4096, 4096, 4096, 1024] + - [8, 11950.0] + - - [4096, 3473, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12102.0] + - - [4096, 3522, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12264.0] + - - [1024, 3103, 1, 4096, 1024, 1024, 1024, 4096] + - [4, 11710.0] + - - [1024, 3214, 1, 4096, 1024, 1024, 1024, 4096] + - [11, 10973.0] + - - [4096, 3449, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12240.0] + - - [1024, 3136, 1, 4096, 1024, 1024, 1024, 4096] + - [4, 11867.0] + - - [1024, 3955, 1, 33708, 1024, 1024, 1024, 33708] + - [16, 11687.0] + - - [1024, 3780, 1, 4096, 1024, 1024, 1024, 4096] + - [27, 12058.0] + - - [1024, 3906, 1, 33708, 1024, 1024, 1024, 33708] + - [16, 11530.0] + - - [1024, 3386, 1, 4096, 1024, 1024, 1024, 4096] + - [8, 11757.0] + - - [4096, 3396, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12032.0] + - - [1024, 3183, 1, 4096, 1024, 1024, 1024, 4096] + - [4, 12044.0] + - - [1024, 3098, 1, 4096, 1024, 1024, 1024, 4096] + - [39, 11540.0] + - - [1024, 3548, 1, 4096, 1024, 1024, 1024, 4096] + - [39, 10859.0] + - - [1024, 3224, 1, 4096, 1024, 1024, 1024, 4096] + - [39, 10978.0] + - - [4096, 3469, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12031.0] + - - [1024, 3582, 1, 4096, 1024, 1024, 1024, 4096] + - [6, 11495.0] + - - [1024, 2977, 1, 4096, 1024, 1024, 1024, 4096] + - [27, 11264.0] + - - [1024, 3939, 1, 1024, 1024, 1024, 1024, 1024] + - [35, 11212.0] + - - [4096, 3176, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12374.0] + - - [1024, 3559, 1, 4096, 1024, 1024, 1024, 4096] + - [6, 11398.0] + - - [1024, 3478, 1, 4096, 1024, 1024, 1024, 4096] + - [39, 10635.0] + - - [4096, 3343, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 11880.0] + - - [4096, 3440, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12269.0] + - - [1024, 3996, 1, 33708, 1024, 1024, 1024, 33708] + - [1, 11788.0] + - - [1024, 4012, 1, 4096, 1024, 1024, 1024, 4096] + - [22, 11817.0] + - - [1024, 3322, 1, 4096, 1024, 1024, 1024, 4096] + - [4, 11502.0] + - - [1024, 3990, 1, 33708, 1024, 1024, 1024, 33708] + - [1, 11766.0] + - - [1024, 3314, 1, 4096, 1024, 1024, 1024, 4096] + - [8, 11519.0] + - - [4096, 3513, 1, 1024, 4096, 4096, 4096, 1024] + - [8, 12152.0] + - - [1024, 3562, 1, 4096, 1024, 1024, 1024, 4096] + - [8, 11501.0] + - - [1024, 3443, 1, 4096, 1024, 1024, 1024, 4096] + - [25, 11121.0] + - - [1024, 3554, 1, 4096, 1024, 1024, 1024, 4096] + - [25, 10821.0] + - - [1024, 3063, 1, 4096, 1024, 1024, 1024, 4096] + - [27, 11526.0] + - - [4096, 3460, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 11995.0] + - - [1024, 3209, 1, 4096, 1024, 1024, 1024, 4096] + - [7, 11174.0] + - - [1024, 3147, 1, 4096, 1024, 1024, 1024, 4096] + - [11, 11870.0] + - - [4096, 3387, 1, 1024, 4096, 4096, 4096, 1024] + - [8, 12019.0] + - - [4096, 3436, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12218.0] + - - [1024, 3341, 1, 4096, 1024, 1024, 1024, 4096] + - [4, 11480.0] + - - [1024, 3516, 1, 4096, 1024, 1024, 1024, 4096] + - [27, 10729.0] + - - [4096, 3277, 1, 1024, 4096, 4096, 4096, 1024] + - [8, 12157.0] + - - [1024, 3454, 1, 4096, 1024, 1024, 1024, 4096] + - [8, 11960.0] + - - [1024, 3969, 1, 4096, 1024, 1024, 1024, 4096] + - [25, 10205.0] + - - [1024, 3999, 1, 4096, 1024, 1024, 1024, 4096] + - [22, 11785.0] + - - [1024, 4032, 1, 4096, 1024, 1024, 1024, 4096] + - [22, 11836.0] + - - [4096, 3541, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12231.0] + - - [4096, 3334, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 11910.0] + - - [1024, 3365, 1, 4096, 1024, 1024, 1024, 4096] + - [4, 11582.0] + - - [1024, 3527, 1, 4096, 1024, 1024, 1024, 4096] + - [25, 10776.0] + - - [1024, 3190, 1, 4096, 1024, 1024, 1024, 4096] + - [39, 11816.0] + - - [4096, 3906, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12194.0] + - - [1024, 3593, 1, 4096, 1024, 1024, 1024, 4096] + - [8, 11569.0] + - - [1024, 3336, 1, 4096, 1024, 1024, 1024, 4096] + - [25, 10928.0] + - - [4096, 3504, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12147.0] + - - [4096, 3977, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12009.0] + - - [1024, 3906, 1, 4096, 1024, 1024, 1024, 4096] + - [22, 11611.0] + - - [4096, 3415, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12127.0] + - - [1024, 3295, 1, 4096, 1024, 1024, 1024, 4096] + - [4, 11382.0] + - - [4096, 3321, 1, 1024, 4096, 4096, 4096, 1024] + - [8, 12283.0] + - - [1024, 3072, 1, 4096, 1024, 1024, 1024, 4096] + - [6, 11797.0] + - - [1024, 3408, 1, 4096, 1024, 1024, 1024, 4096] + - [25, 10999.0] + - - [1024, 3522, 1, 4096, 1024, 1024, 1024, 4096] + - [25, 10694.0] + - - [4096, 3751, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12179.0] + - - [4096, 3378, 1, 1024, 4096, 4096, 4096, 1024] + - [8, 12003.0] + - - [1024, 3925, 1, 33708, 1024, 1024, 1024, 33708] + - [1, 11586.0] + - - [1024, 3990, 1, 1024, 1024, 1024, 1024, 1024] + - [22, 11491.0] + - - [1024, 3290, 1, 4096, 1024, 1024, 1024, 4096] + - [8, 11497.0] + - - [4096, 3500, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12131.0] + - - [4096, 3565, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12392.0] + - - [1024, 3484, 1, 4096, 1024, 1024, 1024, 4096] + - [7, 11512.0] + - - [4096, 3395, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12050.0] + - - [1024, 3681, 1, 1024, 1024, 1024, 1024, 1024] + - [8, 11483.0] + - - [1024, 3584, 1, 1024, 1024, 1024, 1024, 1024] + - [7, 11267.0] + - - [4096, 3093, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12099.0] + - - [1024, 4050, 1, 1024, 1024, 1024, 1024, 1024] + - [8, 11620.0] + - - [1024, 3301, 1, 4096, 1024, 1024, 1024, 4096] + - [4, 11394.0] + - - [1024, 3581, 1, 4096, 1024, 1024, 1024, 4096] + - [25, 10799.0] + - - [4096, 3374, 1, 1024, 4096, 4096, 4096, 1024] + - [8, 11976.0] + - - [1024, 3449, 1, 4096, 1024, 1024, 1024, 4096] + - [22, 12054.0] + - - [4096, 3215, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 11895.0] + - - [4096, 3312, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12323.0] + - - [4096, 3479, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12112.0] + - - [4096, 3544, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12303.0] + - - [1024, 3263, 1, 4096, 1024, 1024, 1024, 4096] + - [4, 11246.0] + - - [4096, 3455, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12253.0] + - - [1024, 3379, 1, 4096, 1024, 1024, 1024, 4096] + - [39, 11650.0] + - - [1024, 3490, 1, 4096, 1024, 1024, 1024, 4096] + - [35, 11397.0] + - - [1024, 3368, 1, 4096, 1024, 1024, 1024, 4096] + - [36, 11504.0] + - - [4096, 3186, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12344.0] + - - [1024, 3428, 1, 4096, 1024, 1024, 1024, 4096] + - [4, 11817.0] + - - [4096, 3561, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12347.0] + - - [4096, 3418, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12190.0] + - - [1024, 3064, 1, 4096, 1024, 1024, 1024, 4096] + - [6, 11698.0] + - - [4096, 3259, 1, 1024, 4096, 4096, 4096, 1024] + - [8, 12130.0] + - - [4096, 3308, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12323.0] + - - [1024, 3533, 1, 4096, 1024, 1024, 1024, 4096] + - [8, 11421.0] + - - [1024, 3344, 1, 4096, 1024, 1024, 1024, 4096] + - [39, 10941.0] + - - [1024, 4030, 1, 1024, 1024, 1024, 1024, 1024] + - [8, 11281.0] + - - [4096, 3459, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12032.0] + - - [1024, 3572, 1, 4096, 1024, 1024, 1024, 4096] + - [22, 11518.0] + - - [1024, 3925, 1, 1024, 1024, 1024, 1024, 1024] + - [21, 11145.0] + - - [4096, 3435, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12224.0] + - - [1024, 3956, 1, 4096, 1024, 1024, 1024, 4096] + - [22, 11727.0] + - - [1024, 3463, 1, 4096, 1024, 1024, 1024, 4096] + - [0, 11211.0] + - - [4096, 3182, 1, 1024, 4096, 4096, 4096, 1024] + - [8, 12367.0] + - - [4096, 3976, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12019.0] + - - [1024, 3417, 1, 4096, 1024, 1024, 1024, 4096] + - [8, 11874.0] + - - [1024, 3528, 1, 4096, 1024, 1024, 1024, 4096] + - [39, 10794.0] + - - [4096, 3446, 1, 1024, 4096, 4096, 4096, 1024] + - [8, 12226.0] + - - [1024, 3543, 1, 4096, 1024, 1024, 1024, 4096] + - [8, 11437.0] + - - [4096, 3287, 1, 1024, 4096, 4096, 4096, 1024] + - [8, 12192.0] + - - [1024, 3499, 1, 4096, 1024, 1024, 1024, 4096] + - [7, 11623.0] + - - [1024, 3231, 1, 4096, 1024, 1024, 1024, 4096] + - [39, 10989.0] + - - [4096, 3519, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12183.0] + - - [4096, 3552, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12329.0] + - - [1024, 3458, 1, 4096, 1024, 1024, 1024, 4096] + - [3, 11130.0] + - - [1024, 3374, 1, 4096, 1024, 1024, 1024, 4096] + - [25, 10992.0] + - - [1024, 3396, 1, 4096, 1024, 1024, 1024, 4096] + - [39, 11016.0] + - - [1024, 2967, 1, 4096, 1024, 1024, 1024, 4096] + - [27, 11194.0] + - - [4096, 3482, 1, 1024, 4096, 4096, 4096, 1024] + - [8, 12084.0] + - - [1024, 3226, 1, 4096, 1024, 1024, 1024, 4096] + - [4, 11181.0] + - - [4096, 3377, 1, 1024, 4096, 4096, 4096, 1024] + - [8, 12020.0] + - - [4096, 3426, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12225.0] + - - [4096, 2935, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12393.0] + - - [1024, 3439, 1, 4096, 1024, 1024, 1024, 4096] + - [8, 11945.0] + - - [4096, 3267, 1, 1024, 4096, 4096, 4096, 1024] + - [8, 12152.0] + - - [4096, 3499, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12188.0] + - - [4096, 3356, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 11993.0] + - - [4096, 3939, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12352.0] + - - [1024, 3526, 1, 4096, 1024, 1024, 1024, 4096] + - [6, 11289.0] + - - [1024, 3859, 1, 33708, 1024, 1024, 1024, 33708] + - [36, 11413.0] + - - [1024, 3385, 1, 4096, 1024, 1024, 1024, 4096] + - [8, 11760.0] + - - [1024, 3496, 1, 4096, 1024, 1024, 1024, 4096] + - [39, 10747.0] + - - [4096, 3141, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12239.0] + - - [4096, 3510, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12224.0] + - - [1024, 3434, 1, 4096, 1024, 1024, 1024, 4096] + - [8, 11923.0] + - - [4096, 3969, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 11957.0] + - - [1024, 3121, 1, 4096, 1024, 1024, 1024, 4096] + - [4, 11795.0] + - - [1024, 3232, 1, 4096, 1024, 1024, 1024, 4096] + - [22, 11338.0] + - - [1024, 4030, 1, 33708, 1024, 1024, 1024, 33708] + - [1, 11880.0] + - - [1024, 3780, 1, 33708, 1024, 1024, 1024, 33708] + - [6, 12108.0] + - - [1024, 3969, 1, 1024, 1024, 1024, 1024, 1024] + - [22, 11476.0] + - - [4096, 3527, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12270.0] + - - [4096, 3336, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 11904.0] + - - [4096, 3290, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12279.0] + - - [1024, 3469, 1, 4096, 1024, 1024, 1024, 4096] + - [7, 11436.0] + - - [4096, 3490, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12125.0] + - - [4096, 3064, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12287.0] + - - [4096, 3582, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12438.0] + - - [1024, 3956, 1, 1024, 1024, 1024, 1024, 1024] + - [8, 11424.0] + - - [4096, 3417, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12181.0] + - - [1024, 2736, 1, 4096, 1024, 1024, 1024, 4096] + - [8, 11463.0] + - - [1024, 3205, 1, 4096, 1024, 1024, 1024, 4096] + - [4, 11094.0] + - - [1024, 3143, 1, 4096, 1024, 1024, 1024, 4096] + - [39, 11599.0] + - - [1024, 4020, 1, 4096, 1024, 1024, 1024, 4096] + - [22, 11868.0] + - - [1024, 3318, 1, 4096, 1024, 1024, 1024, 4096] + - [4, 11455.0] + - - [4096, 3364, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 11926.0] + - - [1024, 3353, 1, 4096, 1024, 1024, 1024, 4096] + - [4, 11560.0] + - - [1024, 3464, 1, 4096, 1024, 1024, 1024, 4096] + - [39, 10606.0] + - - [4096, 3205, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 11921.0] + - - [4096, 3318, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12325.0] + - - [1024, 3402, 1, 4096, 1024, 1024, 1024, 4096] + - [4, 11739.0] + - - [4096, 3181, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12364.0] + - - [4096, 3550, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12322.0] + - - [4096, 3445, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12266.0] + - - [1024, 3138, 1, 4096, 1024, 1024, 1024, 4096] + - [4, 11820.0] + - - [4096, 3079, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 11980.0] + - - [4096, 3144, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12290.0] + - - [4096, 3860, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12124.0] + - - [1024, 3515, 1, 4096, 1024, 1024, 1024, 4096] + - [7, 11583.0] + - - [4096, 3408, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12099.0] + - - [1024, 3181, 1, 4096, 1024, 1024, 1024, 4096] + - [4, 12006.0] + - - [4096, 3298, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12252.0] + - - [4096, 3585, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 11977.0] + - - [1024, 3550, 1, 4096, 1024, 1024, 1024, 4096] + - [22, 11447.0] + - - [1024, 4020, 1, 1024, 1024, 1024, 1024, 1024] + - [22, 11532.0] + - - [4096, 3481, 1, 1024, 4096, 4096, 4096, 1024] + - [8, 12074.0] + - - [4096, 3530, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12291.0] + - - [4096, 3425, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12196.0] + - - [4096, 4026, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12140.0] + - - [1024, 3860, 1, 1024, 1024, 1024, 1024, 1024] + - [8, 11179.0] + - - [4096, 3975, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 11984.0] + - - [1024, 3286, 1, 4096, 1024, 1024, 1024, 4096] + - [4, 11332.0] + - - [1024, 3176, 1, 4096, 1024, 1024, 1024, 4096] + - [39, 11790.0] + - - [1024, 3894, 1, 4096, 1024, 1024, 1024, 4096] + - [8, 11573.0] + - - [4096, 3355, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 11905.0] + - - [4096, 3404, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12122.0] + - - [1024, 3501, 1, 4096, 1024, 1024, 1024, 4096] + - [7, 11562.0] + - - [4096, 3245, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12033.0] + - - [1024, 3431, 1, 4096, 1024, 1024, 1024, 4096] + - [8, 11913.0] + - - [1024, 4000, 1, 1024, 1024, 1024, 1024, 1024] + - [39, 11200.0] + - - [4096, 3509, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12194.0] + - - [4096, 3558, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12371.0] + - - [1024, 3535, 1, 4096, 1024, 1024, 1024, 4096] + - [6, 11329.0] + - - [1024, 3414, 1, 4096, 1024, 1024, 1024, 4096] + - [39, 10993.0] + - - [1024, 3445, 1, 4096, 1024, 1024, 1024, 4096] + - [39, 11094.0] + - - [1024, 3436, 1, 4096, 1024, 1024, 1024, 4096] + - [39, 11078.0] + - - [4096, 3472, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12053.0] + - - [1024, 3211, 1, 4096, 1024, 1024, 1024, 4096] + - [4, 11127.0] + - - [4096, 3383, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12052.0] + - - [4096, 3448, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12315.0] + - - [1024, 3343, 1, 4096, 1024, 1024, 1024, 4096] + - [4, 11570.0] + - - [1024, 3518, 1, 4096, 1024, 1024, 1024, 4096] + - [27, 10717.0] + - - [4096, 3289, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12216.0] + - - [1024, 3440, 1, 4096, 1024, 1024, 1024, 4096] + - [4, 11907.0] + - - [1024, 4032, 1, 33708, 1024, 1024, 1024, 33708] + - [16, 11905.0] + - - [4096, 3489, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12146.0] + - - [4096, 3346, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 11958.0] + - - [1024, 3534, 1, 4096, 1024, 1024, 1024, 4096] + - [8, 11410.0] + - - [1024, 3079, 1, 4096, 1024, 1024, 1024, 4096] + - [39, 11431.0] + - - [1024, 3955, 1, 4096, 1024, 1024, 1024, 4096] + - [25, 10122.0] + - - [4096, 3236, 1, 1024, 4096, 4096, 4096, 1024] + - [8, 12041.0] + - - [1024, 3545, 1, 4096, 1024, 1024, 1024, 4096] + - [8, 11444.0] + - - [1024, 3144, 1, 4096, 1024, 1024, 1024, 4096] + - [39, 11650.0] + - - [4096, 3780, 1, 1024, 4096, 4096, 4096, 1024] + - [8, 12271.0] + - - [4096, 3163, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12377.0] + - - [4096, 3468, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12099.0] + - - [1024, 3539, 1, 4096, 1024, 1024, 1024, 4096] + - [8, 11443.0] + - - [1024, 3541, 1, 4096, 1024, 1024, 1024, 4096] + - [39, 10794.0] + - - [4096, 3363, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 11969.0] + - - [1024, 3475, 1, 4096, 1024, 1024, 1024, 4096] + - [6, 11133.0] + - - [4096, 3110, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12151.0] + - - [1024, 3509, 1, 4096, 1024, 1024, 1024, 4096] + - [7, 11624.0] + - - [1024, 3413, 1, 4096, 1024, 1024, 1024, 4096] + - [25, 11020.0] + - - [1024, 3975, 1, 1024, 1024, 1024, 1024, 1024] + - [8, 11116.0] + - - [4096, 3549, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12318.0] + - - [4096, 3342, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 11935.0] + - - [1024, 2985, 1, 4096, 1024, 1024, 1024, 4096] + - [6, 11369.0] + - - [1024, 3876, 1, 33708, 1024, 1024, 1024, 33708] + - [29, 11469.0] + - - [4096, 3280, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12238.0] + - - [4096, 3191, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12457.0] + - - [4096, 3512, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12272.0] + - - [1024, 3560, 1, 4096, 1024, 1024, 1024, 4096] + - [8, 11485.0] + - - [4096, 2499, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12119.0] + - - [1024, 3248, 1, 4096, 1024, 1024, 1024, 4096] + - [4, 11278.0] + - - [4096, 3423, 1, 1024, 4096, 4096, 4096, 1024] + - [8, 12157.0] + - - [4096, 3297, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12290.0] + - - [4096, 3154, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12349.0] + - - [1024, 3303, 1, 4096, 1024, 1024, 1024, 4096] + - [4, 11445.0] + - - [1024, 3222, 1, 4096, 1024, 1024, 1024, 4096] + - [39, 10967.0] + - - [1024, 3978, 1, 1024, 1024, 1024, 1024, 1024] + - [8, 11460.0] + - - [4096, 3529, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12226.0] + - - [4096, 3386, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12084.0] + - - [1024, 3451, 1, 4096, 1024, 1024, 1024, 4096] + - [8, 11971.0] + - - [4096, 3562, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12294.0] + - - [4096, 3276, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12177.0] + - - [1024, 3894, 1, 33708, 1024, 1024, 1024, 33708] + - [16, 11500.0] + - - [4096, 3540, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12323.0] + - - [1024, 3416, 1, 4096, 1024, 1024, 1024, 4096] + - [22, 11953.0] + - - [1024, 4005, 1, 33708, 1024, 1024, 1024, 33708] + - [1, 11814.0] + - - [1024, 3942, 1, 4096, 1024, 1024, 1024, 4096] + - [22, 11698.0] + - - [4096, 3403, 1, 1024, 4096, 4096, 4096, 1024] + - [8, 12099.0] + - - [4096, 3381, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12035.0] + - - [1024, 3492, 1, 4096, 1024, 1024, 1024, 4096] + - [3, 11209.0] + - - [4096, 3101, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12054.0] + - - [1024, 3430, 1, 4096, 1024, 1024, 1024, 4096] + - [4, 11873.0] + - - [1024, 3977, 1, 4096, 1024, 1024, 1024, 4096] + - [25, 10005.0] + - - [1024, 3640, 1, 4096, 1024, 1024, 1024, 4096] + - [4, 10718.0] + - - [4096, 3557, 1, 1024, 4096, 4096, 4096, 1024] + - [8, 12275.0] + - - [4096, 3414, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12154.0] + - - [1024, 3391, 1, 4096, 1024, 1024, 1024, 4096] + - [4, 11706.0] + - - [1024, 3356, 1, 4096, 1024, 1024, 1024, 4096] + - [39, 10931.0] + - - [4096, 3320, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12339.0] + - - [4096, 2765, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12037.0] + - - [1024, 3411, 1, 4096, 1024, 1024, 1024, 4096] + - [4, 11783.0] + - - [1024, 3978, 1, 4096, 1024, 1024, 1024, 4096] + - [22, 11776.0] + - - [4096, 3487, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12111.0] + - - [4096, 3520, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12232.0] + - - [4096, 3942, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12343.0] + - - [4096, 3431, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12235.0] + - - [1024, 3271, 1, 4096, 1024, 1024, 1024, 4096] + - [4, 11331.0] + - - [4096, 4020, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12093.0] + - - [1024, 3481, 1, 4096, 1024, 1024, 1024, 4096] + - [3, 11171.0] + - - [1024, 3419, 1, 4096, 1024, 1024, 1024, 4096] + - [8, 11853.0] + - - [1024, 4059, 1, 4096, 1024, 1024, 1024, 4096] + - [36, 12039.0] + - - [4096, 3345, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 11910.0] + - - [4096, 3394, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12100.0] + - - [1024, 3298, 1, 4096, 1024, 1024, 1024, 4096] + - [8, 11471.0] + - - [4096, 3235, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12032.0] + - - [1024, 3681, 1, 33708, 1024, 1024, 1024, 33708] + - [6, 11804.0] + - - [1024, 3362, 1, 4096, 1024, 1024, 1024, 4096] + - [8, 11667.0] + - - [4096, 3467, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12018.0] + - - [1024, 3349, 1, 4096, 1024, 1024, 1024, 4096] + - [4, 11571.0] + - - [1024, 3460, 1, 4096, 1024, 1024, 1024, 4096] + - [19, 10615.0] + - - [4096, 3214, 1, 1024, 4096, 4096, 4096, 1024] + - [8, 11970.0] + - - [1024, 3398, 1, 4096, 1024, 1024, 1024, 4096] + - [4, 11761.0] + - - [4096, 3478, 1, 1024, 4096, 4096, 4096, 1024] + - [8, 12046.0] + - - [1024, 4050, 1, 33708, 1024, 1024, 1024, 33708] + - [1, 11951.0] + - - [1024, 3244, 1, 4096, 1024, 1024, 1024, 4096] + - [8, 11361.0] + - - [4096, 3341, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 11859.0] + - - [4096, 3454, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12322.0] + - - [1024, 3166, 1, 4096, 1024, 1024, 1024, 4096] + - [4, 11973.0] + - - [1024, 3425, 1, 4096, 1024, 1024, 1024, 4096] + - [26, 10884.0] + - - [4096, 3295, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12204.0] + - - [4096, 3072, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12338.0] + - - [4096, 3822, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12430.0] + - - [1024, 3681, 1, 4096, 1024, 1024, 1024, 4096] + - [22, 11777.0] + - - [1024, 4050, 1, 4096, 1024, 1024, 1024, 4096] + - [22, 11918.0] + - - [4096, 3495, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12129.0] + - - [4096, 3560, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12384.0] + - - [1024, 3524, 1, 4096, 1024, 1024, 1024, 4096] + - [6, 11274.0] + - - [1024, 3942, 1, 33708, 1024, 1024, 1024, 33708] + - [16, 11646.0] + - - [1024, 3304, 1, 4096, 1024, 1024, 1024, 4096] + - [8, 11494.0] + - - [1024, 3387, 1, 4096, 1024, 1024, 1024, 4096] + - [39, 11003.0] + - - [1024, 3498, 1, 4096, 1024, 1024, 1024, 4096] + - [39, 10705.0] + - - [4096, 3458, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 11990.0] + - - [4096, 2967, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 11934.0] + - - [4096, 3385, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12070.0] + - - [4096, 3434, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12235.0] + - - [1024, 3519, 1, 4096, 1024, 1024, 1024, 4096] + - [7, 11657.0] + - - [1024, 3511, 1, 4096, 1024, 1024, 1024, 4096] + - [35, 11283.0] + - - [1024, 3288, 1, 4096, 1024, 1024, 1024, 4096] + - [25, 11395.0] + - - [1024, 2918, 1, 4096, 1024, 1024, 1024, 4096] + - [27, 11080.0] + - - [4096, 3573, 1, 1024, 4096, 4096, 4096, 1024] + - [8, 12364.0] + - - [1024, 3822, 1, 33708, 1024, 1024, 1024, 33708] + - [20, 12244.0] + - - [4096, 3539, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12302.0] + - - [4096, 3332, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 11915.0] + - - [4096, 3286, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12272.0] + - - [1024, 4026, 1, 4096, 1024, 1024, 1024, 4096] + - [22, 11869.0] + - - [1024, 3277, 1, 4096, 1024, 1024, 1024, 4096] + - [4, 11381.0] + - - [1024, 3471, 1, 4096, 1024, 1024, 1024, 4096] + - [39, 10649.0] + - - [4096, 3518, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12210.0] + - - [1024, 3393, 1, 4096, 1024, 1024, 1024, 4096] + - [4, 11717.0] + - - [4096, 3413, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12117.0] + - - [4096, 3303, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12308.0] + - - [1024, 3207, 1, 4096, 1024, 1024, 1024, 4096] + - [4, 11124.0] + - - [1024, 3894, 1, 1024, 1024, 1024, 1024, 1024] + - [35, 11044.0] + - - [1024, 3977, 1, 1024, 1024, 1024, 1024, 1024] + - [8, 11500.0] + - - [4096, 3535, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12244.0] + - - [4096, 3376, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12041.0] + - - [1024, 3355, 1, 4096, 1024, 1024, 1024, 4096] + - [4, 11599.0] + - - [1024, 3466, 1, 4096, 1024, 1024, 1024, 4096] + - [39, 10605.0] + - - [4096, 3266, 1, 1024, 4096, 4096, 4096, 1024] + - [8, 12110.0] + - - [1024, 3404, 1, 4096, 1024, 1024, 1024, 4096] + - [4, 11770.0] + - - [1024, 3999, 1, 1024, 1024, 1024, 1024, 1024] + - [8, 11240.0] + - - [4096, 3498, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12132.0] + - - [1024, 4032, 1, 1024, 1024, 1024, 1024, 1024] + - [22, 11588.0] + - - [1024, 3410, 1, 4096, 1024, 1024, 1024, 4096] + - [8, 11858.0] + - - [4096, 3393, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12043.0] + - - [1024, 3140, 1, 4096, 1024, 1024, 1024, 4096] + - [4, 11886.0] + - - [1024, 3910, 1, 33708, 1024, 1024, 1024, 33708] + - [29, 11555.0] + - - [1024, 3334, 1, 4096, 1024, 1024, 1024, 4096] + - [8, 11592.0] + - - [4096, 3140, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12197.0] + - - [1024, 4005, 1, 4096, 1024, 1024, 1024, 4096] + - [25, 11781.0] + - - [1024, 3579, 1, 4096, 1024, 1024, 1024, 4096] + - [4, 11432.0] + - - [4096, 3372, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 11968.0] + - - [1024, 3245, 1, 4096, 1024, 1024, 1024, 4096] + - [4, 11264.0] + - - [4096, 3956, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12312.0] + - - [4096, 3213, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 11966.0] + - - [1024, 3361, 1, 4096, 1024, 1024, 1024, 4096] + - [4, 11608.0] + - - [1024, 3536, 1, 4096, 1024, 1024, 1024, 4096] + - [39, 10815.0] + - - [4096, 3477, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12069.0] + - - [4096, 3526, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12230.0] + - - [1024, 4005, 1, 1024, 1024, 1024, 1024, 1024] + - [22, 11528.0] + - - [1024, 3530, 1, 4096, 1024, 1024, 1024, 4096] + - [6, 11302.0] + - - [1024, 3944, 1, 4096, 1024, 1024, 1024, 4096] + - [36, 11608.0] + - - [4096, 3453, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12261.0] + - - [4096, 3184, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12411.0] + - - [4096, 3579, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12464.0] + - - [4096, 3351, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 11982.0] + - - [4096, 3416, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12235.0] + - - [1024, 3822, 1, 4096, 1024, 1024, 1024, 4096] + - [27, 12173.0] + - - [1024, 3796, 1, 4096, 1024, 1024, 1024, 4096] + - [27, 12110.0] + - - [4096, 3257, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12109.0] + - - [4096, 3306, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12295.0] + - - [1024, 3505, 1, 4096, 1024, 1024, 1024, 4096] + - [7, 11600.0] + - - [1024, 3315, 1, 4096, 1024, 1024, 1024, 4096] + - [39, 11091.0] + - - [1024, 3486, 1, 4096, 1024, 1024, 1024, 4096] + - [25, 10733.0] + - - [4096, 3457, 1, 1024, 4096, 4096, 4096, 1024] + - [8, 11980.0] + - - [4096, 3870, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12130.0] + - - [1024, 3447, 1, 4096, 1024, 1024, 1024, 4096] + - [8, 11910.0] + - - [1024, 3558, 1, 4096, 1024, 1024, 1024, 4096] + - [25, 10772.0] + - - [4096, 3433, 1, 1024, 4096, 4096, 4096, 1024] + - [8, 12174.0] + - - [4096, 3180, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12403.0] + - - [1024, 3213, 1, 4096, 1024, 1024, 1024, 4096] + - [4, 11136.0] + - - [1024, 3900, 1, 4096, 1024, 1024, 1024, 4096] + - [8, 11604.0] + - - [4096, 3444, 1, 1024, 4096, 4096, 4096, 1024] + - [8, 12233.0] + - - [1024, 3504, 1, 4096, 1024, 1024, 1024, 4096] + - [6, 11237.0] + - - [4096, 4059, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12201.0] + - - [1024, 3442, 1, 4096, 1024, 1024, 1024, 4096] + - [4, 11851.0] + - - [4096, 3517, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12201.0] + - - [1024, 3566, 1, 4096, 1024, 1024, 1024, 4096] + - [8, 11508.0] + - - [4096, 3248, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12089.0] + - - [1024, 3547, 1, 4096, 1024, 1024, 1024, 4096] + - [8, 11446.0] + - - [1024, 3340, 1, 4096, 1024, 1024, 1024, 4096] + - [39, 10937.0] + - - [4096, 3480, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12071.0] + - - [4096, 3424, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12215.0] + - - [1024, 3906, 1, 1024, 1024, 1024, 1024, 1024] + - [7, 11336.0] + - - [4096, 3265, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12171.0] + - - [1024, 3384, 1, 4096, 1024, 1024, 1024, 4096] + - [8, 11750.0] + - - [1024, 3494, 1, 4096, 1024, 1024, 1024, 4096] + - [25, 10713.0] + - - [1024, 3236, 1, 4096, 1024, 1024, 1024, 4096] + - [39, 10985.0] + - - [4096, 3497, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12087.0] + - - [4096, 3354, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 11954.0] + - - [4096, 3055, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12253.0] + - - [4096, 3244, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12100.0] + - - [4096, 3139, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12272.0] + - - [4096, 3508, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12198.0] + - - [4096, 4050, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12207.0] + - - [1024, 3472, 1, 4096, 1024, 1024, 1024, 4096] + - [7, 11478.0] + - - [1024, 3861, 1, 1024, 1024, 1024, 1024, 1024] + - [21, 10957.0] + - - [1024, 3910, 1, 1024, 1024, 1024, 1024, 1024] + - [4, 11177.0] + - - [4096, 3371, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12005.0] + - - [1024, 3751, 1, 4096, 1024, 1024, 1024, 4096] + - [27, 11988.0] + - - [4096, 3325, 1, 1024, 4096, 4096, 4096, 1024] + - [8, 12347.0] + - - [1024, 3321, 1, 4096, 1024, 1024, 1024, 4096] + - [8, 11550.0] + - - [1024, 3944, 1, 1024, 1024, 1024, 1024, 1024] + - [21, 11439.0] + - - [4096, 3525, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12237.0] + - - [4096, 3382, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12081.0] + - - [1024, 3453, 1, 4096, 1024, 1024, 1024, 4096] + - [8, 11987.0] + - - [4096, 3564, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12338.0] + - - [4096, 3288, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12270.0] + - - [1024, 3925, 1, 4096, 1024, 1024, 1024, 4096] + - [22, 11671.0] + - - [1024, 3057, 1, 4096, 1024, 1024, 1024, 4096] + - [6, 11642.0] + - - [4096, 3488, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12089.0] + - - [4096, 3046, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12233.0] + - - [1024, 3189, 1, 4096, 1024, 1024, 1024, 4096] + - [4, 12036.0] + - - [4096, 3399, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12062.0] + - - [1024, 3383, 1, 4096, 1024, 1024, 1024, 4096] + - [36, 11795.0] + - - [1024, 3415, 1, 4096, 1024, 1024, 1024, 4096] + - [22, 11937.0] + - - [1024, 3388, 1, 4096, 1024, 1024, 1024, 4096] + - [4, 11660.0] + - - [1024, 3376, 1, 4096, 1024, 1024, 1024, 4096] + - [39, 10923.0] + - - [1024, 3473, 1, 4096, 1024, 1024, 1024, 4096] + - [25, 10593.0] + - - [4096, 3162, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12253.0] + - - [1024, 3448, 1, 4096, 1024, 1024, 1024, 4096] + - [22, 12025.0] + - - [4096, 3362, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 11913.0] + - - [1024, 3262, 1, 4096, 1024, 1024, 1024, 4096] + - [4, 11319.0] + - - [1024, 3184, 1, 4096, 1024, 1024, 1024, 4096] + - [39, 11821.0] + - - [1024, 3378, 1, 4096, 1024, 1024, 1024, 4096] + - [39, 11007.0] + - - [4096, 3548, 1, 1024, 4096, 4096, 4096, 1024] + - [8, 12300.0] + - - [4096, 2977, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 11970.0] + - - [4096, 3443, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12277.0] + - - [1024, 3289, 1, 4096, 1024, 1024, 1024, 4096] + - [4, 11392.0] + - - [1024, 3483, 1, 4096, 1024, 1024, 1024, 4096] + - [19, 10669.0] + - - [4096, 3190, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12423.0] + - - [1024, 3421, 1, 4096, 1024, 1024, 1024, 4096] + - [22, 11923.0] + - - [1024, 3514, 1, 4096, 1024, 1024, 1024, 4096] + - [25, 10775.0] + - - [1024, 3532, 1, 4096, 1024, 1024, 1024, 4096] + - [39, 10746.0] + - - [1024, 3565, 1, 4096, 1024, 1024, 1024, 4096] + - [25, 10868.0] + - - [4096, 3422, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12147.0] + - - [4096, 3263, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12156.0] + - - [4096, 3296, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12293.0] + - - [4096, 3640, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12165.0] + - - [4096, 3463, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12059.0] + - - [4096, 3528, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12279.0] + - - [1024, 3351, 1, 4096, 1024, 1024, 1024, 4096] + - [8, 11631.0] + - - [1024, 3462, 1, 4096, 1024, 1024, 1024, 4096] + - [25, 10572.0] + - - [4096, 3226, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 11994.0] + - - [4096, 3439, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12238.0] + - - [4096, 3121, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12177.0] + - - [1024, 4059, 1, 33708, 1024, 1024, 1024, 33708] + - [1, 11978.0] + - - [1024, 3311, 1, 4096, 1024, 1024, 1024, 4096] + - [8, 11518.0] + - - [1024, 3230, 1, 4096, 1024, 1024, 1024, 4096] + - [39, 10991.0] + - - [4096, 3353, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 11917.0] + - - [4096, 3402, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12121.0] + - - [1024, 3427, 1, 4096, 1024, 1024, 1024, 4096] + - [22, 11950.0] + - - [1024, 3346, 1, 4096, 1024, 1024, 1024, 4096] + - [39, 10867.0] + - - [1024, 3126, 1, 4096, 1024, 1024, 1024, 4096] + - [39, 11553.0] + - - [1024, 3796, 1, 1024, 1024, 1024, 1024, 1024] + - [8, 11473.0] + - - [1024, 3990, 1, 4096, 1024, 1024, 1024, 4096] + - [22, 11774.0] + - - [1024, 3257, 1, 4096, 1024, 1024, 1024, 4096] + - [4, 11306.0] + - - [4096, 3996, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12032.0] + - - [1024, 3306, 1, 4096, 1024, 1024, 1024, 4096] + - [4, 11424.0] + - - [1024, 3389, 1, 4096, 1024, 1024, 1024, 4096] + - [25, 11032.0] + - - [1024, 3500, 1, 4096, 1024, 1024, 1024, 4096] + - [25, 10702.0] + - - [1024, 3999, 1, 33708, 1024, 1024, 1024, 33708] + - [25, 11837.0] + - - [4096, 3486, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12111.0] + - - [1024, 3438, 1, 4096, 1024, 1024, 1024, 4096] + - [8, 11934.0] + - - [4096, 3616, 1, 1024, 4096, 4096, 4096, 1024] + - [8, 12008.0] + - - [1024, 3955, 1, 1024, 1024, 1024, 1024, 1024] + - [7, 11407.0] + - - [4096, 3430, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12211.0] + - - [4096, 3271, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12197.0] + - - [1024, 3364, 1, 4096, 1024, 1024, 1024, 4096] + - [4, 11594.0] + - - [1024, 3497, 1, 4096, 1024, 1024, 1024, 4096] + - [25, 10747.0] + - - [4096, 3503, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12132.0] + - - [4096, 3344, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 11937.0] + - - [1024, 3457, 1, 4096, 1024, 1024, 1024, 4096] + - [6, 11051.0] + - - [4096, 3466, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12020.0] + - - [1024, 3976, 1, 33708, 1024, 1024, 1024, 33708] + - [1, 11732.0] + - - [1024, 3395, 1, 4096, 1024, 1024, 1024, 4096] + - [8, 11780.0] + - - [4096, 3361, 1, 1024, 4096, 4096, 4096, 1024] + - [8, 11968.0] + - - [1024, 3751, 1, 33708, 1024, 1024, 1024, 33708] + - [6, 12018.0] + - - [1024, 3822, 1, 1024, 1024, 1024, 1024, 1024] + - [8, 11756.0] + - - [4096, 3315, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12339.0] + - - [1024, 3163, 1, 4096, 1024, 1024, 1024, 4096] + - [8, 12081.0] + - - [4096, 3547, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12263.0] + - - [4096, 3340, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 11918.0] + - - [1024, 3296, 1, 4096, 1024, 1024, 1024, 4096] + - [4, 11378.0] + - - [1024, 3468, 1, 4096, 1024, 1024, 1024, 4096] + - [19, 10587.0] + - - [4096, 3294, 1, 1024, 4096, 4096, 4096, 1024] + - [8, 12207.0] + - - [1024, 3406, 1, 4096, 1024, 1024, 1024, 4096] + - [8, 11793.0] + - - [1024, 3860, 1, 33708, 1024, 1024, 1024, 33708] + - [1, 11410.0] + - - [1024, 3584, 1, 4096, 1024, 1024, 1024, 4096] + - [8, 11559.0] + - - [4096, 3189, 1, 1024, 4096, 4096, 4096, 1024] + - [8, 12386.0] + - - [4096, 3494, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12185.0] + - - [1024, 3093, 1, 4096, 1024, 1024, 1024, 4096] + - [4, 11657.0] + - - [4096, 3421, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12154.0] + - - [1024, 3479, 1, 4096, 1024, 1024, 1024, 4096] + - [7, 11507.0] + - - [1024, 3433, 1, 4096, 1024, 1024, 1024, 4096] + - [22, 11969.0] + - - [4096, 3311, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12256.0] + - - [1024, 3381, 1, 4096, 1024, 1024, 1024, 4096] + - [8, 11760.0] + - - [1024, 3996, 1, 4096, 1024, 1024, 1024, 4096] + - [22, 11784.0] + - - [4096, 3384, 1, 1024, 4096, 4096, 4096, 1024] + - [8, 12010.0] + - - [1024, 3247, 1, 4096, 1024, 1024, 1024, 4096] + - [4, 11253.0] + - - [1024, 3169, 1, 4096, 1024, 1024, 1024, 4096] + - [39, 11761.0] + - - [1024, 3088, 1, 4096, 1024, 1024, 1024, 4096] + - [11, 11486.0] + - - [1024, 3363, 1, 4096, 1024, 1024, 1024, 4096] + - [39, 10958.0] + - - [1024, 3538, 1, 4096, 1024, 1024, 1024, 4096] + - [25, 10807.0] + - - [1024, 3996, 1, 1024, 1024, 1024, 1024, 1024] + - [22, 11170.0] + - - [4096, 3169, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12339.0] + - - [4096, 3538, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12255.0] + - - [4096, 3401, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12129.0] + - - [4096, 3581, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12428.0] + - - [1024, 3180, 1, 4096, 1024, 1024, 1024, 4096] + - [4, 12014.0] + - - [1024, 3870, 1, 1024, 1024, 1024, 1024, 1024] + - [8, 10997.0] + - - [4096, 3555, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12345.0] + - - [4096, 3412, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12170.0] + - - [4096, 3302, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12320.0] + - - [1024, 3561, 1, 4096, 1024, 1024, 1024, 4096] + - [8, 11486.0] + - - [1024, 3302, 1, 4096, 1024, 1024, 1024, 4096] + - [19, 10865.0] + - - [1024, 3976, 1, 4096, 1024, 1024, 1024, 4096] + - [25, 9990.0] + - - [4096, 3485, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12076.0] + - - [4096, 3534, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12316.0] + - - [1024, 3110, 1, 4096, 1024, 1024, 1024, 4096] + - [4, 11750.0] + - - [1024, 3401, 1, 4096, 1024, 1024, 1024, 4096] + - [25, 10889.0] + - - [4096, 3216, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 11974.0] + - - [1024, 4020, 1, 33708, 1024, 1024, 1024, 33708] + - [1, 11867.0] + - - [1024, 3215, 1, 4096, 1024, 1024, 1024, 4096] + - [8, 11267.0] + - - [4096, 3566, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12309.0] + - - [1024, 3137, 1, 4096, 1024, 1024, 1024, 4096] + - [4, 11842.0] + - - [4096, 3359, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 11958.0] + - - [4096, 3392, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12125.0] + - - [1024, 3506, 1, 4096, 1024, 1024, 1024, 4096] + - [7, 11638.0] + - - [4096, 3233, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12044.0] + - - [1024, 3444, 1, 4096, 1024, 1024, 1024, 4096] + - [22, 12046.0] + - - [1024, 3975, 1, 4096, 1024, 1024, 1024, 4096] + - [25, 11716.0] + - - [1024, 3870, 1, 33708, 1024, 1024, 1024, 33708] + - [16, 11438.0] + - - [4096, 3465, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12048.0] + - - [1024, 3523, 1, 4096, 1024, 1024, 1024, 4096] + - [8, 11370.0] + - - [4096, 3990, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12008.0] + - - [1024, 3549, 1, 4096, 1024, 1024, 1024, 4096] + - [6, 11356.0] + - - [1024, 3342, 1, 4096, 1024, 1024, 1024, 4096] + - [39, 10915.0] + - - [4096, 3476, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12060.0] + - - [1024, 3418, 1, 4096, 1024, 1024, 1024, 4096] + - [22, 11990.0] + - - [1024, 3859, 1, 1024, 1024, 1024, 1024, 1024] + - [21, 10915.0] + - - [4096, 3339, 1, 1024, 4096, 4096, 4096, 1024] + - [8, 11875.0] + - - [4096, 3452, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12286.0] + - - [4096, 3293, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12264.0] + - - [1024, 3369, 1, 4096, 1024, 1024, 1024, 4096] + - [8, 11725.0] + - - [1024, 3544, 1, 4096, 1024, 1024, 1024, 4096] + - [25, 10840.0] + - - [4096, 3493, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12116.0] + - - [4096, 3350, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 11983.0] + - - [4096, 3256, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12161.0] + - - [1024, 3870, 1, 4096, 1024, 1024, 1024, 4096] + - [8, 11537.0] + - - [4096, 4012, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12073.0] + - - [1024, 3280, 1, 4096, 1024, 1024, 1024, 4096] + - [4, 11341.0] + - - [4096, 3456, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12296.0] + - - [1024, 3555, 1, 4096, 1024, 1024, 1024, 4096] + - [6, 11400.0] + - - [4096, 3014, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12063.0] + - - [1024, 3474, 1, 4096, 1024, 1024, 1024, 4096] + - [7, 11524.0] + - - [4096, 3367, 1, 1024, 4096, 4096, 4096, 1024] + - [8, 11976.0] + - - [4096, 3432, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12239.0] + - - [4096, 3273, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12167.0] + - - [4096, 3130, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12261.0] + - - [1024, 2984, 1, 4096, 1024, 1024, 1024, 4096] + - [6, 11404.0] + - - [1024, 3995, 1, 1024, 1024, 1024, 1024, 1024] + - [8, 11530.0] + - - [1024, 3517, 1, 4096, 1024, 1024, 1024, 4096] + - [3, 11318.0] + - - [1024, 3455, 1, 4096, 1024, 1024, 1024, 4096] + - [39, 11067.0] + - - [1024, 3939, 1, 4096, 1024, 1024, 1024, 4096] + - [22, 11651.0] + - - [4096, 3147, 1, 1024, 4096, 4096, 4096, 1024] + - [8, 12247.0] + - - [4096, 3516, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12247.0] + - - [1024, 3876, 1, 4096, 1024, 1024, 1024, 4096] + - [22, 11518.0] + - - [1024, 3191, 1, 4096, 1024, 1024, 1024, 4096] + - [4, 12088.0] + - - [4096, 3411, 1, 1024, 4096, 4096, 4096, 1024] + - [8, 12163.0] + - - [1024, 3337, 1, 4096, 1024, 1024, 1024, 4096] + - [4, 11552.0] + - - [1024, 3512, 1, 4096, 1024, 1024, 1024, 4096] + - [25, 10749.0] + - - [4096, 3301, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12266.0] + - - [1024, 3450, 1, 4096, 1024, 1024, 1024, 4096] + - [22, 12057.0] + - - [4096, 3533, 1, 1024, 4096, 4096, 4096, 1024] + - [8, 12250.0] + - - [4096, 3390, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12115.0] + - - [4096, 3231, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12080.0] + - - [1024, 2499, 1, 4096, 1024, 1024, 1024, 4096] + - [8, 11852.0] + - - [1024, 3186, 1, 4096, 1024, 1024, 1024, 4096] + - [4, 12079.0] + - - [1024, 3380, 1, 4096, 1024, 1024, 1024, 4096] + - [39, 10993.0] + - - [4096, 3496, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12124.0] + - - [1024, 3956, 1, 33708, 1024, 1024, 1024, 33708] + - [16, 11695.0] + - - [1024, 3976, 1, 1024, 1024, 1024, 1024, 1024] + - [22, 11376.0] + - - [4096, 2736, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 11913.0] + - - [1024, 3291, 1, 4096, 1024, 1024, 1024, 4096] + - [8, 11491.0] + - - [1024, 3944, 1, 33708, 1024, 1024, 1024, 33708] + - [16, 11657.0] + - - [1024, 3485, 1, 4096, 1024, 1024, 1024, 4096] + - [7, 11574.0] + - - [4096, 3138, 1, 1024, 4096, 4096, 4096, 1024] + - [8, 12166.0] + - - [1024, 3423, 1, 4096, 1024, 1024, 1024, 4096] + - [8, 11854.0] + - - [1024, 3491, 1, 4096, 1024, 1024, 1024, 4096] + - [39, 10694.0] + - - [1024, 3860, 1, 4096, 1024, 1024, 1024, 4096] + - [39, 10153.0] + - - [4096, 3211, 1, 1024, 4096, 4096, 4096, 1024] + - [8, 11921.0] + - - [1024, 3221, 1, 4096, 1024, 1024, 1024, 4096] + - [4, 11148.0] + - - [1024, 2917, 1, 4096, 1024, 1024, 1024, 4096] + - [27, 11051.0] + - - [4096, 3475, 1, 1024, 4096, 4096, 4096, 1024] + - [8, 12017.0] + - - [4096, 3524, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12241.0] + - - [4096, 2985, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 11971.0] + - - [1024, 3480, 1, 4096, 1024, 1024, 1024, 4096] + - [6, 11142.0] + - - [4096, 3222, 1, 1024, 4096, 4096, 4096, 1024] + - [8, 11955.0] + - - [4096, 3451, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12274.0] + - - [1024, 3969, 1, 33708, 1024, 1024, 1024, 33708] + - [1, 11717.0] + - - [1024, 3640, 1, 1024, 1024, 1024, 1024, 1024] + - [21, 11344.0] + - - [1024, 3297, 1, 4096, 1024, 1024, 1024, 4096] + - [8, 11486.0] + - - [4096, 3944, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12298.0] + - - [1024, 3216, 1, 4096, 1024, 1024, 1024, 4096] + - [4, 11112.0] + - - [4096, 3349, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 11901.0] + - - [4096, 3398, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12089.0] + - - [1024, 3154, 1, 4096, 1024, 1024, 1024, 4096] + - [4, 11889.0] + - - [1024, 3978, 1, 33708, 1024, 1024, 1024, 33708] + - [16, 11730.0] + - - [1024, 3348, 1, 4096, 1024, 1024, 1024, 4096] + - [8, 11622.0] + - - [4096, 3304, 1, 1024, 4096, 4096, 4096, 1024] + - [8, 12219.0] + - - [4096, 4030, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12151.0] + - - [1024, 4026, 1, 1024, 1024, 1024, 1024, 1024] + - [22, 11651.0] + - - [4096, 3471, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12084.0] + - - [1024, 3259, 1, 4096, 1024, 1024, 1024, 4096] + - [4, 11239.0] + - - [1024, 3308, 1, 4096, 1024, 1024, 1024, 4096] + - [39, 11083.0] + - - [4096, 3391, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12045.0] + - - [1024, 3312, 1, 4096, 1024, 1024, 1024, 4096] + - [4, 11448.0] + - - [1024, 3502, 1, 4096, 1024, 1024, 1024, 4096] + - [25, 10726.0] + - - [1024, 3968, 1, 33708, 1024, 1024, 1024, 33708] + - [8, 11706.0] + - - [1024, 3424, 1, 4096, 1024, 1024, 1024, 4096] + - [8, 11865.0] + - - [4096, 4032, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12119.0] + - - [1024, 3900, 1, 1024, 1024, 1024, 1024, 1024] + - [8, 11309.0] + - - [4096, 3442, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12230.0] + - - [1024, 3366, 1, 4096, 1024, 1024, 1024, 4096] + - [8, 11700.0] + - - [4096, 3999, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12017.0] + - - [1024, 3477, 1, 4096, 1024, 1024, 1024, 4096] + - [7, 11522.0] + - - [1024, 2505, 1, 4096, 1024, 1024, 1024, 4096] + - [8, 11745.0] + - - [4096, 3515, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12170.0] + - - [1024, 3564, 1, 4096, 1024, 1024, 1024, 4096] + - [4, 11386.0] + - - [4096, 3057, 1, 1024, 4096, 4096, 4096, 1024] + - [8, 12229.0] + - - [1024, 3339, 1, 4096, 1024, 1024, 1024, 4096] + - [4, 11550.0] + - - [4096, 3262, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12137.0] + - - [1024, 4030, 1, 4096, 1024, 1024, 1024, 4096] + - [22, 11860.0] + - - [1024, 3265, 1, 4096, 1024, 1024, 1024, 4096] + - [4, 11338.0] + - - [1024, 3459, 1, 4096, 1024, 1024, 1024, 4096] + - [25, 10634.0] + - - [4096, 3462, 1, 1024, 4096, 4096, 4096, 1024] + - [8, 12022.0] + - - [1024, 3513, 1, 4096, 1024, 1024, 1024, 4096] + - [7, 11654.0] + - - [1024, 3397, 1, 4096, 1024, 1024, 1024, 4096] + - [39, 11004.0] + - - [4096, 3572, 1, 1024, 4096, 4096, 4096, 1024] + - [8, 12366.0] + - - [4096, 3389, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12079.0] + - - [4096, 3438, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12264.0] + - - [1024, 3640, 1, 33708, 1024, 1024, 1024, 33708] + - [6, 11671.0] + - - [1024, 3995, 1, 33708, 1024, 1024, 1024, 33708] + - [1, 11789.0] + - - [1024, 3165, 1, 4096, 1024, 1024, 1024, 4096] + - [8, 12112.0] + - - [4096, 3543, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12281.0] + - - [4096, 3352, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 11970.0] + - - [1024, 3359, 1, 4096, 1024, 1024, 1024, 4096] + - [4, 11592.0] + - - [1024, 3470, 1, 4096, 1024, 1024, 1024, 4096] + - [19, 10698.0] + - - [1024, 3392, 1, 4096, 1024, 1024, 1024, 4096] + - [39, 11013.0] + - - [4096, 3137, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12219.0] + - - [4096, 3506, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12219.0] + - - [1024, 3095, 1, 4096, 1024, 1024, 1024, 4096] + - [4, 11699.0] + - - [1024, 3859, 1, 4096, 1024, 1024, 1024, 4096] + - [22, 11462.0] + - - [4096, 3369, 1, 1024, 4096, 4096, 4096, 1024] + - [8, 11971.0] + - - [1024, 3435, 1, 4096, 1024, 1024, 1024, 4096] + - [4, 11867.0] + - - [1024, 3354, 1, 4096, 1024, 1024, 1024, 4096] + - [39, 10917.0] + - - [1024, 3055, 1, 4096, 1024, 1024, 1024, 4096] + - [27, 11479.0] + - - [4096, 3523, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12213.0] + - - [4096, 3380, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12032.0] + - - [1024, 3233, 1, 4096, 1024, 1024, 1024, 4096] + - [4, 11195.0] + - - [4096, 3221, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 11971.0] + - - [4096, 3270, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12174.0] + - - [4096, 3593, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 11990.0] + - - [1024, 3358, 1, 4096, 1024, 1024, 1024, 4096] + - [4, 11599.0] + - - [1024, 3540, 1, 4096, 1024, 1024, 1024, 4096] + - [8, 11411.0] + - - [4096, 3502, 1, 1024, 4096, 4096, 4096, 1024] + - [8, 12209.0] + - - [4096, 2505, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12214.0] + - - [4096, 3397, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12158.0] + - - [1024, 3300, 1, 4096, 1024, 1024, 1024, 4096] + - [8, 11513.0] + - - [4096, 3095, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12045.0] + - - [1024, 3182, 1, 4096, 1024, 1024, 1024, 4096] + - [4, 12024.0] + - - [1024, 3299, 1, 4096, 1024, 1024, 1024, 4096] + - [19, 11080.0] + - - [1024, 3276, 1, 4096, 1024, 1024, 1024, 4096] + - [39, 11066.0] + - - [1024, 3360, 1, 4096, 1024, 1024, 1024, 4096] + - [39, 10991.0] + - - [4096, 3360, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 11942.0] + - - [4096, 2918, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12320.0] + - - [1024, 3939, 1, 33708, 1024, 1024, 1024, 33708] + - [1, 11635.0] + - - [4096, 3314, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12328.0] + - - [1024, 3319, 1, 4096, 1024, 1024, 1024, 4096] + - [7, 11541.0] + - - [1024, 3942, 1, 1024, 1024, 1024, 1024, 1024] + - [21, 11082.0] + - - [1024, 3465, 1, 4096, 1024, 1024, 1024, 4096] + - [39, 10637.0] + - - [4096, 3546, 1, 1024, 4096, 4096, 4096, 1024] + - [8, 12271.0] + - - [1024, 3403, 1, 4096, 1024, 1024, 1024, 4096] + - [8, 11828.0] + - - [1024, 3948, 1, 1024, 1024, 1024, 1024, 1024] + - [8, 11201.0] + - - [4096, 3441, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12251.0] + - - [1024, 3139, 1, 4096, 1024, 1024, 1024, 4096] + - [4, 11885.0] + - - [1024, 3563, 1, 4096, 1024, 1024, 1024, 4096] + - [39, 10787.0] + - - [1024, 3508, 1, 4096, 1024, 1024, 1024, 4096] + - [39, 10676.0] + - - [1024, 3975, 1, 33708, 1024, 1024, 1024, 33708] + - [1, 11729.0] + - - [1024, 3446, 1, 4096, 1024, 1024, 1024, 4096] + - [8, 11953.0] + - - [1024, 3529, 1, 4096, 1024, 1024, 1024, 4096] + - [39, 10818.0] + - - [4096, 3461, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12009.0] + - - [1024, 3574, 1, 4096, 1024, 1024, 1024, 4096] + - [8, 11524.0] + - - [1024, 3101, 1, 4096, 1024, 1024, 1024, 4096] + - [39, 11505.0] + - - [1024, 3927, 1, 1024, 1024, 1024, 1024, 1024] + - [8, 11129.0] + - - [4096, 3224, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 11980.0] + - - [4096, 3437, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12250.0] + - - [4096, 3900, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12242.0] + - - [1024, 3495, 1, 4096, 1024, 1024, 1024, 4096] + - [7, 11613.0] + - - [1024, 3977, 1, 33708, 1024, 1024, 1024, 33708] + - [1, 11734.0] + - - [1024, 3328, 1, 4096, 1024, 1024, 1024, 4096] + - [8, 11575.0] + - - [4096, 3168, 1, 1024, 4096, 4096, 4096, 1024] + - [8, 12324.0] + - - [1024, 4026, 1, 33708, 1024, 1024, 1024, 33708] + - [4, 11928.0] + - - [1024, 3292, 1, 4096, 1024, 1024, 1024, 4096] + - [8, 11478.0] + - - [1024, 3294, 1, 4096, 1024, 1024, 1024, 4096] + - [25, 10979.0] + - - [4096, 3335, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 11835.0] + - - [4096, 3400, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12111.0] + - - [1024, 3287, 1, 4096, 1024, 1024, 1024, 4096] + - [4, 11373.0] + - - [1024, 3910, 1, 4096, 1024, 1024, 1024, 4096] + - [22, 11638.0] + - - [1024, 3780, 1, 1024, 1024, 1024, 1024, 1024] + - [8, 11731.0] + - - [4096, 3098, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12063.0] + - - [1024, 3584, 1, 33708, 1024, 1024, 1024, 33708] + - [6, 11532.0] + - - [1024, 3371, 1, 4096, 1024, 1024, 1024, 4096] + - [8, 11699.0] + - - [1024, 3546, 1, 4096, 1024, 1024, 1024, 4096] + - [25, 10700.0] + - - [1024, 4012, 1, 1024, 1024, 1024, 1024, 1024] + - [22, 11176.0] + - - [4096, 3505, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12153.0] + - - [4096, 3554, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12339.0] + - - [4096, 3063, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12273.0] + - - [1024, 3900, 1, 33708, 1024, 1024, 1024, 33708] + - [1, 11515.0] + - - [1024, 3345, 1, 4096, 1024, 1024, 1024, 4096] + - [8, 11623.0] + - - [1024, 3357, 1, 4096, 1024, 1024, 1024, 4096] + - [39, 10949.0] + - - [1024, 3282, 1, 4096, 1024, 1024, 1024, 4096] + - [39, 11047.0] + - - [4096, 3484, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12052.0] + - - [1024, 3557, 1, 4096, 1024, 1024, 1024, 4096] + - [8, 11469.0] + - - [1024, 3476, 1, 4096, 1024, 1024, 1024, 4096] + - [39, 10664.0] + - - [1024, 3751, 1, 1024, 1024, 1024, 1024, 1024] + - [39, 11256.0] + - - [4096, 3379, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12008.0] + - - [4096, 3428, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12212.0] + - - [4096, 3126, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12222.0] + - - [1024, 3325, 1, 4096, 1024, 1024, 1024, 4096] + - [4, 11472.0] + - - [4096, 3501, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12124.0] + - - [4096, 3358, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 11966.0] + - - [1024, 3441, 1, 4096, 1024, 1024, 1024, 4096] + - [4, 11831.0] + - - [1024, 3552, 1, 4096, 1024, 1024, 1024, 4096] + - [25, 10635.0] + - - [4096, 3232, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12000.0] + - - [1024, 3412, 1, 4096, 1024, 1024, 1024, 4096] + - [4, 11750.0] + - - [1024, 3372, 1, 4096, 1024, 1024, 1024, 4096] + - [39, 11042.0] + - - [1024, 3585, 1, 4096, 1024, 1024, 1024, 4096] + - [25, 10732.0] + - - [4096, 3143, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12221.0] + - - [4096, 3464, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12068.0] + - - [1024, 3145, 1, 4096, 1024, 1024, 1024, 4096] + - [4, 11840.0] + - - [4096, 3375, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 11972.0] + - - [4096, 2917, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12299.0] + - - [4096, 3978, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12012.0] + - - [1024, 2765, 1, 4096, 1024, 1024, 1024, 4096] + - [8, 11720.0] + - - [1024, 3452, 1, 4096, 1024, 1024, 1024, 4096] + - [4, 11939.0] + - - [4096, 3584, 1, 1024, 4096, 4096, 4096, 1024] + - [8, 12417.0] + - - [4096, 3545, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12349.0] + - - [1024, 3352, 1, 4096, 1024, 1024, 1024, 4096] + - [4, 11577.0] + - - [4096, 3292, 1, 1024, 4096, 4096, 4096, 1024] + - [8, 12236.0] + - - [1024, 3525, 1, 4096, 1024, 1024, 1024, 4096] + - [8, 11387.0] + - - [1024, 3266, 1, 4096, 1024, 1024, 1024, 4096] + - [39, 11070.0] + - - [1024, 3382, 1, 4096, 1024, 1024, 1024, 4096] + - [39, 10953.0] + - - [4096, 3492, 1, 1024, 4096, 4096, 4096, 1024] + - [8, 12103.0] + - - [4096, 3419, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12202.0] + - - [1024, 3796, 1, 33708, 1024, 1024, 1024, 33708] + - [6, 12162.0] + - - [1024, 3293, 1, 4096, 1024, 1024, 1024, 4096] + - [8, 11479.0] + - - [4096, 3796, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12306.0] + - - [1024, 3487, 1, 4096, 1024, 1024, 1024, 4096] + - [3, 11187.0] + - - [4096, 3166, 1, 1024, 4096, 4096, 4096, 1024] + - [8, 12299.0] + - - [1024, 3409, 1, 4096, 1024, 1024, 1024, 4096] + - [4, 11780.0] + - - [1024, 3520, 1, 4096, 1024, 1024, 1024, 4096] + - [19, 10757.0] + - - [1024, 3573, 1, 4096, 1024, 1024, 1024, 4096] + - [19, 10872.0] + - - [4096, 3366, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 11983.0] + - - [4096, 3720, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12149.0] + - - [4096, 3207, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 11935.0] + - - [4096, 3272, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12217.0] + - - [1024, 3390, 1, 4096, 1024, 1024, 1024, 4096] + - [8, 11767.0] + - - [4096, 3183, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12395.0] + - - [4096, 3536, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12318.0] + - - [4096, 3563, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12395.0] + - - [1024, 3482, 1, 4096, 1024, 1024, 1024, 4096] + - [7, 11568.0] + - - [4096, 3447, 1, 1024, 4096, 4096, 4096, 1024] + - [8, 12247.0] + - - [4096, 3955, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12340.0] + - - [4096, 4005, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12057.0] + - - [1024, 3493, 1, 4096, 1024, 1024, 1024, 4096] + - [3, 11204.0] + - - [4096, 3410, 1, 1024, 4096, 4096, 4096, 1024] + - [8, 12126.0] + - - [1024, 3422, 1, 4096, 1024, 1024, 1024, 4096] + - [8, 11894.0] + - - [1024, 3350, 1, 4096, 1024, 1024, 1024, 4096] + - [39, 10866.0] + - - [4096, 3300, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12234.0] + - - [4096, 3910, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12256.0] + - - [1024, 3489, 1, 4096, 1024, 1024, 1024, 4096] + - [0, 11229.0] + - - [4096, 3483, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12066.0] + - - [4096, 3532, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12291.0] + - - [4096, 3230, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12047.0] + - - [4096, 3427, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12229.0] + - - [1024, 3377, 1, 4096, 1024, 1024, 1024, 4096] + - [4, 11674.0] + - - [1024, 3488, 1, 4096, 1024, 1024, 1024, 4096] + - [39, 10679.0] + - - [1024, 3616, 1, 4096, 1024, 1024, 1024, 4096] + - [25, 10728.0] + - - [1024, 3426, 1, 4096, 1024, 1024, 1024, 4096] + - [39, 11085.0] + - - [4096, 3357, 1, 1024, 4096, 4096, 4096, 1024] + - [8, 11952.0] + - - [4096, 3406, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12146.0] + - - [1024, 3046, 1, 4096, 1024, 1024, 1024, 4096] + - [12, 11625.0] + - - [1024, 3272, 1, 4096, 1024, 1024, 1024, 4096] + - [22, 11472.0] + - - [1024, 3256, 1, 4096, 1024, 1024, 1024, 4096] + - [15, 11198.0] + - - [4096, 3247, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12068.0] + - - [4096, 3088, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12099.0] + - - [1024, 3531, 1, 4096, 1024, 1024, 1024, 4096] + - [6, 11269.0] + - - [4096, 3511, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12185.0] + - - [1024, 3720, 1, 33708, 1024, 1024, 1024, 33708] + - [6, 11924.0] + - - [1024, 3267, 1, 4096, 1024, 1024, 1024, 4096] + - [8, 11431.0] + - - [1024, 3270, 1, 4096, 1024, 1024, 1024, 4096] + - [25, 11006.0] + - - [1024, 3461, 1, 4096, 1024, 1024, 1024, 4096] + - [39, 10587.0] + - - [4096, 3474, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12039.0] + - - [4096, 2984, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 11981.0] + - - [1024, 3399, 1, 4096, 1024, 1024, 1024, 4096] + - [8, 11809.0] + - - [4096, 3574, 1, 1024, 4096, 4096, 4096, 1024] + - [8, 12383.0] + - - [1024, 3876, 1, 1024, 1024, 1024, 1024, 1024] + - [8, 11234.0] + - - [4096, 3337, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 11919.0] + - - [4096, 3450, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12295.0] + - - [1024, 3720, 1, 1024, 1024, 1024, 1024, 1024] + - [8, 11533.0] + - - [1024, 4059, 1, 1024, 1024, 1024, 1024, 1024] + - [22, 11715.0] + - - [4096, 3291, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12287.0] + - - [4096, 3995, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12075.0] + - - [4096, 3491, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12109.0] + - - [4096, 3348, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 11959.0] + - - [4096, 3925, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12280.0] + - - [4096, 3894, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12190.0] + - - [1024, 3456, 1, 4096, 1024, 1024, 1024, 4096] + - [8, 11969.0] + - - [1024, 3394, 1, 4096, 1024, 1024, 1024, 4096] + - [25, 11115.0] + - - [4096, 3165, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12238.0] + - - [4096, 3470, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12078.0] + - - [1024, 3014, 1, 4096, 1024, 1024, 1024, 4096] + - [6, 11478.0] + - - [1024, 3375, 1, 4096, 1024, 1024, 1024, 4096] + - [25, 10918.0] + - - [4096, 3859, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12041.0] + - - [4096, 3365, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12001.0] + - - [1024, 3162, 1, 4096, 1024, 1024, 1024, 4096] + - [4, 11933.0] + - - [1024, 3840, 1, 33708, 1024, 1024, 1024, 33708] + - [6, 12302.0] + - - [1024, 3437, 1, 4096, 1024, 1024, 1024, 4096] + - [8, 11930.0] + - - [4096, 3319, 1, 1024, 4096, 4096, 4096, 1024] + - [8, 12312.0] + - - [1024, 3320, 1, 4096, 1024, 1024, 1024, 4096] + - [8, 11559.0] + - - [4096, 3328, 1, 1024, 4096, 4096, 4096, 1024] + - [8, 12363.0] + - - [1024, 3235, 1, 4096, 1024, 1024, 1024, 4096] + - [4, 11198.0] + - - [4096, 3282, 1, 1024, 4096, 4096, 4096, 1024] + - [8, 12189.0] + - - [1024, 3367, 1, 4096, 1024, 1024, 1024, 4096] + - [8, 11702.0] + - - [1024, 3542, 1, 4096, 1024, 1024, 1024, 4096] + - [25, 10838.0] + - - [4096, 3145, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12234.0] + - - [4096, 3514, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12233.0] + - - [1024, 3432, 1, 4096, 1024, 1024, 1024, 4096] + - [8, 11895.0] + - - [4096, 3409, 1, 1024, 4096, 4096, 4096, 1024] + - [22, 12131.0] + - - [1024, 4012, 1, 33708, 1024, 1024, 1024, 33708] + - [16, 11847.0] + - - [4096, 3876, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12160.0] + - - [4096, 3299, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12309.0] + - - [1024, 3168, 1, 4096, 1024, 1024, 1024, 4096] + - [8, 12141.0] + - - [4096, 3681, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12217.0] + - - [4096, 3531, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12287.0] + - - [4096, 3388, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12099.0] + - - [1024, 3720, 1, 4096, 1024, 1024, 1024, 4096] + - [27, 11875.0] + - - [1024, 3332, 1, 4096, 1024, 1024, 1024, 4096] + - [39, 10869.0] + - - [1024, 3273, 1, 4096, 1024, 1024, 1024, 4096] + - [25, 11032.0] + - - [1024, 2935, 1, 4096, 1024, 1024, 1024, 4096] + - [27, 11099.0] + - - [1024, 3467, 1, 4096, 1024, 1024, 1024, 4096] + - [39, 10682.0] + - - [4096, 3542, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12268.0] + - - [1024, 3130, 1, 4096, 1024, 1024, 1024, 4096] + - [4, 11813.0] + - - [1024, 3405, 1, 4096, 1024, 1024, 1024, 4096] + - [25, 11008.0] + - - [1024, 3960, 1, 1024, 1024, 1024, 1024, 1024] + - [21, 11181.0] + - - [4096, 3405, 1, 1024, 4096, 4096, 4096, 1024] + - [8, 12113.0] + - - [1024, 10080, 1, 1024, 1024, 1024, 1024, 1024] + - [8, 12193.0] + - - [36548, 1216, 1, 1024, 36548, 36548, 36548, 1024] + - [22, 11927.0] + - - [1024, 2592, 1, 1024, 1024, 1024, 1024, 1024] + - [7, 10916.0] + - - [1024, 1568, 1, 1024, 1024, 1024, 1024, 1024] + - [7, 10877.0] + - - [1024, 4445, 1, 1024, 1024, 1024, 1024, 1024] + - [8, 11854.0] + - - [1024, 6272, 1, 1024, 1024, 1024, 1024, 1024] + - [4, 12002.0] + - - [36548, 3584, 1, 1024, 36548, 36548, 36548, 1024] + - [22, 12437.0] + - - [1024, 1827, 1, 1024, 1024, 1024, 1024, 1024] + - [22, 10706.0] + - - [1024, 3220, 1, 1024, 1024, 1024, 1024, 1024] + - [21, 11110.0] + - - [1024, 1856, 1, 1024, 1024, 1024, 1024, 1024] + - [8, 11073.0] + - - [1024, 1760, 1, 1024, 1024, 1024, 1024, 1024] + - [8, 10206.0] + - - [36548, 4235, 1, 1024, 36548, 36548, 36548, 1024] + - [22, 12137.0] + - - [1024, 1984, 1, 1024, 1024, 1024, 1024, 1024] + - [7, 10811.0] + - - [1024, 14720, 1, 1024, 1024, 1024, 1024, 1024] + - [25, 12332.0] + - - [1024, 1152, 1, 1024, 1024, 1024, 1024, 1024] + - [8, 10214.0] + - - [36548, 14976, 1, 1024, 36548, 36548, 36548, 1024] + - [11, 12541.0] + - - [36548, 1152, 1, 1024, 36548, 36548, 36548, 1024] + - [22, 12470.0] + - - [1024, 3392, 1, 1024, 1024, 1024, 1024, 1024] + - [8, 11488.0] + - - [1024, 1408, 1, 1024, 1024, 1024, 1024, 1024] + - [21, 10805.0] + - - [1024, 2080, 1, 1024, 1024, 1024, 1024, 1024] + - [8, 10550.0] + - - [1024, 1824, 1, 1024, 1024, 1024, 1024, 1024] + - [8, 10640.0] + - - [36548, 2432, 1, 1024, 36548, 36548, 36548, 1024] + - [22, 12556.0] + - - [36548, 1827, 1, 1024, 36548, 36548, 36548, 1024] + - [22, 11950.0] + - - [1024, 10176, 1, 1024, 1024, 1024, 1024, 1024] + - [22, 12366.0] + - - [1024, 1952, 1, 1024, 1024, 1024, 1024, 1024] + - [7, 10662.0] + - - [1024, 17024, 1, 1024, 1024, 1024, 1024, 1024] + - [25, 12260.0] + - - [1024, 1472, 1, 1024, 1024, 1024, 1024, 1024] + - [21, 10319.0] + - - [36548, 4459, 1, 1024, 36548, 36548, 36548, 1024] + - [22, 12393.0] + - - [1024, 3712, 1, 1024, 1024, 1024, 1024, 1024] + - [8, 11564.0] + - - [36548, 12928, 1, 1024, 36548, 36548, 36548, 1024] + - [25, 12319.0] + - - [1024, 1632, 1, 1024, 1024, 1024, 1024, 1024] + - [35, 9839.0] + - - [1024, 1696, 1, 1024, 1024, 1024, 1024, 1024] + - [21, 10527.0] + - - [36548, 1764, 1, 1024, 36548, 36548, 36548, 1024] + - [22, 12336.0] + - - [1024, 2944, 1, 1024, 1024, 1024, 1024, 1024] + - [7, 11165.0] + - - [36548, 14080, 1, 1024, 36548, 36548, 36548, 1024] + - [36, 12589.0] + - - [1024, 1280, 1, 1024, 1024, 1024, 1024, 1024] + - [9, 10358.0] + - - [1024, 13440, 1, 1024, 1024, 1024, 1024, 1024] + - [11, 12321.0] + - - [36548, 9120, 1, 1024, 36548, 36548, 36548, 1024] + - [22, 12332.0] + - - [1024, 3008, 1, 1024, 1024, 1024, 1024, 1024] + - [7, 11011.0] + - - [1024, 2560, 1, 1024, 1024, 1024, 1024, 1024] + - [8, 11434.0] + - - [1024, 2208, 1, 1024, 1024, 1024, 1024, 1024] + - [21, 10666.0] + - - [1024, 1920, 1, 1024, 1024, 1024, 1024, 1024] + - [8, 11221.0] + - - [36548, 2496, 1, 1024, 36548, 36548, 36548, 1024] + - [22, 12265.0] + - - [1024, 2016, 1, 1024, 1024, 1024, 1024, 1024] + - [21, 10936.0] + - - [1024, 1184, 1, 1024, 1024, 1024, 1024, 1024] + - [8, 10325.0] + - - [1024, 1664, 1, 1024, 1024, 1024, 1024, 1024] + - [7, 10386.0] + - - [1024, 11424, 1, 1024, 1024, 1024, 1024, 1024] + - [8, 12222.0] + - - [1024, 1216, 1, 1024, 1024, 1024, 1024, 1024] + - [9, 10529.0] + - - [36548, 3185, 1, 1024, 36548, 36548, 36548, 1024] + - [39, 12430.0] + - - [36548, 9216, 1, 1024, 36548, 36548, 36548, 1024] + - [22, 12453.0] + - - [1024, 3200, 1, 1024, 1024, 1024, 1024, 1024] + - [4, 11708.0] + - - [1024, 2656, 1, 1024, 1024, 1024, 1024, 1024] + - [21, 11083.0] + - - [1024, 2368, 1, 1024, 1024, 1024, 1024, 1024] + - [7, 11234.0] + - - [1024, 4459, 1, 1024, 1024, 1024, 1024, 1024] + - [11, 11663.0] + - - [1024, 3808, 1, 1024, 1024, 1024, 1024, 1024] + - [8, 11779.0] + - - [1024, 2336, 1, 1024, 1024, 1024, 1024, 1024] + - [7, 11070.0] + - - [1024, 2304, 1, 1024, 1024, 1024, 1024, 1024] + - [7, 10708.0] + - - [1024, 1560, 1, 1024, 1024, 1024, 1024, 1024] + - [21, 10511.0] + - - [1024, 2496, 1, 1024, 1024, 1024, 1024, 1024] + - [25, 11083.0] + - - [1024, 1504, 1, 1024, 1024, 1024, 1024, 1024] + - [8, 10428.0] + - - [1024, 3232, 1, 1024, 1024, 1024, 1024, 1024] + - [21, 10856.0] + - - [36548, 1015, 1, 1024, 36548, 36548, 36548, 1024] + - [36, 12354.0] + - - [1024, 2000, 1, 1024, 1024, 1024, 1024, 1024] + - [7, 10891.0] + - - [36548, 243, 1, 1024, 36548, 36548, 36548, 1024] + - [4, 11487.0] + - - [1024, 13184, 1, 1024, 1024, 1024, 1024, 1024] + - [8, 12312.0] + - - [1024, 2688, 1, 1024, 1024, 1024, 1024, 1024] + - [7, 11354.0] + - - [36548, 950, 1, 1024, 36548, 36548, 36548, 1024] + - [15, 11734.0] + - - [1024, 1764, 1, 1024, 1024, 1024, 1024, 1024] + - [8, 10491.0] + - - [1024, 1376, 1, 1024, 1024, 1024, 1024, 1024] + - [21, 10466.0] + - - [36548, 774, 1, 1024, 36548, 36548, 36548, 1024] + - [15, 11027.0] + - - [1024, 4256, 1, 1024, 1024, 1024, 1024, 1024] + - [7, 11482.0] + - - [36548, 3712, 1, 1024, 36548, 36548, 36548, 1024] + - [22, 12427.0] + - - [1024, 3360, 1, 1024, 1024, 1024, 1024, 1024] + - [22, 11159.0] + - - [1024, 2784, 1, 1024, 1024, 1024, 1024, 1024] + - [8, 11311.0] + - - [1024, 4992, 1, 1024, 1024, 1024, 1024, 1024] + - [4, 11891.0] + - - [36548, 1102, 1, 1024, 36548, 36548, 36548, 1024] + - [22, 11952.0] + - - [1024, 1536, 1, 1024, 1024, 1024, 1024, 1024] + - [8, 11021.0] + - - [1024, 2720, 1, 1024, 1024, 1024, 1024, 1024] + - [8, 10987.0] + - - [1024, 2752, 1, 1024, 1024, 1024, 1024, 1024] + - [8, 11187.0] + - - [1024, 2816, 1, 1024, 1024, 1024, 1024, 1024] + - [11, 11588.0] + - - [1024, 2624, 1, 1024, 1024, 1024, 1024, 1024] + - [7, 11073.0] + - - [1024, 2144, 1, 1024, 1024, 1024, 1024, 1024] + - [8, 10849.0] + - - [36548, 1131, 1, 1024, 36548, 36548, 36548, 1024] + - [4, 12254.0] + - - [1024, 3296, 1, 1024, 1024, 1024, 1024, 1024] + - [21, 11279.0] + - - [36548, 4992, 1, 1024, 36548, 36548, 36548, 1024] + - [39, 12501.0] + - - [1024, 1344, 1, 1024, 1024, 1024, 1024, 1024] + - [7, 10294.0] + - - [36548, 2401, 1, 1024, 36548, 36548, 36548, 1024] + - [36, 12383.0] + - - [1024, 15744, 1, 1024, 1024, 1024, 1024, 1024] + - [8, 12352.0] + - - [1024, 15232, 1, 1024, 1024, 1024, 1024, 1024] + - [22, 12369.0] + - - [1024, 1888, 1, 1024, 1024, 1024, 1024, 1024] + - [22, 11328.0] + - - [1024, 1792, 1, 1024, 1024, 1024, 1024, 1024] + - [8, 10884.0] + - - [36548, 1073, 1, 1024, 36548, 36548, 36548, 1024] + - [22, 11647.0] + - - [36548, 15488, 1, 1024, 36548, 36548, 36548, 1024] + - [22, 12326.0] + - - [1024, 2464, 1, 1024, 1024, 1024, 1024, 1024] + - [22, 10458.0] + - - [1024, 2272, 1, 1024, 1024, 1024, 1024, 1024] + - [21, 10539.0] + - - [1024, 2432, 1, 1024, 1024, 1024, 1024, 1024] + - [4, 10978.0] + - - [1024, 3936, 1, 1024, 1024, 1024, 1024, 1024] + - [21, 11179.0] + - - [36548, 13824, 1, 1024, 36548, 36548, 36548, 1024] + - [32, 12553.0] + - - [1024, 2401, 1, 1024, 1024, 1024, 1024, 1024] + - [22, 11120.0] + - - [1024, 2176, 1, 1024, 1024, 1024, 1024, 1024] + - [8, 11532.0] + - - [1024, 2240, 1, 1024, 1024, 1024, 1024, 1024] + - [21, 11453.0] + - - [1024, 1728, 1, 1024, 1024, 1024, 1024, 1024] + - [21, 11157.0] + - - [1024, 2528, 1, 1024, 1024, 1024, 1024, 1024] + - [8, 11712.0] + - - [1024, 2400, 1, 1024, 1024, 1024, 1024, 1024] + - [8, 11144.0] + - - [1024, 1440, 1, 1024, 1024, 1024, 1024, 1024] + - [8, 10379.0] + - - [1024, 2912, 1, 1024, 1024, 1024, 1024, 1024] + - [35, 11104.0] + - - [1024, 2880, 1, 1024, 1024, 1024, 1024, 1024] + - [21, 11556.0] + - - [1024, 4064, 1, 1024, 1024, 1024, 1024, 1024] + - [8, 11894.0] + - - [1024, 4655, 1, 1024, 1024, 1024, 1024, 1024] + - [8, 11880.0] + - - [36548, 6272, 1, 1024, 36548, 36548, 36548, 1024] + - [22, 12592.0] + - - [768, 2048, 1, 3072, 768, 768, 768, 3072] + - [22, 11393.0] + - - [768, 4096, 1, 3072, 768, 768, 768, 3072] + - [22, 11836.0] + - - [6272, 256, 1, 528, 6272, 6272, 6272, 528] + - [1, 11247.0] + - - [3136, 2048, 1, 1024, 3136, 3136, 3136, 1024] + - [8, 12163.0] + - - [50176, 128, 1, 256, 50176, 50176, 50176, 256] + - [36, 11684.0] + - - [12544, 1024, 1, 256, 12544, 12544, 12544, 256] + - [8, 12039.0] + - - [12544, 256, 1, 1024, 12544, 12544, 12544, 1024] + - [1, 11915.0] + - - [3136, 512, 1, 1024, 3136, 3136, 3136, 1024] + - [1, 11171.0] + - - [3136, 2048, 1, 512, 3136, 3136, 3136, 512] + - [8, 11999.0] + - - [289, 384, 32, 1024, 289, 289, 289, 1024] + - [36, 8760.0] + - - [4096, 512, 1, 4096, 4096, 4096, 4096, 4096] + - [0, 11433.0] + - - [50176, 512, 1, 256, 50176, 50176, 50176, 256] + - [11, 12227.0] + - - [12544, 1024, 1, 512, 12544, 12544, 12544, 512] + - [22, 12227.0] + - - [12544, 256, 1, 512, 12544, 12544, 12544, 512] + - [1, 11756.0] + - - [784, 128, 32, 256, 784, 784, 784, 256] + - [21, 9669.0] + - - [4096, 512, 1, 9216, 4096, 4096, 4096, 9216] + - [21, 11288.0] + - - [3136, 512, 1, 2048, 3136, 3136, 3136, 2048] + - [1, 11422.0] + - - [1225, 192, 32, 384, 1225, 1225, 1225, 384] + - [28, 11130.0] + - - [8192, 320, 1, 1280, 8192, 8192, 8192, 1280] + - [28, 11575.0] + - - [8192, 320, 1, 2048, 8192, 8192, 8192, 2048] + - [28, 11612.0] + - - [8192, 384, 1, 1280, 8192, 8192, 8192, 1280] + - [1, 11749.0] + - - [8192, 384, 1, 2048, 8192, 8192, 8192, 2048] + - [1, 11802.0] + - - [8192, 448, 1, 2048, 8192, 8192, 8192, 2048] + - [15, 11448.0] + - - [8192, 448, 1, 1280, 8192, 8192, 8192, 1280] + - [15, 11431.0] + - - [256, 6400, 1, 4096, 256, 256, 256, 4096] + - [39, 11692.0] + - - [512, 3433, 1, 2048, 512, 512, 512, 2048] + - [35, 11135.0] + - - [512, 3439, 1, 2048, 512, 512, 512, 2048] + - [35, 11174.0] + - - [512, 3461, 1, 2048, 512, 512, 512, 2048] + - [7, 11246.0] + - - [512, 3479, 1, 2048, 512, 512, 512, 2048] + - [7, 11285.0] + - - [512, 3494, 1, 2048, 512, 512, 512, 2048] + - [35, 11350.0] + - - [512, 3520, 1, 2048, 512, 512, 512, 2048] + - [7, 11446.0] + - - [512, 3530, 1, 2048, 512, 512, 512, 2048] + - [8, 10999.0] + - - [512, 3541, 1, 2048, 512, 512, 512, 2048] + - [8, 11034.0] + - - [512, 3564, 1, 2048, 512, 512, 512, 2048] + - [22, 11099.0] + - - [512, 3776, 1, 2048, 512, 512, 512, 2048] + - [8, 11720.0] + - - [512, 3859, 1, 512, 512, 512, 512, 512] + - [35, 10566.0] + - - [512, 3925, 1, 2048, 512, 512, 512, 2048] + - [7, 10885.0] + - - [512, 3944, 1, 2048, 512, 512, 512, 2048] + - [7, 10947.0] + - - [512, 3955, 1, 2048, 512, 512, 512, 2048] + - [35, 10987.0] + - - [512, 3969, 1, 2048, 512, 512, 512, 2048] + - [35, 11030.0] + - - [512, 3976, 1, 2048, 512, 512, 512, 2048] + - [35, 11031.0] + - - [2048, 1232, 1, 512, 2048, 2048, 2048, 512] + - [1, 11049.0] + - - [2048, 3165, 1, 512, 2048, 2048, 2048, 512] + - [1, 11978.0] + - - [512, 2387, 1, 512, 512, 512, 512, 512] + - [23, 10224.0] + - - [512, 2418, 1, 512, 512, 512, 512, 512] + - [35, 10270.0] + - - [512, 2418, 1, 2048, 512, 512, 512, 2048] + - [27, 11021.0] + - - [512, 2496, 1, 512, 512, 512, 512, 512] + - [21, 10776.0] + - - [512, 2496, 1, 2048, 512, 512, 512, 2048] + - [27, 11350.0] + - - [512, 2790, 1, 2048, 512, 512, 512, 2048] + - [7, 10931.0] + - - [512, 2864, 1, 2048, 512, 512, 512, 2048] + - [35, 11201.0] + - - [512, 3092, 1, 2048, 512, 512, 512, 2048] + - [36, 11381.0] + - - [512, 3113, 1, 2048, 512, 512, 512, 2048] + - [8, 11456.0] + - - [512, 3137, 1, 2048, 512, 512, 512, 2048] + - [8, 11577.0] + - - [512, 3165, 1, 2048, 512, 512, 512, 2048] + - [22, 11617.0] + - - [512, 3166, 1, 2048, 512, 512, 512, 2048] + - [36, 11630.0] + - - [512, 3194, 1, 2048, 512, 512, 512, 2048] + - [8, 11728.0] + - - [512, 3219, 1, 2048, 512, 512, 512, 2048] + - [7, 10515.0] + - - [512, 3222, 1, 2048, 512, 512, 512, 2048] + - [7, 10547.0] + - - [512, 3234, 1, 2048, 512, 512, 512, 2048] + - [21, 10584.0] + - - [512, 3237, 1, 2048, 512, 512, 512, 2048] + - [7, 10594.0] + - - [512, 3242, 1, 2048, 512, 512, 512, 2048] + - [7, 10604.0] + - - [512, 3246, 1, 2048, 512, 512, 512, 2048] + - [35, 10592.0] + - - [512, 3249, 1, 2048, 512, 512, 512, 2048] + - [35, 10642.0] + - - [512, 3251, 1, 2048, 512, 512, 512, 2048] + - [35, 10614.0] + - - [512, 3257, 1, 2048, 512, 512, 512, 2048] + - [21, 10661.0] + - - [512, 3262, 1, 2048, 512, 512, 512, 2048] + - [7, 10660.0] + - - [512, 3268, 1, 2048, 512, 512, 512, 2048] + - [35, 10681.0] + - - [512, 3282, 1, 2048, 512, 512, 512, 2048] + - [21, 10757.0] + - - [512, 3286, 1, 2048, 512, 512, 512, 2048] + - [21, 10762.0] + - - [512, 3287, 1, 2048, 512, 512, 512, 2048] + - [35, 10778.0] + - - [512, 3293, 1, 2048, 512, 512, 512, 2048] + - [21, 10762.0] + - - [512, 3297, 1, 2048, 512, 512, 512, 2048] + - [35, 10769.0] + - - [512, 3307, 1, 2048, 512, 512, 512, 2048] + - [35, 10821.0] + - - [512, 3314, 1, 2048, 512, 512, 512, 2048] + - [21, 10847.0] + - - [512, 3315, 1, 2048, 512, 512, 512, 2048] + - [7, 10845.0] + - - [512, 3319, 1, 2048, 512, 512, 512, 2048] + - [35, 10852.0] + - - [512, 3322, 1, 2048, 512, 512, 512, 2048] + - [21, 10890.0] + - - [512, 3323, 1, 2048, 512, 512, 512, 2048] + - [35, 10893.0] + - - [512, 3324, 1, 2048, 512, 512, 512, 2048] + - [35, 10853.0] + - - [512, 3325, 1, 2048, 512, 512, 512, 2048] + - [35, 10885.0] + - - [512, 3327, 1, 2048, 512, 512, 512, 2048] + - [7, 10832.0] + - - [512, 3329, 1, 2048, 512, 512, 512, 2048] + - [21, 10847.0] + - - [512, 3332, 1, 2048, 512, 512, 512, 2048] + - [35, 10858.0] + - - [512, 3336, 1, 2048, 512, 512, 512, 2048] + - [21, 10876.0] + - - [512, 3339, 1, 2048, 512, 512, 512, 2048] + - [7, 10896.0] + - - [512, 3342, 1, 2048, 512, 512, 512, 2048] + - [21, 10920.0] + - - [512, 3344, 1, 2048, 512, 512, 512, 2048] + - [21, 10919.0] + - - [512, 3358, 1, 2048, 512, 512, 512, 2048] + - [35, 10954.0] + - - [512, 3360, 1, 2048, 512, 512, 512, 2048] + - [7, 10963.0] + - - [512, 3364, 1, 2048, 512, 512, 512, 2048] + - [7, 10939.0] + - - [512, 3365, 1, 2048, 512, 512, 512, 2048] + - [21, 10974.0] + - - [512, 3369, 1, 2048, 512, 512, 512, 2048] + - [35, 10991.0] + - - [512, 3370, 1, 2048, 512, 512, 512, 2048] + - [21, 10974.0] + - - [512, 3371, 1, 2048, 512, 512, 512, 2048] + - [35, 10958.0] + - - [512, 3374, 1, 2048, 512, 512, 512, 2048] + - [7, 10972.0] + - - [512, 3376, 1, 2048, 512, 512, 512, 2048] + - [21, 11001.0] + - - [512, 3377, 1, 2048, 512, 512, 512, 2048] + - [7, 11005.0] + - - [512, 3378, 1, 2048, 512, 512, 512, 2048] + - [7, 11002.0] + - - [512, 3381, 1, 2048, 512, 512, 512, 2048] + - [7, 11025.0] + - - [512, 3382, 1, 2048, 512, 512, 512, 2048] + - [7, 11017.0] + - - [512, 3383, 1, 2048, 512, 512, 512, 2048] + - [35, 11034.0] + - - [512, 3384, 1, 2048, 512, 512, 512, 2048] + - [7, 11040.0] + - - [512, 3385, 1, 2048, 512, 512, 512, 2048] + - [21, 11038.0] + - - [512, 3386, 1, 2048, 512, 512, 512, 2048] + - [7, 11022.0] + - - [512, 3388, 1, 2048, 512, 512, 512, 2048] + - [21, 11034.0] + - - [512, 3390, 1, 2048, 512, 512, 512, 2048] + - [35, 11049.0] + - - [512, 3391, 1, 2048, 512, 512, 512, 2048] + - [35, 11047.0] + - - [512, 3396, 1, 2048, 512, 512, 512, 2048] + - [21, 11039.0] + - - [512, 3399, 1, 2048, 512, 512, 512, 2048] + - [21, 11071.0] + - - [512, 3402, 1, 2048, 512, 512, 512, 2048] + - [21, 11059.0] + - - [512, 3410, 1, 2048, 512, 512, 512, 2048] + - [35, 11098.0] + - - [512, 3412, 1, 2048, 512, 512, 512, 2048] + - [7, 11115.0] + - - [512, 3414, 1, 2048, 512, 512, 512, 2048] + - [7, 11124.0] + - - [512, 3415, 1, 2048, 512, 512, 512, 2048] + - [21, 11126.0] + - - [512, 3418, 1, 2048, 512, 512, 512, 2048] + - [35, 11117.0] + - - [512, 3420, 1, 2048, 512, 512, 512, 2048] + - [35, 11123.0] + - - [512, 3422, 1, 2048, 512, 512, 512, 2048] + - [21, 11144.0] + - - [512, 3425, 1, 2048, 512, 512, 512, 2048] + - [35, 11140.0] + - - [512, 3426, 1, 2048, 512, 512, 512, 2048] + - [7, 11131.0] + - - [512, 3427, 1, 2048, 512, 512, 512, 2048] + - [7, 11149.0] + - - [512, 3428, 1, 2048, 512, 512, 512, 2048] + - [21, 11167.0] + - - [512, 3430, 1, 2048, 512, 512, 512, 2048] + - [21, 11145.0] + - - [512, 3431, 1, 2048, 512, 512, 512, 2048] + - [7, 11162.0] + - - [512, 3432, 1, 2048, 512, 512, 512, 2048] + - [21, 11153.0] + - - [512, 3438, 1, 2048, 512, 512, 512, 2048] + - [21, 11170.0] + - - [512, 3440, 1, 2048, 512, 512, 512, 2048] + - [7, 11198.0] + - - [512, 3443, 1, 2048, 512, 512, 512, 2048] + - [7, 11191.0] + - - [512, 3445, 1, 2048, 512, 512, 512, 2048] + - [21, 11191.0] + - - [512, 3447, 1, 2048, 512, 512, 512, 2048] + - [21, 11211.0] + - - [512, 3448, 1, 2048, 512, 512, 512, 2048] + - [35, 11206.0] + - - [512, 3450, 1, 2048, 512, 512, 512, 2048] + - [35, 11218.0] + - - [512, 3451, 1, 2048, 512, 512, 512, 2048] + - [35, 11230.0] + - - [512, 3452, 1, 2048, 512, 512, 512, 2048] + - [7, 11236.0] + - - [512, 3453, 1, 2048, 512, 512, 512, 2048] + - [7, 11235.0] + - - [512, 3455, 1, 2048, 512, 512, 512, 2048] + - [7, 11225.0] + - - [512, 3456, 1, 2048, 512, 512, 512, 2048] + - [21, 11282.0] + - - [512, 3457, 1, 2048, 512, 512, 512, 2048] + - [21, 11265.0] + - - [512, 3458, 1, 2048, 512, 512, 512, 2048] + - [35, 11240.0] + - - [512, 3459, 1, 2048, 512, 512, 512, 2048] + - [21, 11259.0] + - - [512, 3460, 1, 2048, 512, 512, 512, 2048] + - [7, 11251.0] + - - [512, 3462, 1, 2048, 512, 512, 512, 2048] + - [21, 11259.0] + - - [512, 3466, 1, 2048, 512, 512, 512, 2048] + - [35, 11309.0] + - - [512, 3467, 1, 2048, 512, 512, 512, 2048] + - [21, 11290.0] + - - [512, 3468, 1, 2048, 512, 512, 512, 2048] + - [35, 11292.0] + - - [512, 3470, 1, 2048, 512, 512, 512, 2048] + - [21, 11296.0] + - - [512, 3471, 1, 2048, 512, 512, 512, 2048] + - [21, 11304.0] + - - [512, 3472, 1, 2048, 512, 512, 512, 2048] + - [7, 11300.0] + - - [512, 3475, 1, 2048, 512, 512, 512, 2048] + - [35, 11320.0] + - - [512, 3476, 1, 2048, 512, 512, 512, 2048] + - [21, 11334.0] + - - [512, 3477, 1, 2048, 512, 512, 512, 2048] + - [7, 11299.0] + - - [512, 3478, 1, 2048, 512, 512, 512, 2048] + - [21, 11280.0] + - - [512, 3480, 1, 2048, 512, 512, 512, 2048] + - [7, 11333.0] + - - [512, 3481, 1, 2048, 512, 512, 512, 2048] + - [35, 11343.0] + - - [512, 3483, 1, 2048, 512, 512, 512, 2048] + - [21, 11331.0] + - - [512, 3484, 1, 2048, 512, 512, 512, 2048] + - [35, 11315.0] + - - [512, 3487, 1, 2048, 512, 512, 512, 2048] + - [35, 11344.0] + - - [512, 3489, 1, 2048, 512, 512, 512, 2048] + - [35, 11370.0] + - - [512, 3490, 1, 2048, 512, 512, 512, 2048] + - [35, 11366.0] + - - [512, 3491, 1, 2048, 512, 512, 512, 2048] + - [21, 11375.0] + - - [512, 3493, 1, 2048, 512, 512, 512, 2048] + - [7, 11350.0] + - - [512, 3495, 1, 2048, 512, 512, 512, 2048] + - [35, 11343.0] + - - [512, 3497, 1, 2048, 512, 512, 512, 2048] + - [21, 11400.0] + - - [512, 3498, 1, 2048, 512, 512, 512, 2048] + - [35, 11379.0] + - - [512, 3499, 1, 2048, 512, 512, 512, 2048] + - [21, 11394.0] + - - [512, 3501, 1, 2048, 512, 512, 512, 2048] + - [7, 11378.0] + - - [512, 3503, 1, 2048, 512, 512, 512, 2048] + - [35, 11399.0] + - - [512, 3505, 1, 2048, 512, 512, 512, 2048] + - [21, 11398.0] + - - [512, 3507, 1, 2048, 512, 512, 512, 2048] + - [21, 11402.0] + - - [512, 3508, 1, 2048, 512, 512, 512, 2048] + - [35, 11402.0] + - - [512, 3509, 1, 2048, 512, 512, 512, 2048] + - [21, 11406.0] + - - [512, 3510, 1, 2048, 512, 512, 512, 2048] + - [21, 11421.0] + - - [512, 3511, 1, 2048, 512, 512, 512, 2048] + - [7, 11411.0] + - - [512, 3513, 1, 2048, 512, 512, 512, 2048] + - [35, 11412.0] + - - [512, 3514, 1, 2048, 512, 512, 512, 2048] + - [35, 11411.0] + - - [512, 3515, 1, 2048, 512, 512, 512, 2048] + - [21, 11431.0] + - - [512, 3517, 1, 2048, 512, 512, 512, 2048] + - [35, 11417.0] + - - [512, 3518, 1, 2048, 512, 512, 512, 2048] + - [21, 11436.0] + - - [512, 3519, 1, 2048, 512, 512, 512, 2048] + - [21, 11442.0] + - - [512, 3523, 1, 2048, 512, 512, 512, 2048] + - [22, 10992.0] + - - [512, 3528, 1, 2048, 512, 512, 512, 2048] + - [22, 10994.0] + - - [512, 3529, 1, 2048, 512, 512, 512, 2048] + - [8, 10997.0] + - - [512, 3531, 1, 2048, 512, 512, 512, 2048] + - [8, 10998.0] + - - [512, 3532, 1, 2048, 512, 512, 512, 2048] + - [22, 11021.0] + - - [512, 3533, 1, 2048, 512, 512, 512, 2048] + - [8, 11009.0] + - - [512, 3534, 1, 2048, 512, 512, 512, 2048] + - [8, 11014.0] + - - [512, 3538, 1, 2048, 512, 512, 512, 2048] + - [8, 11026.0] + - - [512, 3539, 1, 2048, 512, 512, 512, 2048] + - [22, 11048.0] + - - [512, 3540, 1, 2048, 512, 512, 512, 2048] + - [22, 11040.0] + - - [512, 3547, 1, 2048, 512, 512, 512, 2048] + - [8, 11061.0] + - - [512, 3548, 1, 2048, 512, 512, 512, 2048] + - [22, 11081.0] + - - [512, 3552, 1, 2048, 512, 512, 512, 2048] + - [8, 11086.0] + - - [512, 3575, 1, 2048, 512, 512, 512, 2048] + - [8, 11139.0] + - - [512, 3598, 1, 2048, 512, 512, 512, 2048] + - [22, 11203.0] + - - [512, 3599, 1, 2048, 512, 512, 512, 2048] + - [22, 11204.0] + - - [512, 3608, 1, 2048, 512, 512, 512, 2048] + - [8, 11225.0] + - - [512, 3776, 1, 512, 512, 512, 512, 512] + - [35, 11090.0] + - - [512, 3780, 1, 512, 512, 512, 512, 512] + - [22, 11010.0] + - - [512, 3780, 1, 2048, 512, 512, 512, 2048] + - [8, 11725.0] + - - [512, 3780, 1, 33708, 512, 512, 512, 33708] + - [20, 12085.0] + - - [512, 3796, 1, 512, 512, 512, 512, 512] + - [21, 10788.0] + - - [512, 3796, 1, 2048, 512, 512, 512, 2048] + - [8, 11787.0] + - - [512, 3796, 1, 33708, 512, 512, 512, 33708] + - [6, 12135.0] + - - [512, 3822, 1, 512, 512, 512, 512, 512] + - [35, 10843.0] + - - [512, 3822, 1, 2048, 512, 512, 512, 2048] + - [22, 11876.0] + - - [512, 3822, 1, 33708, 512, 512, 512, 33708] + - [20, 12219.0] + - - [512, 3835, 1, 512, 512, 512, 512, 512] + - [37, 10913.0] + - - [512, 3835, 1, 2048, 512, 512, 512, 2048] + - [22, 11873.0] + - - [512, 3840, 1, 512, 512, 512, 512, 512] + - [8, 11499.0] + - - [512, 3840, 1, 2048, 512, 512, 512, 2048] + - [8, 11989.0] + - - [512, 3840, 1, 33708, 512, 512, 512, 33708] + - [6, 12275.0] + - - [512, 3859, 1, 2048, 512, 512, 512, 2048] + - [7, 10723.0] + - - [512, 3859, 1, 33708, 512, 512, 512, 33708] + - [15, 10806.0] + - - [512, 3864, 1, 512, 512, 512, 512, 512] + - [35, 10471.0] + - - [512, 3864, 1, 2048, 512, 512, 512, 2048] + - [7, 10769.0] + - - [512, 3870, 1, 512, 512, 512, 512, 512] + - [35, 10552.0] + - - [512, 3870, 1, 2048, 512, 512, 512, 2048] + - [35, 10777.0] + - - [512, 3870, 1, 33708, 512, 512, 512, 33708] + - [0, 10849.0] + - - [512, 3876, 1, 512, 512, 512, 512, 512] + - [35, 10490.0] + - - [512, 3876, 1, 2048, 512, 512, 512, 2048] + - [35, 10821.0] + - - [512, 3876, 1, 33708, 512, 512, 512, 33708] + - [0, 10833.0] + - - [512, 3906, 1, 512, 512, 512, 512, 512] + - [35, 10534.0] + - - [512, 3906, 1, 2048, 512, 512, 512, 2048] + - [21, 10803.0] + - - [512, 3906, 1, 33708, 512, 512, 512, 33708] + - [0, 10875.0] + - - [512, 3910, 1, 512, 512, 512, 512, 512] + - [35, 10500.0] + - - [512, 3910, 1, 2048, 512, 512, 512, 2048] + - [35, 10829.0] + - - [512, 3910, 1, 33708, 512, 512, 512, 33708] + - [0, 10883.0] + - - [512, 3925, 1, 512, 512, 512, 512, 512] + - [35, 10501.0] + - - [512, 3925, 1, 33708, 512, 512, 512, 33708] + - [0, 10934.0] + - - [512, 3927, 1, 512, 512, 512, 512, 512] + - [35, 10511.0] + - - [512, 3942, 1, 512, 512, 512, 512, 512] + - [35, 10618.0] + - - [512, 3942, 1, 2048, 512, 512, 512, 2048] + - [21, 10875.0] + - - [512, 3942, 1, 33708, 512, 512, 512, 33708] + - [7, 10971.0] + - - [512, 3944, 1, 512, 512, 512, 512, 512] + - [35, 10595.0] + - - [512, 3944, 1, 33708, 512, 512, 512, 33708] + - [0, 10987.0] + - - [512, 3955, 1, 512, 512, 512, 512, 512] + - [35, 10556.0] + - - [512, 3955, 1, 33708, 512, 512, 512, 33708] + - [0, 10997.0] + - - [512, 3968, 1, 512, 512, 512, 512, 512] + - [35, 10702.0] + - - [512, 3968, 1, 2048, 512, 512, 512, 2048] + - [7, 10990.0] + - - [512, 3968, 1, 33708, 512, 512, 512, 33708] + - [0, 11034.0] + - - [512, 3969, 1, 512, 512, 512, 512, 512] + - [35, 10665.0] + - - [512, 3969, 1, 33708, 512, 512, 512, 33708] + - [28, 11024.0] + - - [512, 3976, 1, 512, 512, 512, 512, 512] + - [35, 10636.0] + - - [512, 3976, 1, 33708, 512, 512, 512, 33708] + - [0, 11039.0] + - - [512, 3977, 1, 512, 512, 512, 512, 512] + - [35, 10649.0] + - - [512, 3977, 1, 2048, 512, 512, 512, 2048] + - [21, 10966.0] + - - [512, 3977, 1, 33708, 512, 512, 512, 33708] + - [28, 11058.0] + - - [512, 3978, 1, 512, 512, 512, 512, 512] + - [35, 10626.0] + - - [512, 3978, 1, 2048, 512, 512, 512, 2048] + - [35, 11000.0] + - - [512, 3978, 1, 33708, 512, 512, 512, 33708] + - [28, 11106.0] + - - [512, 3990, 1, 512, 512, 512, 512, 512] + - [35, 10750.0] + - - [512, 3990, 1, 2048, 512, 512, 512, 2048] + - [35, 11067.0] + - - [512, 3990, 1, 33708, 512, 512, 512, 33708] + - [28, 11131.0] + - - [512, 3995, 1, 512, 512, 512, 512, 512] + - [35, 10754.0] + - - [512, 3995, 1, 2048, 512, 512, 512, 2048] + - [7, 11087.0] + - - [512, 3995, 1, 33708, 512, 512, 512, 33708] + - [28, 11150.0] + - - [512, 3996, 1, 512, 512, 512, 512, 512] + - [35, 10750.0] + - - [512, 3996, 1, 2048, 512, 512, 512, 2048] + - [35, 11084.0] + - - [512, 3996, 1, 33708, 512, 512, 512, 33708] + - [28, 11165.0] + - - [512, 3999, 1, 512, 512, 512, 512, 512] + - [35, 10763.0] + - - [512, 3999, 1, 2048, 512, 512, 512, 2048] + - [35, 11122.0] + - - [512, 3999, 1, 33708, 512, 512, 512, 33708] + - [28, 11164.0] + - - [512, 4005, 1, 512, 512, 512, 512, 512] + - [35, 10808.0] + - - [512, 4005, 1, 2048, 512, 512, 512, 2048] + - [35, 11093.0] + - - [512, 4005, 1, 33708, 512, 512, 512, 33708] + - [15, 11166.0] + - - [512, 4012, 1, 512, 512, 512, 512, 512] + - [35, 10785.0] + - - [512, 4012, 1, 2048, 512, 512, 512, 2048] + - [35, 11143.0] + - - [512, 4012, 1, 33708, 512, 512, 512, 33708] + - [28, 11195.0] + - - [512, 4020, 1, 512, 512, 512, 512, 512] + - [35, 10802.0] + - - [512, 4020, 1, 2048, 512, 512, 512, 2048] + - [35, 11158.0] + - - [512, 4020, 1, 33708, 512, 512, 512, 33708] + - [28, 11209.0] + - - [512, 4026, 1, 512, 512, 512, 512, 512] + - [35, 10807.0] + - - [512, 4026, 1, 2048, 512, 512, 512, 2048] + - [21, 11149.0] + - - [512, 4026, 1, 33708, 512, 512, 512, 33708] + - [28, 11221.0] + - - [512, 4030, 1, 512, 512, 512, 512, 512] + - [35, 10840.0] + - - [512, 4030, 1, 2048, 512, 512, 512, 2048] + - [21, 11167.0] + - - [512, 4030, 1, 33708, 512, 512, 512, 33708] + - [15, 11235.0] + - - [512, 4032, 1, 512, 512, 512, 512, 512] + - [35, 10924.0] + - - [512, 4032, 1, 2048, 512, 512, 512, 2048] + - [7, 11206.0] + - - [512, 4032, 1, 33708, 512, 512, 512, 33708] + - [28, 11250.0] + - - [512, 4050, 1, 512, 512, 512, 512, 512] + - [35, 10871.0] + - - [512, 4059, 1, 512, 512, 512, 512, 512] + - [35, 10970.0] + - - [2048, 644, 1, 512, 2048, 2048, 2048, 512] + - [15, 9847.0] + - - [2048, 668, 1, 512, 2048, 2048, 2048, 512] + - [15, 10122.0] + - - [2048, 714, 1, 512, 2048, 2048, 2048, 512] + - [28, 9974.0] + - - [2048, 720, 1, 512, 2048, 2048, 2048, 512] + - [16, 10064.0] + - - [2048, 722, 1, 512, 2048, 2048, 2048, 512] + - [28, 10051.0] + - - [2048, 781, 1, 512, 2048, 2048, 2048, 512] + - [28, 9939.0] + - - [2048, 848, 1, 512, 2048, 2048, 2048, 512] + - [1, 10107.0] + - - [2048, 872, 1, 512, 2048, 2048, 2048, 512] + - [16, 10318.0] + - - [2048, 936, 1, 512, 2048, 2048, 2048, 512] + - [28, 10888.0] + - - [2048, 980, 1, 512, 2048, 2048, 2048, 512] + - [15, 10662.0] + - - [2048, 1139, 1, 512, 2048, 2048, 2048, 512] + - [15, 10760.0] + - - [2048, 1184, 1, 512, 2048, 2048, 2048, 512] + - [1, 10716.0] + - - [2048, 1186, 1, 512, 2048, 2048, 2048, 512] + - [1, 10689.0] + - - [2048, 1279, 1, 512, 2048, 2048, 2048, 512] + - [1, 11333.0] + - - [2048, 1290, 1, 512, 2048, 2048, 2048, 512] + - [28, 10808.0] + - - [2048, 1327, 1, 512, 2048, 2048, 2048, 512] + - [28, 11077.0] + - - [2048, 1331, 1, 512, 2048, 2048, 2048, 512] + - [28, 11101.0] + - - [2048, 1341, 1, 512, 2048, 2048, 2048, 512] + - [28, 11165.0] + - - [2048, 1350, 1, 512, 2048, 2048, 2048, 512] + - [16, 10955.0] + - - [2048, 1359, 1, 512, 2048, 2048, 2048, 512] + - [22, 10997.0] + - - [2048, 1391, 1, 512, 2048, 2048, 2048, 512] + - [16, 11244.0] + - - [2048, 1424, 1, 512, 2048, 2048, 2048, 512] + - [28, 10803.0] + - - [2048, 1458, 1, 512, 2048, 2048, 2048, 512] + - [15, 11005.0] + - - [2048, 1462, 1, 512, 2048, 2048, 2048, 512] + - [28, 11032.0] + - - [2048, 1467, 1, 512, 2048, 2048, 2048, 512] + - [28, 11083.0] + - - [2048, 1472, 1, 512, 2048, 2048, 2048, 512] + - [0, 11186.0] + - - [2048, 1520, 1, 512, 2048, 2048, 2048, 512] + - [1, 11227.0] + - - [2048, 1596, 1, 512, 2048, 2048, 2048, 512] + - [15, 11361.0] + - - [2048, 1599, 1, 512, 2048, 2048, 2048, 512] + - [15, 11346.0] + - - [2048, 1615, 1, 512, 2048, 2048, 2048, 512] + - [28, 11051.0] + - - [2048, 1680, 1, 512, 2048, 2048, 2048, 512] + - [15, 11040.0] + - - [2048, 1709, 1, 512, 2048, 2048, 2048, 512] + - [28, 11193.0] + - - [2048, 1902, 1, 512, 2048, 2048, 2048, 512] + - [1, 11641.0] + - - [2048, 1917, 1, 512, 2048, 2048, 2048, 512] + - [22, 11770.0] + - - [2048, 2076, 1, 512, 2048, 2048, 2048, 512] + - [1, 11184.0] + - - [2048, 2195, 1, 512, 2048, 2048, 2048, 512] + - [15, 11363.0] + - - [2048, 2205, 1, 512, 2048, 2048, 2048, 512] + - [28, 11450.0] + - - [2048, 2418, 1, 512, 2048, 2048, 2048, 512] + - [1, 11460.0] + - - [2048, 2496, 1, 512, 2048, 2048, 2048, 512] + - [1, 11753.0] + - - [2048, 2790, 1, 512, 2048, 2048, 2048, 512] + - [1, 11800.0] + - - [2048, 2864, 1, 512, 2048, 2048, 2048, 512] + - [28, 11589.0] + - - [2048, 3092, 1, 512, 2048, 2048, 2048, 512] + - [1, 11723.0] + - - [2048, 3113, 1, 512, 2048, 2048, 2048, 512] + - [8, 11770.0] + - - [2048, 3137, 1, 512, 2048, 2048, 2048, 512] + - [1, 11926.0] + - - [2048, 3166, 1, 512, 2048, 2048, 2048, 512] + - [1, 11997.0] + - - [2048, 3194, 1, 512, 2048, 2048, 2048, 512] + - [1, 12033.0] + - - [2048, 3219, 1, 512, 2048, 2048, 2048, 512] + - [19, 11619.0] + - - [2048, 3222, 1, 512, 2048, 2048, 2048, 512] + - [1, 11677.0] + - - [2048, 3234, 1, 512, 2048, 2048, 2048, 512] + - [1, 11762.0] + - - [2048, 3237, 1, 512, 2048, 2048, 2048, 512] + - [1, 11747.0] + - - [2048, 3242, 1, 512, 2048, 2048, 2048, 512] + - [1, 11768.0] + - - [2048, 3246, 1, 512, 2048, 2048, 2048, 512] + - [1, 11763.0] + - - [2048, 3249, 1, 512, 2048, 2048, 2048, 512] + - [1, 11771.0] + - - [2048, 3251, 1, 512, 2048, 2048, 2048, 512] + - [1, 11792.0] + - - [2048, 3257, 1, 512, 2048, 2048, 2048, 512] + - [1, 11815.0] + - - [2048, 3262, 1, 512, 2048, 2048, 2048, 512] + - [1, 11818.0] + - - [2048, 3268, 1, 512, 2048, 2048, 2048, 512] + - [1, 11866.0] + - - [2048, 3282, 1, 512, 2048, 2048, 2048, 512] + - [22, 11804.0] + - - [2048, 3286, 1, 512, 2048, 2048, 2048, 512] + - [1, 11883.0] + - - [2048, 3287, 1, 512, 2048, 2048, 2048, 512] + - [1, 11915.0] + - - [2048, 3293, 1, 512, 2048, 2048, 2048, 512] + - [1, 11931.0] + - - [2048, 3297, 1, 512, 2048, 2048, 2048, 512] + - [1, 11945.0] + - - [2048, 3307, 1, 512, 2048, 2048, 2048, 512] + - [1, 11969.0] + - - [2048, 3314, 1, 512, 2048, 2048, 2048, 512] + - [1, 11986.0] + - - [2048, 3315, 1, 512, 2048, 2048, 2048, 512] + - [1, 11984.0] + - - [2048, 3319, 1, 512, 2048, 2048, 2048, 512] + - [1, 11997.0] + - - [2048, 3322, 1, 512, 2048, 2048, 2048, 512] + - [1, 11967.0] + - - [2048, 3323, 1, 512, 2048, 2048, 2048, 512] + - [1, 12008.0] + - - [2048, 3324, 1, 512, 2048, 2048, 2048, 512] + - [1, 12053.0] + - - [2048, 3325, 1, 512, 2048, 2048, 2048, 512] + - [1, 12003.0] + - - [2048, 3327, 1, 512, 2048, 2048, 2048, 512] + - [1, 12020.0] + - - [2048, 3329, 1, 512, 2048, 2048, 2048, 512] + - [1, 11588.0] + - - [2048, 3332, 1, 512, 2048, 2048, 2048, 512] + - [1, 11597.0] + - - [2048, 3336, 1, 512, 2048, 2048, 2048, 512] + - [1, 11613.0] + - - [2048, 3339, 1, 512, 2048, 2048, 2048, 512] + - [1, 11660.0] + - - [2048, 3342, 1, 512, 2048, 2048, 2048, 512] + - [1, 11635.0] + - - [2048, 3344, 1, 512, 2048, 2048, 2048, 512] + - [1, 11645.0] + - - [2048, 3358, 1, 512, 2048, 2048, 2048, 512] + - [1, 11690.0] + - - [2048, 3360, 1, 512, 2048, 2048, 2048, 512] + - [1, 11682.0] + - - [2048, 3364, 1, 512, 2048, 2048, 2048, 512] + - [1, 11703.0] + - - [2048, 3365, 1, 512, 2048, 2048, 2048, 512] + - [1, 11740.0] + - - [2048, 3369, 1, 512, 2048, 2048, 2048, 512] + - [1, 11732.0] + - - [2048, 3370, 1, 512, 2048, 2048, 2048, 512] + - [1, 11744.0] + - - [2048, 3371, 1, 512, 2048, 2048, 2048, 512] + - [1, 11745.0] + - - [2048, 3374, 1, 512, 2048, 2048, 2048, 512] + - [1, 11759.0] + - - [2048, 3376, 1, 512, 2048, 2048, 2048, 512] + - [1, 11741.0] + - - [2048, 3377, 1, 512, 2048, 2048, 2048, 512] + - [1, 11768.0] + - - [2048, 3378, 1, 512, 2048, 2048, 2048, 512] + - [1, 11766.0] + - - [2048, 3381, 1, 512, 2048, 2048, 2048, 512] + - [1, 11774.0] + - - [2048, 3382, 1, 512, 2048, 2048, 2048, 512] + - [1, 11798.0] + - - [2048, 3383, 1, 512, 2048, 2048, 2048, 512] + - [1, 11784.0] + - - [2048, 3384, 1, 512, 2048, 2048, 2048, 512] + - [1, 11786.0] + - - [2048, 3385, 1, 512, 2048, 2048, 2048, 512] + - [1, 11794.0] + - - [2048, 3386, 1, 512, 2048, 2048, 2048, 512] + - [1, 11792.0] + - - [2048, 3388, 1, 512, 2048, 2048, 2048, 512] + - [1, 11796.0] + - - [2048, 3390, 1, 512, 2048, 2048, 2048, 512] + - [1, 11771.0] + - - [2048, 3391, 1, 512, 2048, 2048, 2048, 512] + - [1, 11804.0] + - - [2048, 3396, 1, 512, 2048, 2048, 2048, 512] + - [1, 11819.0] + - - [2048, 3399, 1, 512, 2048, 2048, 2048, 512] + - [1, 11807.0] + - - [2048, 3402, 1, 512, 2048, 2048, 2048, 512] + - [1, 11865.0] + - - [2048, 3410, 1, 512, 2048, 2048, 2048, 512] + - [1, 11839.0] + - - [2048, 3412, 1, 512, 2048, 2048, 2048, 512] + - [1, 11878.0] + - - [2048, 3414, 1, 512, 2048, 2048, 2048, 512] + - [1, 11902.0] + - - [2048, 3415, 1, 512, 2048, 2048, 2048, 512] + - [1, 11858.0] + - - [2048, 3418, 1, 512, 2048, 2048, 2048, 512] + - [1, 11876.0] + - - [2048, 3420, 1, 512, 2048, 2048, 2048, 512] + - [1, 11916.0] + - - [2048, 3422, 1, 512, 2048, 2048, 2048, 512] + - [1, 11897.0] + - - [2048, 3425, 1, 512, 2048, 2048, 2048, 512] + - [1, 11894.0] + - - [2048, 3426, 1, 512, 2048, 2048, 2048, 512] + - [1, 11888.0] + - - [2048, 3427, 1, 512, 2048, 2048, 2048, 512] + - [1, 11930.0] + - - [2048, 3428, 1, 512, 2048, 2048, 2048, 512] + - [1, 11921.0] + - - [2048, 3430, 1, 512, 2048, 2048, 2048, 512] + - [1, 11943.0] + - - [2048, 3431, 1, 512, 2048, 2048, 2048, 512] + - [1, 11895.0] + - - [2048, 3432, 1, 512, 2048, 2048, 2048, 512] + - [1, 11953.0] + - - [2048, 3433, 1, 512, 2048, 2048, 2048, 512] + - [1, 11934.0] + - - [2048, 3438, 1, 512, 2048, 2048, 2048, 512] + - [1, 11930.0] + - - [2048, 3439, 1, 512, 2048, 2048, 2048, 512] + - [1, 11935.0] + - - [2048, 3440, 1, 512, 2048, 2048, 2048, 512] + - [1, 11939.0] + - - [2048, 3443, 1, 512, 2048, 2048, 2048, 512] + - [1, 11969.0] + - - [2048, 3445, 1, 512, 2048, 2048, 2048, 512] + - [1, 11961.0] + - - [2048, 3447, 1, 512, 2048, 2048, 2048, 512] + - [1, 11944.0] + - - [2048, 3448, 1, 512, 2048, 2048, 2048, 512] + - [1, 11906.0] + - - [2048, 3450, 1, 512, 2048, 2048, 2048, 512] + - [1, 11938.0] + - - [2048, 3451, 1, 512, 2048, 2048, 2048, 512] + - [1, 11970.0] + - - [2048, 3452, 1, 512, 2048, 2048, 2048, 512] + - [1, 11974.0] + - - [2048, 3453, 1, 512, 2048, 2048, 2048, 512] + - [1, 11990.0] + - - [2048, 3455, 1, 512, 2048, 2048, 2048, 512] + - [1, 11966.0] + - - [2048, 3456, 1, 512, 2048, 2048, 2048, 512] + - [1, 12079.0] + - - [2048, 3457, 1, 512, 2048, 2048, 2048, 512] + - [1, 11560.0] + - - [2048, 3458, 1, 512, 2048, 2048, 2048, 512] + - [1, 11574.0] + - - [2048, 3459, 1, 512, 2048, 2048, 2048, 512] + - [1, 11556.0] + - - [2048, 3460, 1, 512, 2048, 2048, 2048, 512] + - [1, 11586.0] + - - [2048, 3461, 1, 512, 2048, 2048, 2048, 512] + - [1, 11564.0] + - - [2048, 3462, 1, 512, 2048, 2048, 2048, 512] + - [1, 11562.0] + - - [2048, 3466, 1, 512, 2048, 2048, 2048, 512] + - [28, 11564.0] + - - [2048, 3467, 1, 512, 2048, 2048, 2048, 512] + - [1, 11576.0] + - - [2048, 3468, 1, 512, 2048, 2048, 2048, 512] + - [1, 11577.0] + - - [2048, 3470, 1, 512, 2048, 2048, 2048, 512] + - [1, 11606.0] + - - [2048, 3471, 1, 512, 2048, 2048, 2048, 512] + - [1, 11616.0] + - - [2048, 3472, 1, 512, 2048, 2048, 2048, 512] + - [1, 11591.0] + - - [2048, 3475, 1, 512, 2048, 2048, 2048, 512] + - [1, 11616.0] + - - [2048, 3476, 1, 512, 2048, 2048, 2048, 512] + - [22, 11592.0] + - - [2048, 3477, 1, 512, 2048, 2048, 2048, 512] + - [1, 11603.0] + - - [2048, 3478, 1, 512, 2048, 2048, 2048, 512] + - [1, 11607.0] + - - [2048, 3479, 1, 512, 2048, 2048, 2048, 512] + - [1, 11611.0] + - - [2048, 3480, 1, 512, 2048, 2048, 2048, 512] + - [1, 11642.0] + - - [2048, 3481, 1, 512, 2048, 2048, 2048, 512] + - [1, 11627.0] + - - [2048, 3483, 1, 512, 2048, 2048, 2048, 512] + - [15, 11618.0] + - - [2048, 3484, 1, 512, 2048, 2048, 2048, 512] + - [1, 11634.0] + - - [2048, 3487, 1, 512, 2048, 2048, 2048, 512] + - [1, 11635.0] + - - [2048, 3489, 1, 512, 2048, 2048, 2048, 512] + - [1, 11630.0] + - - [2048, 3490, 1, 512, 2048, 2048, 2048, 512] + - [1, 11643.0] + - - [2048, 3491, 1, 512, 2048, 2048, 2048, 512] + - [1, 11648.0] + - - [2048, 3493, 1, 512, 2048, 2048, 2048, 512] + - [1, 11665.0] + - - [2048, 3494, 1, 512, 2048, 2048, 2048, 512] + - [1, 11662.0] + - - [2048, 3495, 1, 512, 2048, 2048, 2048, 512] + - [1, 11685.0] + - - [2048, 3497, 1, 512, 2048, 2048, 2048, 512] + - [1, 11703.0] + - - [2048, 3498, 1, 512, 2048, 2048, 2048, 512] + - [1, 11681.0] + - - [2048, 3501, 1, 512, 2048, 2048, 2048, 512] + - [1, 11710.0] + - - [2048, 3503, 1, 512, 2048, 2048, 2048, 512] + - [1, 11680.0] + - - [2048, 3505, 1, 512, 2048, 2048, 2048, 512] + - [1, 11689.0] + - - [2048, 3507, 1, 512, 2048, 2048, 2048, 512] + - [1, 11708.0] + - - [2048, 3508, 1, 512, 2048, 2048, 2048, 512] + - [1, 11736.0] + - - [2048, 3509, 1, 512, 2048, 2048, 2048, 512] + - [1, 11715.0] + - - [2048, 3510, 1, 512, 2048, 2048, 2048, 512] + - [1, 11740.0] + - - [2048, 3511, 1, 512, 2048, 2048, 2048, 512] + - [1, 11752.0] + - - [2048, 3513, 1, 512, 2048, 2048, 2048, 512] + - [1, 11742.0] + - - [2048, 3514, 1, 512, 2048, 2048, 2048, 512] + - [1, 11774.0] + - - [2048, 3515, 1, 512, 2048, 2048, 2048, 512] + - [1, 11726.0] + - - [2048, 3517, 1, 512, 2048, 2048, 2048, 512] + - [1, 11739.0] + - - [2048, 3518, 1, 512, 2048, 2048, 2048, 512] + - [1, 11756.0] + - - [2048, 3519, 1, 512, 2048, 2048, 2048, 512] + - [1, 11736.0] + - - [2048, 3520, 1, 512, 2048, 2048, 2048, 512] + - [15, 11767.0] + - - [2048, 3523, 1, 512, 2048, 2048, 2048, 512] + - [1, 11763.0] + - - [2048, 3528, 1, 512, 2048, 2048, 2048, 512] + - [1, 11778.0] + - - [2048, 3529, 1, 512, 2048, 2048, 2048, 512] + - [1, 11751.0] + - - [2048, 3530, 1, 512, 2048, 2048, 2048, 512] + - [1, 11767.0] + - - [2048, 3531, 1, 512, 2048, 2048, 2048, 512] + - [1, 11788.0] + - - [2048, 3532, 1, 512, 2048, 2048, 2048, 512] + - [1, 11778.0] + - - [2048, 3533, 1, 512, 2048, 2048, 2048, 512] + - [1, 11827.0] + - - [2048, 3534, 1, 512, 2048, 2048, 2048, 512] + - [1, 11805.0] + - - [2048, 3538, 1, 512, 2048, 2048, 2048, 512] + - [1, 11798.0] + - - [2048, 3539, 1, 512, 2048, 2048, 2048, 512] + - [1, 11800.0] + - - [2048, 3540, 1, 512, 2048, 2048, 2048, 512] + - [1, 11834.0] + - - [2048, 3541, 1, 512, 2048, 2048, 2048, 512] + - [1, 11839.0] + - - [2048, 3547, 1, 512, 2048, 2048, 2048, 512] + - [1, 11822.0] + - - [2048, 3548, 1, 512, 2048, 2048, 2048, 512] + - [1, 11836.0] + - - [2048, 3552, 1, 512, 2048, 2048, 2048, 512] + - [1, 11849.0] + - - [2048, 3564, 1, 512, 2048, 2048, 2048, 512] + - [1, 11896.0] + - - [2048, 3575, 1, 512, 2048, 2048, 2048, 512] + - [1, 11911.0] + - - [2048, 3598, 1, 512, 2048, 2048, 2048, 512] + - [1, 11524.0] + - - [2048, 3599, 1, 512, 2048, 2048, 2048, 512] + - [1, 11554.0] + - - [2048, 3608, 1, 512, 2048, 2048, 2048, 512] + - [1, 11574.0] + - - [2048, 3776, 1, 512, 2048, 2048, 2048, 512] + - [1, 12040.0] + - - [2048, 3780, 1, 512, 2048, 2048, 2048, 512] + - [1, 12060.0] + - - [2048, 3796, 1, 512, 2048, 2048, 2048, 512] + - [1, 12112.0] + - - [2048, 3822, 1, 512, 2048, 2048, 2048, 512] + - [1, 12114.0] + - - [2048, 3835, 1, 512, 2048, 2048, 2048, 512] + - [1, 12134.0] + - - [2048, 3840, 1, 512, 2048, 2048, 2048, 512] + - [1, 12282.0] + - - [2048, 3859, 1, 512, 2048, 2048, 2048, 512] + - [1, 11876.0] + - - [2048, 3864, 1, 512, 2048, 2048, 2048, 512] + - [1, 11884.0] + - - [2048, 3870, 1, 512, 2048, 2048, 2048, 512] + - [1, 11872.0] + - - [2048, 3876, 1, 512, 2048, 2048, 2048, 512] + - [1, 11880.0] + - - [2048, 3906, 1, 512, 2048, 2048, 2048, 512] + - [1, 11966.0] + - - [2048, 3910, 1, 512, 2048, 2048, 2048, 512] + - [1, 11956.0] + - - [2048, 3925, 1, 512, 2048, 2048, 2048, 512] + - [1, 12003.0] + - - [2048, 3942, 1, 512, 2048, 2048, 2048, 512] + - [1, 12061.0] + - - [2048, 3944, 1, 512, 2048, 2048, 2048, 512] + - [1, 12061.0] + - - [2048, 3955, 1, 512, 2048, 2048, 2048, 512] + - [1, 12086.0] + - - [2048, 3968, 1, 512, 2048, 2048, 2048, 512] + - [1, 12237.0] + - - [2048, 3969, 1, 512, 2048, 2048, 2048, 512] + - [1, 11724.0] + - - [2048, 3976, 1, 512, 2048, 2048, 2048, 512] + - [1, 11792.0] + - - [2048, 3977, 1, 512, 2048, 2048, 2048, 512] + - [1, 11777.0] + - - [2048, 3978, 1, 512, 2048, 2048, 2048, 512] + - [1, 11763.0] + - - [2048, 3990, 1, 512, 2048, 2048, 2048, 512] + - [1, 11793.0] + - - [2048, 3995, 1, 512, 2048, 2048, 2048, 512] + - [1, 11840.0] + - - [2048, 3996, 1, 512, 2048, 2048, 2048, 512] + - [1, 11801.0] + - - [2048, 3999, 1, 512, 2048, 2048, 2048, 512] + - [1, 11833.0] + - - [2048, 4005, 1, 512, 2048, 2048, 2048, 512] + - [1, 11849.0] + - - [2048, 4012, 1, 512, 2048, 2048, 2048, 512] + - [1, 11881.0] + - - [2048, 4020, 1, 512, 2048, 2048, 2048, 512] + - [1, 11885.0] + - - [2048, 4026, 1, 512, 2048, 2048, 2048, 512] + - [1, 11904.0] + - - [2048, 4030, 1, 512, 2048, 2048, 2048, 512] + - [1, 11913.0] + - - [2048, 4032, 1, 512, 2048, 2048, 2048, 512] + - [1, 11923.0] + - - [1024, 4096, 1, 3072, 1024, 1024, 1024, 3072] + - [22, 12179.0] + - - [1024, 3840, 1, 1024, 1024, 1024, 1024, 1024] + - [8, 12149.0] + - - [1024, 3840, 1, 4096, 1024, 1024, 1024, 4096] + - [6, 12330.0] + - - [1024, 3968, 1, 1024, 1024, 1024, 1024, 1024] + - [8, 11657.0] + - - [1024, 3968, 1, 4096, 1024, 1024, 1024, 4096] + - [22, 11828.0] + - - [1024, 3968, 1, 42720, 1024, 1024, 1024, 42720] + - [16, 11834.0] + - - [1024, 7200, 1, 1024, 1024, 1024, 1024, 1024] + - [8, 12066.0] + - - [1024, 7200, 1, 4096, 1024, 1024, 1024, 4096] + - [22, 12205.0] + - - [1024, 7200, 1, 42720, 1024, 1024, 1024, 42720] + - [8, 12266.0] + - - [1024, 8160, 1, 1024, 1024, 1024, 1024, 1024] + - [22, 12203.0] + - - [1024, 8160, 1, 4096, 1024, 1024, 1024, 4096] + - [22, 12266.0] + - - [1024, 9520, 1, 1024, 1024, 1024, 1024, 1024] + - [22, 12291.0] + - - [1024, 9520, 1, 4096, 1024, 1024, 1024, 4096] + - [8, 12401.0] + - - [1024, 9520, 1, 42720, 1024, 1024, 1024, 42720] + - [16, 12478.0] + - - [1024, 10200, 1, 1024, 1024, 1024, 1024, 1024] + - [22, 12394.0] + - - [1024, 10200, 1, 4096, 1024, 1024, 1024, 4096] + - [22, 12455.0] + - - [4096, 3840, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12578.0] + - - [4096, 3968, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12496.0] + - - [4096, 7200, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12365.0] + - - [4096, 8160, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12509.0] + - - [4096, 9520, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12527.0] + - - [4096, 10200, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12587.0] + - - [1024, 2048, 1, 4096, 1024, 1024, 1024, 4096] + - [21, 11406.0] + - - [1024, 2048, 1, 30528, 1024, 1024, 1024, 30528] + - [28, 11412.0] + - - [1024, 4096, 1, 30528, 1024, 1024, 1024, 30528] + - [1, 12198.0] + - - [1024, 10240, 1, 256, 1024, 1024, 1024, 256] + - [1, 12078.0] + - - [1024, 10496, 1, 256, 1024, 1024, 1024, 256] + - [1, 12115.0] + - - [1024, 11008, 1, 256, 1024, 1024, 1024, 256] + - [1, 12000.0] + - - [1024, 11264, 1, 256, 1024, 1024, 1024, 256] + - [1, 11988.0] + - - [1024, 11520, 1, 256, 1024, 1024, 1024, 256] + - [1, 12194.0] + - - [1024, 12288, 1, 256, 1024, 1024, 1024, 256] + - [1, 12060.0] + - - [1024, 13312, 1, 256, 1024, 1024, 1024, 256] + - [1, 12122.0] + - - [1024, 13568, 1, 256, 1024, 1024, 1024, 256] + - [1, 12109.0] + - - [1024, 14336, 1, 256, 1024, 1024, 1024, 256] + - [1, 12204.0] + - - [1024, 14592, 1, 256, 1024, 1024, 1024, 256] + - [1, 12150.0] + - - [1024, 14848, 1, 256, 1024, 1024, 1024, 256] + - [1, 12126.0] + - - [1024, 15104, 1, 256, 1024, 1024, 1024, 256] + - [1, 12128.0] + - - [1024, 1600, 1, 1024, 1024, 1024, 1024, 1024] + - [21, 11228.0] + - - [1024, 1600, 1, 1, 1024, 1024, 1024, 1] + - [21, 238.0] + - - [1024, 16128, 1, 256, 1024, 1024, 1024, 256] + - [1, 12183.0] + - - [1024, 17152, 1, 256, 1024, 1024, 1024, 256] + - [1, 12173.0] + - - [1024, 1792, 1, 256, 1024, 1024, 1024, 256] + - [1, 10412.0] + - - [1024, 18944, 1, 256, 1024, 1024, 1024, 256] + - [1, 12135.0] + - - [1024, 19712, 1, 256, 1024, 1024, 1024, 256] + - [1, 12156.0] + - - [1024, 19968, 1, 256, 1024, 1024, 1024, 256] + - [4, 12123.0] + - - [1024, 20480, 1, 256, 1024, 1024, 1024, 256] + - [4, 12217.0] + - - [1024, 2048, 1, 256, 1024, 1024, 1024, 256] + - [28, 10833.0] + - - [1024, 20992, 1, 256, 1024, 1024, 1024, 256] + - [1, 12160.0] + - - [1024, 21504, 1, 256, 1024, 1024, 1024, 256] + - [4, 12124.0] + - - [1024, 22016, 1, 256, 1024, 1024, 1024, 256] + - [4, 12210.0] + - - [1024, 23552, 1, 256, 1024, 1024, 1024, 256] + - [4, 12194.0] + - - [1024, 2560, 1, 256, 1024, 1024, 1024, 256] + - [1, 11166.0] + - - [1024, 28672, 1, 256, 1024, 1024, 1024, 256] + - [4, 12252.0] + - - [1024, 3072, 1, 256, 1024, 1024, 1024, 256] + - [1, 10986.0] + - - [1024, 3328, 1, 256, 1024, 1024, 1024, 256] + - [0, 11257.0] + - - [1024, 33536, 1, 256, 1024, 1024, 1024, 256] + - [4, 12298.0] + - - [1024, 3840, 1, 256, 1024, 1024, 1024, 256] + - [1, 11547.0] + - - [1024, 40448, 1, 256, 1024, 1024, 1024, 256] + - [4, 12295.0] + - - [1024, 4096, 1, 256, 1024, 1024, 1024, 256] + - [1, 11396.0] + - - [1024, 4608, 1, 256, 1024, 1024, 1024, 256] + - [15, 11417.0] + - - [1024, 4864, 1, 256, 1024, 1024, 1024, 256] + - [1, 11360.0] + - - [1024, 5120, 1, 256, 1024, 1024, 1024, 256] + - [1, 11842.0] + - - [1024, 5632, 1, 256, 1024, 1024, 1024, 256] + - [1, 11678.0] + - - [1024, 6144, 1, 256, 1024, 1024, 1024, 256] + - [1, 11600.0] + - - [1024, 6400, 1, 256, 1024, 1024, 1024, 256] + - [1, 11996.0] + - - [1024, 7168, 1, 256, 1024, 1024, 1024, 256] + - [1, 11775.0] + - - [1024, 7424, 1, 256, 1024, 1024, 1024, 256] + - [1, 11738.0] + - - [1024, 7680, 1, 256, 1024, 1024, 1024, 256] + - [1, 12037.0] + - - [1024, 7936, 1, 256, 1024, 1024, 1024, 256] + - [1, 11989.0] + - - [1024, 8192, 1, 256, 1024, 1024, 1024, 256] + - [1, 11940.0] + - - [1024, 8448, 1, 256, 1024, 1024, 1024, 256] + - [1, 11849.0] + - - [1024, 8704, 1, 256, 1024, 1024, 1024, 256] + - [1, 11843.0] + - - [1024, 8960, 1, 256, 1024, 1024, 1024, 256] + - [1, 12111.0] + - - [1024, 9728, 1, 256, 1024, 1024, 1024, 256] + - [1, 11977.0] + - - [1024, 9984, 1, 256, 1024, 1024, 1024, 256] + - [1, 11906.0] + - - [2048, 1024, 1, 1, 2048, 2048, 2048, 1] + - [23, 244.0] + - - [2048, 1024, 1, 256, 2048, 2048, 2048, 256] + - [0, 10961.0] + - - [256, 8976, 1, 10240, 256, 256, 256, 10240] + - [12, 10769.0] + - - [256, 8976, 1, 10496, 256, 256, 256, 10496] + - [7, 10903.0] + - - [256, 8976, 1, 11008, 256, 256, 256, 11008] + - [7, 10867.0] + - - [256, 8976, 1, 11520, 256, 256, 256, 11520] + - [41, 10824.0] + - - [256, 8976, 1, 12288, 256, 256, 256, 12288] + - [7, 10706.0] + - - [256, 8976, 1, 14336, 256, 256, 256, 14336] + - [7, 10660.0] + - - [256, 8976, 1, 14848, 256, 256, 256, 14848] + - [12, 10835.0] + - - [256, 8976, 1, 15104, 256, 256, 256, 15104] + - [27, 10843.0] + - - [256, 8976, 1, 1536, 256, 256, 256, 1536] + - [35, 10874.0] + - - [256, 8976, 1, 15872, 256, 256, 256, 15872] + - [41, 10777.0] + - - [256, 8976, 1, 17152, 256, 256, 256, 17152] + - [27, 10855.0] + - - [256, 8976, 1, 19712, 256, 256, 256, 19712] + - [12, 10858.0] + - - [256, 8976, 1, 19968, 256, 256, 256, 19968] + - [27, 10831.0] + - - [256, 8976, 1, 20480, 256, 256, 256, 20480] + - [12, 10692.0] + - - [256, 8976, 1, 2048, 256, 256, 256, 2048] + - [35, 10900.0] + - - [256, 8976, 1, 20992, 256, 256, 256, 20992] + - [12, 10850.0] + - - [256, 8976, 1, 22016, 256, 256, 256, 22016] + - [12, 10824.0] + - - [256, 8976, 1, 2304, 256, 256, 256, 2304] + - [35, 10927.0] + - - [256, 8976, 1, 2560, 256, 256, 256, 2560] + - [35, 10905.0] + - - [256, 8976, 1, 26112, 256, 256, 256, 26112] + - [12, 10816.0] + - - [256, 8976, 1, 2816, 256, 256, 256, 2816] + - [35, 10935.0] + - - [256, 8976, 1, 3072, 256, 256, 256, 3072] + - [7, 10889.0] + - - [256, 8976, 1, 33536, 256, 256, 256, 33536] + - [12, 10868.0] + - - [256, 8976, 1, 4352, 256, 256, 256, 4352] + - [41, 10708.0] + - - [256, 8976, 1, 44505, 256, 256, 256, 44505] + - [35, 10979.0] + - - [256, 8976, 1, 4864, 256, 256, 256, 4864] + - [7, 10884.0] + - - [256, 8976, 1, 5376, 256, 256, 256, 5376] + - [21, 10906.0] + - - [256, 8976, 1, 5632, 256, 256, 256, 5632] + - [7, 10888.0] + - - [256, 8976, 1, 5888, 256, 256, 256, 5888] + - [35, 10866.0] + - - [256, 8976, 1, 6144, 256, 256, 256, 6144] + - [21, 10811.0] + - - [256, 8976, 1, 6656, 256, 256, 256, 6656] + - [7, 10913.0] + - - [256, 8976, 1, 7168, 256, 256, 256, 7168] + - [35, 10781.0] + - - [256, 8976, 1, 7424, 256, 256, 256, 7424] + - [21, 10953.0] + - - [256, 8976, 1, 8192, 256, 256, 256, 8192] + - [38, 10578.0] + - - [256, 8976, 1, 8448, 256, 256, 256, 8448] + - [35, 10972.0] + - - [256, 8976, 1, 8960, 256, 256, 256, 8960] + - [35, 10930.0] + - - [256, 8976, 1, 9472, 256, 256, 256, 9472] + - [7, 10895.0] + - - [256, 8976, 1, 9728, 256, 256, 256, 9728] + - [7, 10917.0] + - - [256, 8976, 1, 9984, 256, 256, 256, 9984] + - [21, 10922.0] + - - [3200, 1024, 1, 2048, 3200, 3200, 3200, 2048] + - [1, 12260.0] + - - [4096, 1024, 1, 1, 4096, 4096, 4096, 1] + - [21, 266.0] + - - [1024, 4096, 1, 4096, 1024, 1024, 1024, 4096] + - [22, 12184.0] + - - [1024, 3072, 1, 3072, 1024, 1024, 1024, 3072] + - [8, 11835.0] + - - [1024, 2048, 1, 3072, 1024, 1024, 1024, 3072] + - [35, 11414.0] + - - [30528, 4096, 1, 1024, 30528, 30528, 30528, 1024] + - [22, 12591.0] + - - [30528, 2048, 1, 1024, 30528, 30528, 30528, 1024] + - [22, 12548.0] + - - [512, 32768, 1, 256, 512, 512, 512, 256] + - [22, 12088.0] + - - [256, 32768, 1, 128, 256, 256, 256, 128] + - [15, 11288.0] + - - [1024, 32768, 1, 512, 1024, 1024, 1024, 512] + - [22, 12414.0] + - - [1024, 32768, 1, 1024, 1024, 1024, 1024, 1024] + - [22, 12497.0] + - - [479, 32768, 1, 1024, 479, 479, 479, 1024] + - [11, 11426.0] + - - [289, 128, 64, 768, 289, 289, 289, 768] + - [36, 8466.0] + - - [289, 160, 64, 768, 289, 289, 289, 768] + - [0, 7216.0] + - - [289, 192, 64, 768, 289, 289, 289, 768] + - [3, 8344.0] + - - [3136, 256, 64, 64, 3136, 3136, 3136, 64] + - [0, 10534.0] + - - [784, 512, 64, 128, 784, 784, 784, 128] + - [4, 10112.0] + - - [784, 128, 64, 512, 784, 784, 784, 512] + - [11, 10188.0] + - - [196, 1024, 64, 256, 196, 196, 196, 256] + - [11, 8847.0] + - - [196, 256, 64, 1024, 196, 196, 196, 1024] + - [39, 8909.0] + - - [3136, 256, 32, 64, 3136, 3136, 3136, 64] + - [0, 11004.0] + - - [784, 512, 32, 128, 784, 784, 784, 128] + - [29, 10133.0] + - - [784, 128, 32, 512, 784, 784, 784, 512] + - [35, 9821.0] + - - [196, 1024, 32, 256, 196, 196, 196, 256] + - [32, 8743.0] + - - [256, 6912, 1, 4, 256, 256, 256, 4] + - [3, 1003.0] + - - [512, 4096, 1, 256, 512, 512, 512, 256] + - [15, 10890.0] + - - [1024, 4096, 1, 512, 1024, 1024, 1024, 512] + - [1, 11858.0] + - - [480, 4096, 1, 1024, 480, 480, 480, 1024] + - [21, 10522.0] + - - [512, 6912, 1, 256, 512, 512, 512, 256] + - [1, 11356.0] + - - [1024, 6912, 1, 512, 1024, 1024, 1024, 512] + - [1, 12000.0] + - - [1024, 6912, 1, 1024, 1024, 1024, 1024, 1024] + - [22, 12204.0] + - - [480, 6912, 1, 1024, 480, 480, 480, 1024] + - [22, 11196.0] + - - [256, 55296, 1, 128, 256, 256, 256, 128] + - [0, 11576.0] + - - [512, 55296, 1, 256, 512, 512, 512, 256] + - [25, 12185.0] + - - [1920, 2048, 1, 2048, 1920, 1920, 1920, 2048] + - [8, 12323.0] + - - [2880, 3072, 1, 3072, 2880, 2880, 2880, 3072] + - [1, 12113.0] + - - [3840, 4096, 1, 4096, 3840, 3840, 3840, 4096] + - [1, 12625.0] + - - [7680, 8192, 1, 8192, 7680, 7680, 7680, 8192] + - [4, 12580.0] + - - [2048, 2048, 1, 2048, 2048, 2048, 2048, 2048] + - [1, 12154.0] + - - [3072, 3072, 1, 3072, 3072, 3072, 3072, 3072] + - [1, 12472.0] + - - [4096, 4096, 1, 4096, 4096, 4096, 4096, 4096] + - [8, 12468.0] + - - [8192, 8192, 1, 8192, 8192, 8192, 8192, 8192] + - [11, 12573.0] + - - [1152, 1152, 1, 1152, 1152, 1152, 1152, 1152] + - [15, 10200.0] + - - [1536, 1536, 1, 1536, 1536, 1536, 1536, 1536] + - [28, 11148.0] + - - [1920, 1920, 1, 1920, 1920, 1920, 1920, 1920] + - [16, 11645.0] + - - [2304, 2304, 1, 2304, 2304, 2304, 2304, 2304] + - [1, 11947.0] + - - [2688, 2688, 1, 2688, 2688, 2688, 2688, 2688] + - [16, 12091.0] + - - [3456, 3456, 1, 3456, 3456, 3456, 3456, 3456] + - [16, 12461.0] + - - [3840, 3840, 1, 3840, 3840, 3840, 3840, 3840] + - [22, 12569.0] + - - [4224, 4224, 1, 4224, 4224, 4224, 4224, 4224] + - [1, 12560.0] + - - [4608, 4608, 1, 4608, 4608, 4608, 4608, 4608] + - [1, 12616.0] + - - [4992, 4992, 1, 4992, 4992, 4992, 4992, 4992] + - [29, 12475.0] + - - [5376, 5376, 1, 5376, 5376, 5376, 5376, 5376] + - [36, 12475.0] + - - [5760, 5760, 1, 5760, 5760, 5760, 5760, 5760] + - [29, 12529.0] + - - [6144, 6144, 1, 6144, 6144, 6144, 6144, 6144] + - [27, 12440.0] + - - [6528, 6528, 1, 6528, 6528, 6528, 6528, 6528] + - [16, 12436.0] + - - [6912, 6912, 1, 6912, 6912, 6912, 6912, 6912] + - [16, 12502.0] + - - [7296, 7296, 1, 7296, 7296, 7296, 7296, 7296] + - [16, 12306.0] + - - [7680, 7680, 1, 7680, 7680, 7680, 7680, 7680] + - [39, 12260.0] + - - [1152, 1152, 1, 384, 1152, 1152, 1152, 384] + - [15, 8623.0] + - - [1536, 1536, 1, 384, 1536, 1536, 1536, 384] + - [28, 10503.0] + - - [1920, 1920, 1, 384, 1920, 1920, 1920, 384] + - [16, 11287.0] + - - [2304, 2304, 1, 384, 2304, 2304, 2304, 384] + - [16, 11565.0] + - - [2688, 2688, 1, 384, 2688, 2688, 2688, 384] + - [16, 11797.0] + - - [3072, 3072, 1, 384, 3072, 3072, 3072, 384] + - [1, 12123.0] + - - [3456, 3456, 1, 384, 3456, 3456, 3456, 384] + - [1, 12214.0] + - - [3840, 3840, 1, 384, 3840, 3840, 3840, 384] + - [1, 12402.0] + - - [4224, 4224, 1, 384, 4224, 4224, 4224, 384] + - [16, 12385.0] + - - [4608, 4608, 1, 384, 4608, 4608, 4608, 384] + - [1, 12460.0] + - - [4992, 4992, 1, 384, 4992, 4992, 4992, 384] + - [16, 12407.0] + - - [5376, 5376, 1, 384, 5376, 5376, 5376, 384] + - [16, 12430.0] + - - [5760, 5760, 1, 384, 5760, 5760, 5760, 384] + - [16, 12479.0] + - - [6144, 6144, 1, 384, 6144, 6144, 6144, 384] + - [1, 12481.0] + - - [6528, 6528, 1, 384, 6528, 6528, 6528, 384] + - [1, 12509.0] + - - [6912, 6912, 1, 384, 6912, 6912, 6912, 384] + - [16, 12549.0] + - - [7296, 7296, 1, 384, 7296, 7296, 7296, 384] + - [16, 12569.0] + - - [7680, 7680, 1, 384, 7680, 7680, 7680, 384] + - [1, 12567.0] + - - [8064, 8064, 1, 384, 8064, 8064, 8064, 384] + - [16, 12578.0] + - - [8448, 8448, 1, 384, 8448, 8448, 8448, 384] + - [16, 12579.0] + - - [8832, 8832, 1, 384, 8832, 8832, 8832, 384] + - [16, 12591.0] + - - [9216, 9216, 1, 384, 9216, 9216, 9216, 384] + - [22, 12568.0] + - - [9600, 9600, 1, 384, 9600, 9600, 9600, 384] + - [16, 12580.0] + - - [9984, 9984, 1, 384, 9984, 9984, 9984, 384] + - [16, 12576.0] + - - [10368, 10368, 1, 384, 10368, 10368, 10368, 384] + - [16, 12598.0] + - - [10752, 10752, 1, 384, 10752, 10752, 10752, 384] + - [16, 12580.0] + - - [11136, 11136, 1, 384, 11136, 11136, 11136, 384] + - [16, 12598.0] + - - [11520, 11520, 1, 384, 11520, 11520, 11520, 384] + - [16, 12582.0] + - - [11904, 11904, 1, 384, 11904, 11904, 11904, 384] + - [16, 12599.0] + - - [12288, 12288, 1, 384, 12288, 12288, 12288, 384] + - [1, 12581.0] + - - [12672, 12672, 1, 384, 12672, 12672, 12672, 384] + - [16, 12604.0] + - - [13056, 13056, 1, 384, 13056, 13056, 13056, 384] + - [16, 12595.0] + - - [13440, 13440, 1, 384, 13440, 13440, 13440, 384] + - [1, 12601.0] + - - [13824, 13824, 1, 384, 13824, 13824, 13824, 384] + - [1, 12590.0] + - - [14208, 14208, 1, 384, 14208, 14208, 14208, 384] + - [16, 12610.0] + - - [14592, 14592, 1, 384, 14592, 14592, 14592, 384] + - [16, 12595.0] + - - [14976, 14976, 1, 384, 14976, 14976, 14976, 384] + - [16, 12613.0] + - - [15360, 15360, 1, 384, 15360, 15360, 15360, 384] + - [22, 12598.0] + - - [15744, 15744, 1, 384, 15744, 15744, 15744, 384] + - [16, 12615.0] + - - [16128, 16128, 1, 384, 16128, 16128, 16128, 384] + - [16, 12595.0] + - - [16512, 16512, 1, 384, 16512, 16512, 16512, 384] + - [16, 12613.0] + - - [16896, 16896, 1, 384, 16896, 16896, 16896, 384] + - [16, 12602.0] + - - [17280, 17280, 1, 384, 17280, 17280, 17280, 384] + - [16, 12616.0] + - - [17664, 17664, 1, 384, 17664, 17664, 17664, 384] + - [1, 12436.0] + - - [18048, 18048, 1, 384, 18048, 18048, 18048, 384] + - [1, 12461.0] + - - [18432, 18432, 1, 384, 18432, 18432, 18432, 384] + - [1, 12451.0] + - - [18816, 18816, 1, 384, 18816, 18816, 18816, 384] + - [16, 12475.0] + - - [19200, 19200, 1, 384, 19200, 19200, 19200, 384] + - [1, 12462.0] + - - [19584, 19584, 1, 384, 19584, 19584, 19584, 384] + - [1, 12613.0] + - - [19968, 19968, 1, 384, 19968, 19968, 19968, 384] + - [1, 12474.0] + - - [20352, 20352, 1, 384, 20352, 20352, 20352, 384] + - [16, 12503.0] + - - [20736, 20736, 1, 384, 20736, 20736, 20736, 384] + - [1, 12486.0] + - - [21120, 21120, 1, 384, 21120, 21120, 21120, 384] + - [1, 12503.0] + - - [21504, 21504, 1, 384, 21504, 21504, 21504, 384] + - [1, 12494.0] + - - [21888, 21888, 1, 384, 21888, 21888, 21888, 384] + - [16, 12502.0] + - - [22272, 22272, 1, 384, 22272, 22272, 22272, 384] + - [1, 12503.0] + - - [22656, 22656, 1, 384, 22656, 22656, 22656, 384] + - [1, 12516.0] + - - [23040, 23040, 1, 384, 23040, 23040, 23040, 384] + - [1, 12506.0] + - - [8192, 1024, 1, 1024, 8192, 8192, 8192, 1024] + - [1, 12237.0] + - - [8192, 4096, 1, 1024, 8192, 8192, 8192, 1024] + - [1, 12496.0] + - - [16384, 16384, 1, 16384, 16384, 16384, 16384, 16384] + - [12, 11904.0] + - - [1444, 256, 120, 128, 1444, 1444, 1444, 128] + - [4, 10722.0] + - - [1444, 256, 139, 128, 1444, 1444, 1444, 128] + - [0, 10763.0] + - - [1444, 256, 160, 128, 1444, 1444, 1444, 128] + - [0, 10784.0] + - - [1444, 256, 18, 128, 1444, 1444, 1444, 128] + - [28, 10479.0] + - - [1444, 256, 19, 128, 1444, 1444, 1444, 128] + - [15, 10642.0] + - - [1444, 256, 120, 256, 1444, 1444, 1444, 256] + - [19, 11463.0] + - - [1444, 256, 139, 256, 1444, 1444, 1444, 256] + - [4, 11493.0] + - - [1444, 256, 160, 256, 1444, 1444, 1444, 256] + - [32, 11493.0] + - - [1444, 256, 18, 256, 1444, 1444, 1444, 256] + - [1, 10947.0] + - - [1444, 256, 19, 256, 1444, 1444, 1444, 256] + - [1, 11140.0] + - - [361, 256, 120, 512, 361, 361, 361, 512] + - [11, 10979.0] + - - [361, 256, 139, 512, 361, 361, 361, 512] + - [25, 11114.0] + - - [361, 256, 160, 512, 361, 361, 361, 512] + - [39, 11204.0] + - - [361, 256, 18, 512, 361, 361, 361, 512] + - [35, 10115.0] + - - [361, 256, 19, 512, 361, 361, 361, 512] + - [36, 9901.0] + - - [173280, 128, 1, 64, 173280, 173280, 173280, 64] + - [8, 7894.0] + - - [200716, 128, 1, 64, 200716, 200716, 200716, 64] + - [31, 6645.0] + - - [231040, 128, 1, 64, 231040, 231040, 231040, 64] + - [18, 7949.0] + - - [25992, 128, 1, 64, 25992, 25992, 25992, 64] + - [15, 8510.0] + - - [27436, 128, 1, 64, 27436, 27436, 27436, 64] + - [15, 9452.0] + - - [8192, 7680, 1, 8192, 8192, 8192, 8192, 8192] + - [4, 12249.0] + - - [4096, 3840, 1, 4096, 4096, 4096, 4096, 4096] + - [1, 12575.0] + - - [2048, 1920, 1, 2048, 2048, 2048, 2048, 2048] + - [8, 12201.0] + - - [1024, 1280, 1, 2, 1024, 1024, 1024, 2] + - [3, 314.0] + - - [1024, 1280, 1, 4096, 1024, 1024, 1024, 4096] + - [6, 11871.0] + - - [4096, 1280, 1, 1024, 4096, 4096, 4096, 1024] + - [8, 12206.0] + - - [1024, 4992, 1, 2, 1024, 1024, 1024, 2] + - [2, 366.0] + - - [1024, 4992, 1, 4096, 1024, 1024, 1024, 4096] + - [25, 11992.0] + - - [4096, 4992, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12358.0] + - - [1024, 5120, 1, 2, 1024, 1024, 1024, 2] + - [9, 367.0] + - - [1024, 5120, 1, 1024, 1024, 1024, 1024, 1024] + - [8, 12136.0] + - - [1024, 5120, 1, 4096, 1024, 1024, 1024, 4096] + - [25, 12258.0] + - - [4096, 5120, 1, 1024, 4096, 4096, 4096, 1024] + - [4, 12422.0] + - - [1024, 5248, 1, 2, 1024, 1024, 1024, 2] + - [14, 351.0] + - - [1024, 5248, 1, 1024, 1024, 1024, 1024, 1024] + - [8, 11742.0] + - - [1024, 5248, 1, 4096, 1024, 1024, 1024, 4096] + - [22, 11959.0] + - - [4096, 5248, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12544.0] + - - [1024, 2560, 1, 2, 1024, 1024, 1024, 2] + - [0, 488.0] + - - [1024, 2560, 1, 4096, 1024, 1024, 1024, 4096] + - [6, 12223.0] + - - [4096, 2560, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12434.0] + - - [1024, 1152, 1, 2, 1024, 1024, 1024, 2] + - [35, 324.0] + - - [1024, 1152, 1, 4096, 1024, 1024, 1024, 4096] + - [6, 10551.0] + - - [4096, 1152, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 11751.0] + - - [1024, 8192, 1, 1024, 1024, 1024, 1024, 1024] + - [8, 12181.0] + - - [1024, 8192, 1, 4096, 1024, 1024, 1024, 4096] + - [25, 12166.0] + - - [1024, 8192, 1, 33712, 1024, 1024, 1024, 33712] + - [1, 12234.0] + - - [1024, 9600, 1, 1024, 1024, 1024, 1024, 1024] + - [22, 12347.0] + - - [1024, 9600, 1, 4096, 1024, 1024, 1024, 4096] + - [25, 12411.0] + - - [1024, 9600, 1, 33712, 1024, 1024, 1024, 33712] + - [16, 12462.0] + - - [4096, 8192, 1, 1024, 4096, 4096, 4096, 1024] + - [4, 12445.0] + - - [4096, 9600, 1, 1024, 4096, 4096, 4096, 1024] + - [4, 12499.0] + - - [1024, 10064, 1, 1024, 1024, 1024, 1024, 1024] + - [8, 12176.0] + - - [1024, 10064, 1, 4096, 1024, 1024, 1024, 4096] + - [11, 12242.0] + - - [1024, 10080, 1, 4096, 1024, 1024, 1024, 4096] + - [36, 12312.0] + - - [1024, 10080, 1, 42720, 1024, 1024, 1024, 42720] + - [16, 12194.0] + - - [1024, 6528, 1, 1024, 1024, 1024, 1024, 1024] + - [4, 11898.0] + - - [1024, 6528, 1, 4096, 1024, 1024, 1024, 4096] + - [25, 11967.0] + - - [1024, 6528, 1, 42720, 1024, 1024, 1024, 42720] + - [16, 12034.0] + - - [1024, 7104, 1, 1024, 1024, 1024, 1024, 1024] + - [22, 11818.0] + - - [1024, 7104, 1, 4096, 1024, 1024, 1024, 4096] + - [25, 11944.0] + - - [1024, 7104, 1, 42720, 1024, 1024, 1024, 42720] + - [16, 11976.0] + - - [1024, 8064, 1, 1024, 1024, 1024, 1024, 1024] + - [8, 12011.0] + - - [1024, 8064, 1, 4096, 1024, 1024, 1024, 4096] + - [25, 12039.0] + - - [1024, 9216, 1, 1024, 1024, 1024, 1024, 1024] + - [8, 12273.0] + - - [1024, 9216, 1, 4096, 1024, 1024, 1024, 4096] + - [25, 12332.0] + - - [4096, 10064, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12460.0] + - - [4096, 10080, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12478.0] + - - [4096, 6528, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12495.0] + - - [4096, 7104, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12402.0] + - - [4096, 8064, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12548.0] + - - [4096, 9216, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12504.0] + - - [480, 32768, 1, 1024, 480, 480, 480, 1024] + - [36, 11486.0] + - - [2048, 960, 1, 2048, 2048, 2048, 2048, 2048] + - [7, 11464.0] + - - [2048, 1024, 1, 30592, 2048, 2048, 2048, 30592] + - [18, 11082.0] + - - [2048, 1024, 1, 6144, 2048, 2048, 2048, 6144] + - [7, 11396.0] + - - [2048, 1024, 1, 8192, 2048, 2048, 2048, 8192] + - [7, 11296.0] + - - [8192, 1024, 1, 2048, 8192, 8192, 8192, 2048] + - [1, 12298.0] + - - [1024, 8192, 1, 30592, 1024, 1024, 1024, 30592] + - [6, 12204.0] + - - [1024, 8192, 1, 3072, 1024, 1024, 1024, 3072] + - [22, 12330.0] + - - [512, 512, 256, 64, 512, 512, 512, 64] + - [1, 9493.0] + - - [1024, 2048, 1, 30592, 1024, 1024, 1024, 30592] + - [4, 11087.0] + - - [1024, 4096, 1, 30592, 1024, 1024, 1024, 30592] + - [4, 11981.0] + - - [512, 512, 128, 64, 512, 512, 512, 64] + - [0, 9555.0] + - - [2560, 2048, 1, 1920, 2560, 2560, 2560, 1920] + - [1, 12453.0] + - - [2560, 2048, 1, 2560, 2560, 2560, 2560, 2560] + - [22, 12423.0] + - - [2560, 2048, 1, 7680, 2560, 2560, 2560, 7680] + - [27, 12415.0] + - - [640, 2048, 1, 2560, 640, 640, 640, 2560] + - [12, 11635.0] + - - [512, 512, 40, 64, 512, 512, 512, 64] + - [7, 10110.0] + - - [1536, 4096, 1, 1536, 1536, 1536, 1536, 1536] + - [22, 11983.0] + - - [1536, 4096, 1, 4608, 1536, 1536, 1536, 4608] + - [27, 11953.0] + - - [1536, 4096, 1, 50304, 1536, 1536, 1536, 50304] + - [20, 11925.0] + - - [1536, 4096, 1, 6144, 1536, 1536, 1536, 6144] + - [41, 11932.0] + - - [6144, 4096, 1, 1536, 6144, 6144, 6144, 1536] + - [1, 12595.0] + - - [1024, 1024, 64, 96, 1024, 1024, 1024, 96] + - [11, 11499.0] + - - [1536, 8192, 1, 1536, 1536, 1536, 1536, 1536] + - [22, 12407.0] + - - [1536, 8192, 1, 4608, 1536, 1536, 1536, 4608] + - [1, 12431.0] + - - [1536, 8192, 1, 50304, 1536, 1536, 1536, 50304] + - [11, 12143.0] + - - [1536, 8192, 1, 6144, 1536, 1536, 1536, 6144] + - [36, 12412.0] + - - [6144, 8192, 1, 1536, 6144, 6144, 6144, 1536] + - [1, 12668.0] + - - [1024, 1024, 128, 96, 1024, 1024, 1024, 96] + - [11, 11571.0] + - - [1024, 16384, 1, 1024, 1024, 1024, 1024, 1024] + - [8, 12366.0] + - - [1024, 16384, 1, 3072, 1024, 1024, 1024, 3072] + - [8, 12444.0] + - - [1024, 16384, 1, 4096, 1024, 1024, 1024, 4096] + - [8, 12452.0] + - - [1024, 16384, 1, 50304, 1024, 1024, 1024, 50304] + - [4, 12306.0] + - - [4096, 16384, 1, 1024, 4096, 4096, 4096, 1024] + - [4, 12524.0] + - - [1024, 1024, 256, 64, 1024, 1024, 1024, 64] + - [10, 8929.0] + - - [1024, 2048, 1, 50304, 1024, 1024, 1024, 50304] + - [4, 11076.0] + - - [1024, 1024, 32, 64, 1024, 1024, 1024, 64] + - [0, 10179.0] + - - [1024, 4096, 1, 50304, 1024, 1024, 1024, 50304] + - [4, 12029.0] + - - [1024, 1024, 64, 64, 1024, 1024, 1024, 64] + - [10, 8922.0] + - - [1024, 8192, 1, 50304, 1024, 1024, 1024, 50304] + - [20, 12124.0] + - - [1024, 1024, 128, 64, 1024, 1024, 1024, 64] + - [24, 8925.0] + - - [128, 128, 1024, 64, 128, 128, 128, 64] + - [3, 5527.0] + - - [1024, 8192, 1, 30528, 1024, 1024, 1024, 30528] + - [1, 12205.0] + - - [1024, 3456, 1, 1024, 1024, 1024, 1024, 1024] + - [4, 11649.0] + - - [1024, 3456, 1, 512, 1024, 1024, 1024, 512] + - [4, 11597.0] + - - [256, 6912, 1, 128, 256, 256, 256, 128] + - [0, 9796.0] + - - [480, 3456, 1, 1024, 480, 480, 480, 1024] + - [21, 10358.0] + - - [512, 3456, 1, 256, 512, 512, 512, 256] + - [0, 9899.0] + - - [1024, 1280, 1, 30528, 1024, 1024, 1024, 30528] + - [34, 12107.0] + - - [1024, 1600, 1, 30528, 1024, 1024, 1024, 30528] + - [0, 11475.0] + - - [1024, 10240, 1, 1024, 1024, 1024, 1024, 1024] + - [8, 12467.0] + - - [1024, 10240, 1, 4096, 1024, 1024, 1024, 4096] + - [22, 12485.0] + - - [4096, 10240, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12646.0] + - - [128, 128, 1280, 64, 128, 128, 128, 64] + - [0, 6865.0] + - - [1024, 1640, 1, 30528, 1024, 1024, 1024, 30528] + - [0, 10751.0] + - - [1024, 10496, 1, 1024, 1024, 1024, 1024, 1024] + - [8, 12396.0] + - - [1024, 10496, 1, 4096, 1024, 1024, 1024, 4096] + - [8, 12442.0] + - - [4096, 10496, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12587.0] + - - [128, 128, 1312, 64, 128, 128, 128, 64] + - [0, 6875.0] + - - [1024, 6144, 1, 4096, 1024, 1024, 1024, 4096] + - [8, 12025.0] + - - [4096, 6144, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12581.0] + - - [1024, 6144, 1, 1024, 1024, 1024, 1024, 1024] + - [8, 11908.0] + - - [512, 512, 192, 64, 512, 512, 512, 64] + - [0, 8876.0] + - - [256, 6912, 1, 1, 256, 256, 256, 1] + - [37, 212.0] + - - [3136, 128, 64, 64, 3136, 3136, 3136, 64] + - [3, 7934.0] + - - [3136, 256, 64, 128, 3136, 3136, 3136, 128] + - [4, 11637.0] + - - [784, 512, 64, 256, 784, 784, 784, 256] + - [4, 10641.0] + - - [3136, 128, 64, 256, 3136, 3136, 3136, 256] + - [32, 11831.0] + - - [3136, 256, 64, 256, 3136, 3136, 3136, 256] + - [4, 12034.0] + - - [196, 1024, 64, 512, 196, 196, 196, 512] + - [11, 9171.0] + - - [784, 256, 64, 512, 784, 784, 784, 512] + - [4, 10658.0] + - - [784, 512, 64, 512, 784, 784, 784, 512] + - [19, 10788.0] + - - [196, 512, 64, 1024, 196, 196, 196, 1024] + - [39, 9155.0] + - - [196, 1024, 64, 1024, 196, 196, 196, 1024] + - [25, 9296.0] + - - [3136, 128, 32, 64, 3136, 3136, 3136, 64] + - [28, 10774.0] + - - [3136, 256, 32, 128, 3136, 3136, 3136, 128] + - [0, 11334.0] + - - [784, 512, 32, 256, 784, 784, 784, 256] + - [16, 10507.0] + - - [3136, 128, 32, 256, 3136, 3136, 3136, 256] + - [4, 11435.0] + - - [3136, 256, 32, 256, 3136, 3136, 3136, 256] + - [4, 11844.0] + - - [196, 1024, 32, 512, 196, 196, 196, 512] + - [25, 9032.0] + - - [784, 256, 32, 512, 784, 784, 784, 512] + - [36, 10380.0] + - - [784, 512, 32, 512, 784, 784, 784, 512] + - [4, 10663.0] + - - [196, 512, 32, 1024, 196, 196, 196, 1024] + - [11, 8982.0] + - - [196, 1024, 32, 1024, 196, 196, 196, 1024] + - [39, 9177.0] + - - [1024, 10224, 1, 1024, 1024, 1024, 1024, 1024] + - [22, 12408.0] + - - [1024, 10192, 1, 1024, 1024, 1024, 1024, 1024] + - [8, 12380.0] + - - [1024, 10208, 1, 1024, 1024, 1024, 1024, 1024] + - [22, 12405.0] + - - [1024, 10224, 1, 4096, 1024, 1024, 1024, 4096] + - [22, 12470.0] + - - [1024, 10224, 1, 3072, 1024, 1024, 1024, 3072] + - [22, 12456.0] + - - [4096, 10224, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12598.0] + - - [1024, 10240, 1, 3072, 1024, 1024, 1024, 3072] + - [8, 12476.0] + - - [1024, 10192, 1, 3072, 1024, 1024, 1024, 3072] + - [22, 12433.0] + - - [4096, 10192, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12554.0] + - - [1024, 10192, 1, 4096, 1024, 1024, 1024, 4096] + - [22, 12423.0] + - - [1024, 10200, 1, 3072, 1024, 1024, 1024, 3072] + - [8, 12417.0] + - - [1024, 10184, 1, 1024, 1024, 1024, 1024, 1024] + - [22, 12341.0] + - - [4096, 10208, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12578.0] + - - [1024, 10208, 1, 3072, 1024, 1024, 1024, 3072] + - [22, 12440.0] + - - [1024, 10208, 1, 4096, 1024, 1024, 1024, 4096] + - [22, 12456.0] + - - [1024, 10224, 1, 2048, 1024, 1024, 1024, 2048] + - [22, 12423.0] + - - [1024, 10240, 1, 2048, 1024, 1024, 1024, 2048] + - [22, 12462.0] + - - [1024, 10120, 1, 1024, 1024, 1024, 1024, 1024] + - [22, 12285.0] + - - [1024, 10192, 1, 2048, 1024, 1024, 1024, 2048] + - [8, 12390.0] + - - [1024, 10152, 1, 1024, 1024, 1024, 1024, 1024] + - [22, 12357.0] + - - [1024, 10080, 1, 3072, 1024, 1024, 1024, 3072] + - [22, 12314.0] + - - [100352, 512, 1, 256, 100352, 100352, 100352, 256] + - [32, 12361.0] + - - [12544, 2048, 1, 1024, 12544, 12544, 12544, 1024] + - [1, 12545.0] + - - [200704, 512, 1, 256, 200704, 200704, 200704, 256] + - [32, 12401.0] + - - [25088, 1024, 1, 512, 25088, 25088, 25088, 512] + - [1, 12427.0] + - - [50176, 1024, 1, 512, 50176, 50176, 50176, 512] + - [19, 12451.0] + - - [6272, 2048, 1, 1024, 6272, 6272, 6272, 1024] + - [1, 12325.0] + - - [196, 1024, 128, 256, 196, 196, 196, 256] + - [11, 9033.0] + - - [196, 1024, 256, 256, 196, 196, 196, 256] + - [11, 9129.0] + - - [196, 256, 128, 1024, 196, 196, 196, 1024] + - [39, 9096.0] + - - [196, 256, 256, 1024, 196, 196, 196, 1024] + - [39, 9227.0] + - - [196, 512, 128, 1024, 196, 196, 196, 1024] + - [39, 9257.0] + - - [196, 512, 256, 1024, 196, 196, 196, 1024] + - [11, 9398.0] + - - [3136, 128, 128, 256, 3136, 3136, 3136, 256] + - [32, 11982.0] + - - [3136, 128, 256, 256, 3136, 3136, 3136, 256] + - [32, 12071.0] + - - [784, 256, 128, 512, 784, 784, 784, 512] + - [4, 10774.0] + - - [784, 256, 256, 512, 784, 784, 784, 512] + - [4, 10843.0] + - - [128, 128, 2048, 64, 128, 128, 128, 64] + - [0, 6076.0] + - - [1024, 2560, 1, 30528, 1024, 1024, 1024, 30528] + - [20, 12350.0] + - - [128, 128, 1536, 64, 128, 128, 128, 64] + - [0, 6832.0] + - - [1024, 12288, 1, 4096, 1024, 1024, 1024, 4096] + - [22, 12390.0] + - - [1024, 12288, 1, 1024, 1024, 1024, 1024, 1024] + - [22, 12299.0] + - - [4096, 12288, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12613.0] + - - [1024, 1920, 1, 30528, 1024, 1024, 1024, 30528] + - [8, 11975.0] + - - [128, 128, 192, 64, 128, 128, 128, 64] + - [17, 8626.0] + - - [384, 384, 144, 64, 384, 384, 384, 64] + - [0, 11270.0] + - - [768, 4608, 1, 2, 768, 768, 768, 2] + - [30, 493.0] + - - [3072, 4608, 1, 768, 3072, 3072, 3072, 768] + - [1, 12322.0] + - - [768, 4608, 1, 3072, 768, 768, 768, 3072] + - [8, 12039.0] + - - [768, 4608, 1, 768, 768, 768, 768, 768] + - [1, 11791.0] + - - [512, 512, 48, 64, 512, 512, 512, 64] + - [28, 10868.0] + - - [128, 128, 256, 64, 128, 128, 128, 64] + - [33, 9412.0] + - - [384, 384, 192, 64, 384, 384, 384, 64] + - [0, 11369.0] + - - [1024, 4608, 1, 2, 1024, 1024, 1024, 2] + - [20, 488.0] + - - [4096, 4608, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12523.0] + - - [1024, 4608, 1, 4096, 1024, 1024, 1024, 4096] + - [22, 11967.0] + - - [1024, 4608, 1, 1024, 1024, 1024, 1024, 1024] + - [8, 11693.0] + - - [3072, 256, 2, 1024, 3072, 3072, 3072, 1024] + - [1, 11054.0] + - - [2852, 256, 2, 1024, 2852, 2852, 2852, 1024] + - [1, 10209.0] + - - [3220, 256, 2, 1024, 3220, 3220, 3220, 1024] + - [15, 10082.0] + - - [850, 2048, 2, 512, 850, 850, 850, 512] + - [15, 10384.0] + - - [768, 2048, 2, 512, 768, 768, 768, 512] + - [4, 10922.0] + - - [2904, 256, 2, 1024, 2904, 2904, 2904, 1024] + - [30, 9976.0] + - - [805, 2048, 2, 512, 805, 805, 805, 512] + - [21, 10072.0] + - - [864, 2048, 2, 512, 864, 864, 864, 512] + - [0, 10849.0] + - - [2992, 256, 2, 1024, 2992, 2992, 2992, 1024] + - [1, 10744.0] + - - [3400, 256, 2, 1024, 3400, 3400, 3400, 1024] + - [15, 10598.0] + - - [4032, 256, 2, 1024, 4032, 4032, 4032, 1024] + - [7, 11016.0] + - - [15200, 128, 2, 512, 15200, 15200, 15200, 512] + - [29, 11539.0] + - - [12288, 128, 2, 512, 12288, 12288, 12288, 512] + - [1, 11428.0] + - - [888, 2048, 2, 512, 888, 888, 888, 512] + - [15, 11118.0] + - - [13600, 128, 2, 512, 13600, 13600, 13600, 512] + - [22, 11304.0] + - - [12880, 128, 2, 512, 12880, 12880, 12880, 512] + - [15, 10961.0] + - - [3456, 256, 2, 1024, 3456, 3456, 3456, 1024] + - [28, 11167.0] + - - [2944, 256, 2, 1024, 2944, 2944, 2944, 1024] + - [4, 10340.0] + - - [2688, 256, 2, 1024, 2688, 2688, 2688, 1024] + - [15, 10226.0] + - - [13824, 128, 2, 512, 13824, 13824, 13824, 512] + - [19, 11322.0] + - - [3036, 256, 2, 1024, 3036, 3036, 3036, 1024] + - [25, 10428.0] + - - [3168, 256, 2, 1024, 3168, 3168, 3168, 1024] + - [19, 10846.0] + - - [3360, 256, 2, 1024, 3360, 3360, 3360, 1024] + - [28, 10491.0] + - - [3552, 256, 2, 1024, 3552, 3552, 3552, 1024] + - [8, 10398.0] + - - [11616, 128, 2, 512, 11616, 11616, 11616, 512] + - [28, 10592.0] + - - [4200, 256, 2, 1024, 4200, 4200, 4200, 1024] + - [22, 10613.0] + - - [840, 2048, 2, 512, 840, 840, 840, 512] + - [15, 10316.0] + - - [14208, 128, 2, 512, 14208, 14208, 14208, 512] + - [28, 11169.0] + - - [11968, 128, 2, 512, 11968, 11968, 11968, 512] + - [15, 11135.0] + - - [3264, 256, 2, 1024, 3264, 3264, 3264, 1024] + - [15, 10442.0] + - - [713, 2048, 2, 512, 713, 713, 713, 512] + - [0, 9987.0] + - - [13600, 256, 2, 512, 13600, 13600, 13600, 512] + - [4, 11576.0] + - - [12880, 256, 2, 512, 12880, 12880, 12880, 512] + - [1, 11489.0] + - - [12288, 256, 2, 512, 12288, 12288, 12288, 512] + - [1, 11729.0] + - - [2816, 256, 2, 1024, 2816, 2816, 2816, 1024] + - [28, 10891.0] + - - [850, 2048, 1, 512, 850, 850, 850, 512] + - [15, 9545.0] + - - [660, 2048, 2, 512, 660, 660, 660, 512] + - [4, 9336.0] + - - [672, 2048, 2, 512, 672, 672, 672, 512] + - [4, 9497.0] + - - [13440, 128, 2, 512, 13440, 13440, 13440, 512] + - [28, 11233.0] + - - [726, 2048, 2, 512, 726, 726, 726, 512] + - [4, 10206.0] + - - [3500, 256, 2, 1024, 3500, 3500, 3500, 1024] + - [4, 10325.0] + - - [13824, 256, 2, 512, 13824, 13824, 13824, 512] + - [4, 11944.0] + - - [15200, 256, 2, 512, 15200, 15200, 15200, 512] + - [32, 11866.0] + - - [3700, 256, 2, 1024, 3700, 3700, 3700, 1024] + - [1, 11176.0] + - - [748, 2048, 2, 512, 748, 748, 748, 512] + - [22, 10859.0] + - - [3600, 256, 2, 1024, 3600, 3600, 3600, 1024] + - [8, 10866.0] + - - [4032, 1024, 2, 256, 4032, 4032, 4032, 256] + - [8, 11624.0] + - - [16128, 128, 2, 512, 16128, 16128, 16128, 512] + - [11, 11511.0] + - - [15200, 128, 1, 512, 15200, 15200, 15200, 512] + - [8, 10861.0] + - - [13600, 128, 1, 512, 13600, 13600, 13600, 512] + - [28, 10585.0] + - - [2904, 1024, 2, 256, 2904, 2904, 2904, 256] + - [16, 10928.0] + - - [2992, 1024, 2, 256, 2992, 2992, 2992, 256] + - [35, 10762.0] + - - [1536, 2048, 1, 1024, 1536, 1536, 1536, 1024] + - [4, 11288.0] + - - [24576, 128, 1, 256, 24576, 24576, 24576, 256] + - [5, 10641.0] + - - [24576, 512, 1, 256, 24576, 24576, 24576, 256] + - [4, 11914.0] + - - [25760, 128, 1, 256, 25760, 25760, 25760, 256] + - [0, 10565.0] + - - [25760, 512, 1, 256, 25760, 25760, 25760, 256] + - [36, 11823.0] + - - [6144, 256, 1, 512, 6144, 6144, 6144, 512] + - [8, 10960.0] + - - [6440, 256, 1, 512, 6440, 6440, 6440, 512] + - [28, 10080.0] + - - [3036, 1024, 2, 256, 3036, 3036, 3036, 256] + - [1, 11265.0] + - - [13600, 512, 1, 128, 13600, 13600, 13600, 128] + - [35, 10995.0] + - - [9408, 512, 2, 128, 9408, 9408, 9408, 128] + - [7, 11229.0] + - - [56000, 256, 2, 64, 56000, 56000, 56000, 64] + - [8, 8556.0] + - - [2852, 1024, 2, 256, 2852, 2852, 2852, 256] + - [8, 11117.0] + - - [2816, 1024, 2, 256, 2816, 2816, 2816, 256] + - [8, 11690.0] + - - [60800, 256, 1, 64, 60800, 60800, 60800, 64] + - [28, 11377.0] + - - [2944, 1024, 2, 256, 2944, 2944, 2944, 256] + - [36, 11663.0] + - - [11776, 512, 2, 128, 11776, 11776, 11776, 128] + - [8, 11731.0] + - - [11616, 512, 2, 128, 11616, 11616, 11616, 128] + - [0, 11496.0] + - - [4200, 1024, 2, 256, 4200, 4200, 4200, 256] + - [8, 11723.0] + - - [54400, 256, 1, 64, 54400, 54400, 54400, 64] + - [28, 11357.0] + - - [15200, 512, 1, 128, 15200, 15200, 15200, 128] + - [35, 11185.0] + - - [2688, 1024, 2, 256, 2688, 2688, 2688, 256] + - [1, 11841.0] + - - [12672, 512, 2, 128, 12672, 12672, 12672, 128] + - [22, 11823.0] + - - [11968, 512, 2, 128, 11968, 11968, 11968, 128] + - [1, 11606.0] + - - [46464, 256, 2, 64, 46464, 46464, 46464, 64] + - [0, 11345.0] + - - [2400, 256, 2, 1024, 2400, 2400, 2400, 1024] + - [37, 10572.0] + - - [2520, 256, 2, 1024, 2520, 2520, 2520, 1024] + - [20, 11250.0] + - - [2400, 1024, 2, 256, 2400, 2400, 2400, 256] + - [8, 11151.0] + - - [10752, 128, 2, 512, 10752, 10752, 10752, 512] + - [28, 11258.0] + - - [45632, 256, 2, 64, 45632, 45632, 45632, 64] + - [15, 10980.0] + - - [2520, 1024, 2, 256, 2520, 2520, 2520, 256] + - [8, 11388.0] + - - [53760, 256, 2, 64, 53760, 53760, 53760, 64] + - [0, 11353.0] + - - [2352, 256, 2, 1024, 2352, 2352, 2352, 1024] + - [30, 10422.0] + - - [47872, 256, 2, 64, 47872, 47872, 47872, 64] + - [15, 10548.0] + - - [47104, 256, 2, 64, 47104, 47104, 47104, 64] + - [15, 10351.0] + - - [50688, 256, 2, 64, 50688, 50688, 50688, 64] + - [0, 9511.0] + - - [45056, 256, 2, 64, 45056, 45056, 45056, 64] + - [15, 10865.0] + - - [13440, 512, 2, 128, 13440, 13440, 13440, 128] + - [8, 11802.0] + - - [2352, 1024, 2, 256, 2352, 2352, 2352, 256] + - [0, 10924.0] + - - [11264, 512, 2, 128, 11264, 11264, 11264, 128] + - [15, 11434.0] + - - [10560, 128, 2, 512, 10560, 10560, 10560, 512] + - [7, 11011.0] + - - [16128, 512, 2, 128, 16128, 16128, 16128, 128] + - [29, 11823.0] + - - [37632, 256, 2, 64, 37632, 37632, 37632, 64] + - [15, 11344.0] + - - [51520, 256, 2, 64, 51520, 51520, 51520, 64] + - [0, 11235.0] + - - [14000, 512, 2, 128, 14000, 14000, 14000, 128] + - [16, 11694.0] + - - [10560, 512, 2, 128, 10560, 10560, 10560, 128] + - [36, 11310.0] + - - [64512, 256, 2, 64, 64512, 64512, 64512, 64] + - [0, 11201.0] + - - [54400, 256, 2, 64, 54400, 54400, 54400, 64] + - [8, 8742.0] + - - [3264, 1024, 2, 256, 3264, 3264, 3264, 256] + - [1, 11566.0] + - - [10752, 512, 2, 128, 10752, 10752, 10752, 128] + - [8, 11672.0] + - - [3168, 1024, 2, 256, 3168, 3168, 3168, 256] + - [1, 11817.0] + - - [950, 2048, 1, 512, 950, 950, 950, 512] + - [15, 10327.0] + - - [55296, 256, 2, 256, 55296, 55296, 55296, 256] + - [19, 12207.0] + - - [51520, 256, 2, 256, 51520, 51520, 51520, 256] + - [4, 12088.0] + - - [11408, 128, 2, 512, 11408, 11408, 11408, 512] + - [16, 11388.0] + - - [60800, 256, 2, 256, 60800, 60800, 60800, 256] + - [32, 12312.0] + - - [54400, 256, 2, 256, 54400, 54400, 54400, 256] + - [19, 12299.0] + - - [3700, 1024, 2, 256, 3700, 3700, 3700, 256] + - [36, 11454.0] + - - [60800, 256, 2, 64, 60800, 60800, 60800, 64] + - [0, 11070.0] + - - [3800, 1024, 1, 256, 3800, 3800, 3800, 256] + - [1, 11287.0] + - - [3400, 1024, 1, 256, 3400, 3400, 3400, 256] + - [1, 11147.0] + - - [3072, 1024, 2, 256, 3072, 3072, 3072, 256] + - [4, 11447.0] + - - [3600, 1024, 2, 256, 3600, 3600, 3600, 256] + - [36, 11264.0] + - - [12288, 512, 2, 128, 12288, 12288, 12288, 128] + - [21, 11450.0] + - - [49152, 256, 2, 256, 49152, 49152, 49152, 256] + - [25, 12023.0] + - - [12880, 512, 2, 128, 12880, 12880, 12880, 128] + - [28, 11418.0] + - - [11408, 512, 2, 128, 11408, 11408, 11408, 128] + - [29, 11638.0] + - - [42240, 256, 2, 64, 42240, 42240, 42240, 64] + - [15, 11342.0] + - - [1008, 2048, 2, 512, 1008, 1008, 1008, 512] + - [1, 11567.0] + - - [3360, 1024, 2, 256, 3360, 3360, 3360, 256] + - [1, 11401.0] + - - [14208, 512, 2, 128, 14208, 14208, 14208, 128] + - [22, 11732.0] + - - [56832, 256, 2, 64, 56832, 56832, 56832, 64] + - [0, 9429.0] + - - [43008, 256, 2, 64, 43008, 43008, 43008, 64] + - [15, 11181.0] + - - [13600, 512, 2, 128, 13600, 13600, 13600, 128] + - [1, 11712.0] + - - [3500, 1024, 2, 256, 3500, 3500, 3500, 256] + - [1, 11369.0] + - - [2640, 1024, 2, 256, 2640, 2640, 2640, 256] + - [8, 11542.0] + - - [13824, 512, 2, 128, 13824, 13824, 13824, 128] + - [22, 11763.0] + - - [3800, 256, 2, 1024, 3800, 3800, 3800, 1024] + - [1, 11433.0] + - - [55296, 256, 2, 64, 55296, 55296, 55296, 64] + - [0, 10276.0] + - - [2640, 256, 2, 1024, 2640, 2640, 2640, 1024] + - [15, 10194.0] + - - [15200, 512, 2, 128, 15200, 15200, 15200, 128] + - [36, 11662.0] + - - [3552, 1024, 2, 256, 3552, 3552, 3552, 256] + - [8, 11657.0] + - - [3220, 1024, 2, 256, 3220, 3220, 3220, 256] + - [1, 11379.0] + - - [3456, 1024, 2, 256, 3456, 3456, 3456, 256] + - [1, 11910.0] + - - [49152, 256, 2, 64, 49152, 49152, 49152, 64] + - [0, 9654.0] + - - [3400, 1024, 2, 256, 3400, 3400, 3400, 256] + - [8, 11480.0] + - - [950, 2048, 2, 512, 950, 950, 950, 512] + - [22, 10967.0] + - - [3800, 1024, 2, 256, 3800, 3800, 3800, 256] + - [16, 11780.0] + - - [1610, 2048, 1, 1024, 1610, 1610, 1610, 1024] + - [15, 11091.0] + - - [6912, 256, 1, 512, 6912, 6912, 6912, 512] + - [15, 11013.0] + - - [6800, 256, 1, 512, 6800, 6800, 6800, 512] + - [15, 10636.0] + - - [27648, 128, 1, 256, 27648, 27648, 27648, 256] + - [16, 11339.0] + - - [27200, 128, 1, 256, 27200, 27200, 27200, 256] + - [0, 10947.0] + - - [30400, 128, 1, 256, 30400, 30400, 30400, 256] + - [22, 11123.0] + - - [7600, 256, 1, 512, 7600, 7600, 7600, 512] + - [22, 10944.0] + - - [6144, 1024, 1, 512, 6144, 6144, 6144, 512] + - [1, 11770.0] + - - [6912, 1024, 1, 512, 6912, 6912, 6912, 512] + - [1, 12056.0] + - - [6440, 1024, 1, 512, 6440, 6440, 6440, 512] + - [36, 11658.0] + - - [27648, 512, 1, 256, 27648, 27648, 27648, 256] + - [8, 11976.0] + - - [1728, 2048, 1, 1024, 1728, 1728, 1728, 1024] + - [1, 10981.0] + - - [27200, 512, 1, 256, 27200, 27200, 27200, 256] + - [8, 11899.0] + - - [6800, 1024, 1, 512, 6800, 6800, 6800, 512] + - [36, 11719.0] + - - [1700, 2048, 1, 1024, 1700, 1700, 1700, 1024] + - [1, 10772.0] + - - [7600, 1024, 1, 512, 7600, 7600, 7600, 512] + - [1, 12161.0] + - - [30400, 512, 1, 256, 30400, 30400, 30400, 256] + - [16, 12035.0] + - - [1900, 2048, 1, 1024, 1900, 1900, 1900, 1024] + - [8, 12000.0] + - - [12544, 1024, 1, 1024, 12544, 12544, 12544, 1024] + - [1, 12307.0] + - - [1024, 1024, 160, 96, 1024, 1024, 1024, 96] + - [11, 11558.0] + - - [1920, 16384, 1, 25216, 1920, 1920, 1920, 25216] + - [16, 12461.0] + - - [3840, 16384, 1, 1920, 3840, 3840, 3840, 1920] + - [1, 12643.0] + - - [1920, 16384, 1, 3840, 1920, 1920, 1920, 3840] + - [25, 11891.0] + - - [960, 16384, 1, 1920, 960, 960, 960, 1920] + - [22, 11618.0] + - - [1920, 16384, 1, 2880, 1920, 1920, 1920, 2880] + - [16, 12688.0] + - - [1024, 1024, 40, 96, 1024, 1024, 1024, 96] + - [0, 11554.0] + - - [1920, 4096, 1, 25216, 1920, 1920, 1920, 25216] + - [16, 12376.0] + - - [3840, 4096, 1, 1920, 3840, 3840, 3840, 1920] + - [16, 12601.0] + - - [1920, 4096, 1, 3840, 1920, 1920, 1920, 3840] + - [19, 12399.0] + - - [960, 4096, 1, 1920, 960, 960, 960, 1920] + - [16, 11383.0] + - - [1920, 4096, 1, 2880, 1920, 1920, 1920, 2880] + - [16, 12578.0] + - - [1024, 1024, 80, 96, 1024, 1024, 1024, 96] + - [11, 11532.0] + - - [1920, 8192, 1, 25216, 1920, 1920, 1920, 25216] + - [39, 12258.0] + - - [3840, 8192, 1, 1920, 3840, 3840, 3840, 1920] + - [1, 12652.0] + - - [1920, 8192, 1, 3840, 1920, 1920, 1920, 3840] + - [19, 12494.0] + - - [960, 8192, 1, 1920, 960, 960, 960, 1920] + - [16, 11558.0] + - - [1920, 8192, 1, 2880, 1920, 1920, 1920, 2880] + - [16, 12650.0] + - - [1024, 1024, 96, 96, 1024, 1024, 1024, 96] + - [11, 11530.0] + - - [2304, 16384, 1, 12672, 2304, 2304, 2304, 12672] + - [16, 12333.0] + - - [2304, 16384, 1, 2304, 2304, 2304, 2304, 2304] + - [8, 12582.0] + - - [576, 16384, 1, 2304, 576, 576, 576, 2304] + - [36, 11165.0] + - - [2304, 16384, 1, 1728, 2304, 2304, 2304, 1728] + - [16, 12618.0] + - - [1024, 1024, 24, 96, 1024, 1024, 1024, 96] + - [15, 11525.0] + - - [2304, 4096, 1, 12672, 2304, 2304, 2304, 12672] + - [16, 12316.0] + - - [2304, 4096, 1, 2304, 2304, 2304, 2304, 2304] + - [1, 12466.0] + - - [576, 4096, 1, 2304, 576, 576, 576, 2304] + - [36, 10918.0] + - - [2304, 4096, 1, 1728, 2304, 2304, 2304, 1728] + - [29, 12496.0] + - - [1024, 1024, 48, 96, 1024, 1024, 1024, 96] + - [11, 11489.0] + - - [2304, 8192, 1, 12672, 2304, 2304, 2304, 12672] + - [16, 12364.0] + - - [2304, 8192, 1, 2304, 2304, 2304, 2304, 2304] + - [22, 12504.0] + - - [576, 8192, 1, 2304, 576, 576, 576, 2304] + - [36, 11081.0] + - - [2304, 8192, 1, 1728, 2304, 2304, 2304, 1728] + - [29, 12588.0] + - - [1024, 1024, 16, 96, 1024, 1024, 1024, 96] + - [15, 11529.0] + - - [3072, 4096, 1, 6400, 3072, 3072, 3072, 6400] + - [1, 12448.0] + - - [1536, 4096, 1, 3072, 1536, 1536, 1536, 3072] + - [22, 12010.0] + - - [3072, 4096, 1, 1536, 3072, 3072, 3072, 1536] + - [1, 12349.0] + - - [384, 4096, 1, 3072, 384, 384, 384, 3072] + - [8, 11264.0] + - - [3072, 4096, 1, 1152, 3072, 3072, 3072, 1152] + - [1, 12323.0] + - - [1024, 1024, 32, 96, 1024, 1024, 1024, 96] + - [11, 11311.0] + - - [3072, 8192, 1, 6400, 3072, 3072, 3072, 6400] + - [1, 12597.0] + - - [1536, 8192, 1, 3072, 1536, 1536, 1536, 3072] + - [25, 12286.0] + - - [3072, 8192, 1, 1536, 3072, 3072, 3072, 1536] + - [1, 12479.0] + - - [384, 8192, 1, 3072, 384, 384, 384, 3072] + - [41, 11694.0] + - - [3072, 8192, 1, 1152, 3072, 3072, 3072, 1152] + - [1, 12563.0] + - - [2048, 4096, 1, 2048, 2048, 2048, 2048, 2048] + - [8, 12319.0] + - - [2048, 4096, 1, 4096, 2048, 2048, 2048, 4096] + - [11, 12223.0] + - - [4096, 4096, 1, 2048, 4096, 4096, 4096, 2048] + - [8, 12361.0] + - - [1024, 2283, 1, 29000, 1024, 1024, 1024, 29000] + - [28, 11103.0] + - - [1024, 2296, 1, 29000, 1024, 1024, 1024, 29000] + - [28, 11147.0] + - - [1024, 2306, 1, 29000, 1024, 1024, 1024, 29000] + - [0, 11155.0] + - - [1024, 2309, 1, 29000, 1024, 1024, 1024, 29000] + - [0, 11188.0] + - - [1024, 2318, 1, 29000, 1024, 1024, 1024, 29000] + - [0, 11208.0] + - - [1024, 2320, 1, 29000, 1024, 1024, 1024, 29000] + - [28, 11216.0] + - - [1024, 2324, 1, 29000, 1024, 1024, 1024, 29000] + - [7, 11256.0] + - - [1024, 2325, 1, 29000, 1024, 1024, 1024, 29000] + - [0, 11272.0] + - - [1024, 2329, 1, 29000, 1024, 1024, 1024, 29000] + - [0, 11285.0] + - - [1024, 2338, 1, 29000, 1024, 1024, 1024, 29000] + - [0, 11324.0] + - - [1024, 2345, 1, 29000, 1024, 1024, 1024, 29000] + - [0, 11348.0] + - - [1024, 2350, 1, 29000, 1024, 1024, 1024, 29000] + - [0, 11365.0] + - - [1024, 2362, 1, 29000, 1024, 1024, 1024, 29000] + - [28, 11426.0] + - - [1024, 2366, 1, 29000, 1024, 1024, 1024, 29000] + - [0, 11444.0] + - - [1024, 2368, 1, 29000, 1024, 1024, 1024, 29000] + - [0, 11450.0] + - - [1024, 2374, 1, 29000, 1024, 1024, 1024, 29000] + - [20, 11463.0] + - - [1024, 2390, 1, 29000, 1024, 1024, 1024, 29000] + - [6, 11538.0] + - - [512, 512, 320, 64, 512, 512, 512, 64] + - [0, 9258.0] + - - [512, 512, 80, 64, 512, 512, 512, 64] + - [15, 11060.0] + - - [2560, 1024, 1, 2560, 2560, 2560, 2560, 2560] + - [1, 12209.0] + - - [2560, 1024, 1, 4096, 2560, 2560, 2560, 4096] + - [6, 12262.0] + - - [4096, 1024, 1, 2560, 4096, 4096, 4096, 2560] + - [8, 12195.0] + - - [1024, 1024, 512, 64, 1024, 1024, 1024, 64] + - [10, 8918.0] + - - [1024, 32768, 1, 3072, 1024, 1024, 1024, 3072] + - [8, 12565.0] + - - [1024, 32768, 1, 4096, 1024, 1024, 1024, 4096] + - [11, 11635.0] + - - [1024, 32768, 1, 50304, 1024, 1024, 1024, 50304] + - [12, 12013.0] + - - [4096, 32768, 1, 1024, 4096, 4096, 4096, 1024] + - [11, 12043.0] + - - [1024, 1024, 24, 128, 1024, 1024, 1024, 128] + - [11, 11710.0] + - - [128, 1024, 24, 1024, 128, 128, 128, 1024] + - [11, 10990.0] + - - [768, 320, 1, 30522, 768, 768, 768, 30522] + - [42, 10963.0] + - - [768, 640, 1, 30522, 768, 768, 768, 30522] + - [43, 11804.0] + - - [768, 1280, 1, 30522, 768, 768, 768, 30522] + - [43, 12194.0] + - - [1024, 780, 1, 30522, 1024, 1024, 1024, 30522] + - [42, 10725.0] + - - [1024, 308, 1, 30522, 1024, 1024, 1024, 30522] + - [42, 10767.0] + - - [1024, 800, 1, 30522, 1024, 1024, 1024, 30522] + - [42, 10977.0] + - - [1024, 820, 1, 30522, 1024, 1024, 1024, 30522] + - [42, 11225.0] + - - [1024, 385, 1, 30522, 1024, 1024, 1024, 30522] + - [45, 9220.0] + - - [1024, 462, 1, 30522, 1024, 1024, 1024, 30522] + - [42, 10127.0] + - - [1024, 640, 1, 30528, 1024, 1024, 1024, 30528] + - [43, 11969.0] + - - [2048, 199, 1, 29000, 2048, 2048, 2048, 29000] + - [44, 8704.0] + - - [2048, 221, 1, 29000, 2048, 2048, 2048, 29000] + - [44, 9632.0] + - - [2048, 224, 1, 29000, 2048, 2048, 2048, 29000] + - [44, 9771.0] + - - [2048, 229, 1, 29000, 2048, 2048, 2048, 29000] + - [46, 9987.0] + - - [2048, 234, 1, 29000, 2048, 2048, 2048, 29000] + - [45, 10201.0] + - - [2048, 242, 1, 29000, 2048, 2048, 2048, 29000] + - [45, 10546.0] + - - [2048, 246, 1, 29000, 2048, 2048, 2048, 29000] + - [45, 10715.0] + - - [2048, 247, 1, 29000, 2048, 2048, 2048, 29000] + - [45, 10749.0] + - - [2048, 256, 1, 29000, 2048, 2048, 2048, 29000] + - [45, 11135.0] + - - [2048, 262, 1, 29000, 2048, 2048, 2048, 29000] + - [45, 9366.0] + - - [2048, 264, 1, 29000, 2048, 2048, 2048, 29000] + - [45, 9441.0] + - - [2048, 265, 1, 29000, 2048, 2048, 2048, 29000] + - [45, 9475.0] + - - [2048, 274, 1, 29000, 2048, 2048, 2048, 29000] + - [45, 9783.0] + - - [2048, 277, 1, 29000, 2048, 2048, 2048, 29000] + - [45, 9888.0] + - - [2048, 279, 1, 29000, 2048, 2048, 2048, 29000] + - [45, 9963.0] + - - [2048, 288, 1, 29000, 2048, 2048, 2048, 29000] + - [45, 10279.0] + - - [2048, 296, 1, 29000, 2048, 2048, 2048, 29000] + - [45, 10558.0] + - - [2048, 315, 1, 29000, 2048, 2048, 2048, 29000] + - [45, 11204.0] + - - [2048, 335, 1, 29000, 2048, 2048, 2048, 29000] + - [43, 10202.0] + - - [1024, 561, 1, 29000, 1024, 1024, 1024, 29000] + - [42, 10722.0] + - - [1024, 574, 1, 29000, 1024, 1024, 1024, 29000] + - [42, 10965.0] + - - [1024, 600, 1, 29000, 1024, 1024, 1024, 29000] + - [43, 11254.0] + - - [1024, 608, 1, 29000, 1024, 1024, 1024, 29000] + - [43, 11388.0] + - - [1024, 615, 1, 29000, 1024, 1024, 1024, 29000] + - [43, 11520.0] + - - [1024, 622, 1, 29000, 1024, 1024, 1024, 29000] + - [43, 11651.0] + - - [1024, 625, 1, 29000, 1024, 1024, 1024, 29000] + - [43, 11701.0] + - - [1024, 626, 1, 29000, 1024, 1024, 1024, 29000] + - [43, 11716.0] + - - [1024, 628, 1, 29000, 1024, 1024, 1024, 29000] + - [43, 11763.0] + - - [1024, 636, 1, 29000, 1024, 1024, 1024, 29000] + - [43, 11889.0] + - - [1024, 651, 1, 29000, 1024, 1024, 1024, 29000] + - [42, 10428.0] + - - [1024, 658, 1, 29000, 1024, 1024, 1024, 29000] + - [42, 10533.0] + - - [1024, 669, 1, 29000, 1024, 1024, 1024, 29000] + - [42, 10703.0] + - - [1024, 670, 1, 29000, 1024, 1024, 1024, 29000] + - [42, 10724.0] + - - [1024, 672, 1, 29000, 1024, 1024, 1024, 29000] + - [42, 10765.0] + - - [1024, 684, 1, 29000, 1024, 1024, 1024, 29000] + - [42, 10928.0] + - - [1024, 716, 1, 29000, 1024, 1024, 1024, 29000] + - [43, 10886.0] + - - [1024, 730, 1, 29000, 1024, 1024, 1024, 29000] + - [43, 11115.0] + - - [1600, 512, 1, 1024, 1600, 1600, 1600, 1024] + - [51, 8987.0] + - - [1024, 512, 1, 1, 1024, 1024, 1024, 1] + - [97, 131.0] + - - [1024, 512, 1, 64, 1024, 1024, 1024, 64] + - [48, 4014.0] + - - [2048, 512, 1, 1, 2048, 2048, 2048, 1] + - [80, 146.0] + - - [768, 640, 1, 768, 768, 768, 768, 768] + - [50, 9149.0] + - - [768, 1024, 1, 2, 768, 768, 768, 2] + - [82, 260.0] + - - [768, 1024, 1, 768, 768, 768, 768, 768] + - [96, 7995.0] + - - [768, 1280, 1, 768, 768, 768, 768, 768] + - [110, 9107.0] + - - [768, 512, 1, 2, 768, 768, 768, 2] + - [111, 185.0] + - - [768, 512, 1, 768, 768, 768, 768, 768] + - [50, 7494.0] + - - [1024, 512, 1, 1024, 1024, 1024, 1024, 1024] + - [62, 8095.0] + - - [1024, 512, 1, 2, 1024, 1024, 1024, 2] + - [102, 350.0] + - - [64, 64, 768, 64, 64, 64, 64, 64] + - [60, 7284.0] + - - [64, 64, 96, 64, 64, 64, 64, 64] + - [49, 4576.0] + - - [704, 1024, 1, 128, 704, 704, 704, 128] + - [48, 7406.0] + - - [1024, 1024, 1, 3328, 1024, 1024, 1024, 3328] + - [73, 10129.0] + - - [1856, 448, 1, 3328, 1856, 1856, 1856, 3328] + - [51, 9589.0] + - - [128, 6784, 1, 3328, 128, 128, 128, 3328] + - [67, 9762.0] + - - [2368, 448, 1, 128, 2368, 2368, 2368, 128] + - [94, 8230.0] + - - [256, 4288, 1, 3328, 256, 256, 256, 3328] + - [50, 10783.0] + - - [704, 1856, 1, 3328, 704, 704, 704, 3328] + - [96, 9905.0] + - - [448, 1024, 1, 1280, 448, 448, 448, 1280] + - [48, 8144.0] + - - [256, 1408, 1, 3328, 256, 256, 256, 3328] + - [104, 7590.0] + - - [704, 1856, 1, 1280, 704, 704, 704, 1280] + - [104, 9774.0] + - - [128, 5056, 1, 128, 128, 128, 128, 128] + - [61, 6285.0] + - - [2368, 128, 1, 256, 2368, 2368, 2368, 256] + - [59, 5488.0] + - - [64, 5056, 1, 256, 64, 64, 64, 256] + - [60, 5331.0] + - - [256, 2944, 1, 256, 256, 256, 256, 256] + - [104, 7953.0] + - - [256, 1856, 1, 1280, 256, 256, 256, 1280] + - [83, 9644.0] + - - [4288, 256, 1, 256, 4288, 4288, 4288, 256] + - [61, 8882.0] + - - [2944, 128, 1, 128, 2944, 2944, 2944, 128] + - [50, 4441.0] + - - [5888, 64, 1, 3328, 5888, 5888, 5888, 3328] + - [50, 7884.0] + - - [2944, 256, 1, 3328, 2944, 2944, 2944, 3328] + - [96, 9949.0] + - - [1408, 448, 1, 1280, 1408, 1408, 1408, 1280] + - [96, 9612.0] + - - [1408, 704, 1, 3328, 1408, 1408, 1408, 3328] + - [83, 9719.0] + - - [1408, 256, 1, 1280, 1408, 1408, 1408, 1280] + - [96, 7220.0] + - - [3072, 128, 1, 1024, 3072, 3072, 3072, 1024] + - [96, 7518.0] + - - [6784, 64, 1, 256, 6784, 6784, 6784, 256] + - [61, 6315.0] + - - [2944, 256, 1, 256, 2944, 2944, 2944, 256] + - [96, 8259.0] + - - [704, 1408, 1, 3328, 704, 704, 704, 3328] + - [96, 9598.0] + - - [2944, 256, 1, 128, 2944, 2944, 2944, 128] + - [73, 6617.0] + - - [2368, 128, 1, 3328, 2368, 2368, 2368, 3328] + - [55, 8623.0] + - - [64, 193600, 1, 64, 64, 64, 64, 64] + - [98, 8344.0] + - - [448, 1408, 1, 256, 448, 448, 448, 256] + - [81, 7893.0] + - - [64, 5056, 1, 3328, 64, 64, 64, 3328] + - [63, 8513.0] + - - [512, 1500, 1, 2816, 512, 512, 512, 2816] + - [96, 10038.0] + - - [1024, 448, 1, 128, 1024, 1024, 1024, 128] + - [54, 6054.0] + - - [256, 3584, 1, 3328, 256, 256, 256, 3328] + - [74, 10337.0] + - - [256, 1408, 1, 256, 256, 256, 256, 256] + - [61, 5506.0] + - - [5056, 64, 1, 1280, 5056, 5056, 5056, 1280] + - [72, 8647.0] + - - [1024, 704, 1, 256, 1024, 1024, 1024, 256] + - [61, 7664.0] + - - [128, 4288, 1, 128, 128, 128, 128, 128] + - [61, 5371.0] + - - [3584, 256, 1, 128, 3584, 3584, 3584, 128] + - [73, 7405.0] + - - [448, 1024, 1, 256, 448, 448, 448, 256] + - [81, 6517.0] + - - [5888, 64, 1, 256, 5888, 5888, 5888, 256] + - [73, 5602.0] + - - [1856, 256, 1, 1280, 1856, 1856, 1856, 1280] + - [83, 9013.0] + - - [64, 5888, 1, 3328, 64, 64, 64, 3328] + - [63, 7174.0] + - - [448, 1856, 1, 128, 448, 448, 448, 128] + - [94, 6365.0] + - - [1024, 704, 1, 1280, 1024, 1024, 1024, 1280] + - [73, 9250.0] + - - [128, 5888, 1, 256, 128, 128, 128, 256] + - [61, 6925.0] + - - [704, 704, 1, 3328, 704, 704, 704, 3328] + - [51, 8204.0] + - - [704, 1408, 1, 1280, 704, 704, 704, 1280] + - [50, 9348.0] + - - [3584, 256, 1, 3328, 3584, 3584, 3584, 3328] + - [51, 10598.0] + - - [704, 1856, 1, 128, 704, 704, 704, 128] + - [94, 7904.0] + - - [128, 3584, 1, 3328, 128, 128, 128, 3328] + - [61, 9570.0] + - - [2944, 448, 1, 128, 2944, 2944, 2944, 128] + - [81, 8449.0] + - - [64, 193600, 1, 256, 64, 64, 64, 256] + - [106, 7888.0] + - - [128, 2944, 1, 1280, 128, 128, 128, 1280] + - [61, 7732.0] + - - [448, 2944, 1, 1280, 448, 448, 448, 1280] + - [55, 8998.0] + - - [3584, 128, 1, 256, 3584, 3584, 3584, 256] + - [50, 6788.0] + - - [448, 1408, 1, 3328, 448, 448, 448, 3328] + - [75, 9222.0] + - - [704, 1024, 1, 256, 704, 704, 704, 256] + - [50, 7658.0] + - - [256, 3584, 1, 256, 256, 256, 256, 256] + - [83, 8635.0] + - - [256, 2944, 1, 3328, 256, 256, 256, 3328] + - [50, 10100.0] + - - [448, 2368, 1, 128, 448, 448, 448, 128] + - [48, 7200.0] + - - [1408, 704, 1, 256, 1408, 1408, 1408, 256] + - [73, 8510.0] + - - [448, 2944, 1, 3328, 448, 448, 448, 3328] + - [73, 9216.0] + - - [64, 5888, 1, 256, 64, 64, 64, 256] + - [103, 5170.0] + - - [512, 1500, 1, 2048, 512, 512, 512, 2048] + - [83, 9777.0] + - - [6784, 128, 1, 3328, 6784, 6784, 6784, 3328] + - [51, 9947.0] + - - [704, 704, 1, 256, 704, 704, 704, 256] + - [50, 7025.0] + - - [448, 704, 1, 1280, 448, 448, 448, 1280] + - [48, 8000.0] + - - [1024, 448, 1, 3328, 1024, 1024, 1024, 3328] + - [77, 9398.0] + - - [2944, 128, 1, 256, 2944, 2944, 2944, 256] + - [50, 5763.0] + - - [1024, 1024, 1, 1280, 1024, 1024, 1024, 1280] + - [73, 9945.0] + - - [448, 1024, 1, 128, 448, 448, 448, 128] + - [81, 5062.0] + - - [448, 2368, 1, 3328, 448, 448, 448, 3328] + - [51, 9143.0] + - - [5056, 64, 1, 128, 5056, 5056, 5056, 128] + - [114, 4444.0] + - - [1024, 700, 1, 512, 1024, 1024, 1024, 512] + - [73, 8360.0] + - - [128, 6784, 1, 1280, 128, 128, 128, 1280] + - [62, 9479.0] + - - [1856, 256, 1, 256, 1856, 1856, 1856, 256] + - [94, 7257.0] + - - [128, 5888, 1, 1280, 128, 128, 128, 1280] + - [83, 9538.0] + - - [256, 4288, 1, 1280, 256, 256, 256, 1280] + - [83, 10325.0] + - - [256, 1856, 1, 128, 256, 256, 256, 128] + - [96, 5270.0] + - - [7680, 64, 1, 2560, 7680, 7680, 7680, 2560] + - [50, 10100.0] + - - [448, 1408, 1, 128, 448, 448, 448, 128] + - [81, 6053.0] + - - [6784, 128, 1, 256, 6784, 6784, 6784, 256] + - [51, 8107.0] + - - [704, 448, 1, 256, 704, 704, 704, 256] + - [101, 6774.0] + - - [704, 448, 1, 128, 704, 704, 704, 128] + - [72, 5298.0] + - - [704, 1408, 1, 128, 704, 704, 704, 128] + - [83, 6851.0] + - - [4288, 128, 1, 1280, 4288, 4288, 4288, 1280] + - [51, 8882.0] + - - [128, 2944, 1, 128, 128, 128, 128, 128] + - [76, 4261.0] + - - [128, 4288, 1, 256, 128, 128, 128, 256] + - [108, 5675.0] + - - [704, 448, 1, 3328, 704, 704, 704, 3328] + - [48, 8504.0] + - - [448, 2368, 1, 1280, 448, 448, 448, 1280] + - [50, 8854.0] + - - [64, 6784, 1, 3328, 64, 64, 64, 3328] + - [63, 8086.0] + - - [2944, 256, 1, 1280, 2944, 2944, 2944, 1280] + - [50, 9846.0] + - - [256, 2368, 1, 128, 256, 256, 256, 128] + - [59, 7016.0] + - - [1856, 704, 1, 256, 1856, 1856, 1856, 256] + - [50, 8375.0] + - - [1856, 448, 1, 1280, 1856, 1856, 1856, 1280] + - [97, 9145.0] + - - [128, 5888, 1, 128, 128, 128, 128, 128] + - [83, 5628.0] + - - [1024, 1024, 1, 256, 1024, 1024, 1024, 256] + - [50, 8767.0] + - - [704, 1856, 1, 256, 704, 704, 704, 256] + - [96, 8634.0] + - - [256, 2368, 1, 1280, 256, 256, 256, 1280] + - [105, 9401.0] + - - [2944, 448, 1, 256, 2944, 2944, 2944, 256] + - [73, 9062.0] + - - [1856, 448, 1, 128, 1856, 1856, 1856, 128] + - [61, 6594.0] + - - [2368, 128, 1, 1280, 2368, 2368, 2368, 1280] + - [48, 7763.0] + - - [64, 6784, 1, 256, 64, 64, 64, 256] + - [81, 6127.0] + - - [64, 5056, 1, 1280, 64, 64, 64, 1280] + - [85, 7679.0] + - - [3025, 64, 64, 64, 3025, 3025, 3025, 64] + - [96, 8788.0] + - - [2368, 256, 1, 1280, 2368, 2368, 2368, 1280] + - [51, 9847.0] + - - [2368, 448, 1, 1280, 2368, 2368, 2368, 1280] + - [50, 10125.0] + - - [128, 3584, 1, 256, 128, 128, 128, 256] + - [108, 5266.0] + - - [704, 448, 1, 1280, 704, 704, 704, 1280] + - [48, 7845.0] + - - [4288, 256, 1, 1280, 4288, 4288, 4288, 1280] + - [96, 10322.0] + - - [4288, 128, 1, 3328, 4288, 4288, 4288, 3328] + - [51, 9334.0] + - - [7680, 128, 1, 2560, 7680, 7680, 7680, 2560] + - [55, 11044.0] + - - [1408, 256, 1, 128, 1408, 1408, 1408, 128] + - [50, 4320.0] + - - [256, 1408, 1, 1280, 256, 256, 256, 1280] + - [87, 7305.0] + - - [6784, 64, 1, 3328, 6784, 6784, 6784, 3328] + - [65, 8992.0] + - - [128, 2944, 1, 3328, 128, 128, 128, 3328] + - [61, 7815.0] + - - [2944, 448, 1, 3328, 2944, 2944, 2944, 3328] + - [50, 10308.0] + - - [5888, 128, 1, 256, 5888, 5888, 5888, 256] + - [83, 7927.0] + - - [5056, 64, 1, 256, 5056, 5056, 5056, 256] + - [81, 5951.0] + - - [512, 1500, 1, 1536, 512, 512, 512, 1536] + - [83, 9674.0] + - - [128, 3584, 1, 1280, 128, 128, 128, 1280] + - [61, 8949.0] + - - [1024, 704, 1, 128, 1024, 1024, 1024, 128] + - [61, 6471.0] + - - [128, 5056, 1, 3328, 128, 128, 128, 3328] + - [62, 10613.0] + - - [1024, 1024, 1, 128, 1024, 1024, 1024, 128] + - [59, 7932.0] + - - [4288, 128, 1, 256, 4288, 4288, 4288, 256] + - [51, 7228.0] + - - [1408, 448, 1, 128, 1408, 1408, 1408, 128] + - [73, 6398.0] + - - [3584, 256, 1, 256, 3584, 3584, 3584, 256] + - [84, 8706.0] + - - [128, 2944, 1, 256, 128, 128, 128, 256] + - [104, 5655.0] + - - [128, 6784, 1, 128, 128, 128, 128, 128] + - [62, 6753.0] + - - [448, 1856, 1, 256, 448, 448, 448, 256] + - [73, 7651.0] + - - [3584, 128, 1, 3328, 3584, 3584, 3584, 3328] + - [73, 9629.0] + - - [5888, 128, 1, 3328, 5888, 5888, 5888, 3328] + - [50, 9994.0] + - - [1408, 704, 1, 1280, 1408, 1408, 1408, 1280] + - [83, 9563.0] + - - [448, 2944, 1, 256, 448, 448, 448, 256] + - [73, 8271.0] + - - [448, 2368, 1, 256, 448, 448, 448, 256] + - [101, 7872.0] + - - [64, 6784, 1, 1280, 64, 64, 64, 1280] + - [63, 8099.0] + - - [128, 2368, 1, 3328, 128, 128, 128, 3328] + - [104, 8799.0] + - - [5056, 64, 1, 3328, 5056, 5056, 5056, 3328] + - [50, 9260.0] + - - [64, 5888, 1, 128, 64, 64, 64, 128] + - [107, 4299.0] + - - [5056, 128, 1, 3328, 5056, 5056, 5056, 3328] + - [97, 10805.0] + - - [448, 704, 1, 256, 448, 448, 448, 256] + - [101, 5523.0] + - - [2944, 128, 1, 3328, 2944, 2944, 2944, 3328] + - [96, 8005.0] + - - [128, 5056, 1, 1280, 128, 128, 128, 1280] + - [62, 10396.0] + - - [704, 704, 1, 128, 704, 704, 704, 128] + - [94, 5027.0] + - - [2368, 128, 1, 128, 2368, 2368, 2368, 128] + - [81, 4050.0] + - - [5056, 128, 1, 128, 5056, 5056, 5056, 128] + - [50, 6136.0] + - - [448, 1024, 1, 3328, 448, 448, 448, 3328] + - [52, 8643.0] + - - [2368, 256, 1, 256, 2368, 2368, 2368, 256] + - [50, 7497.0] + - - [256, 2368, 1, 3328, 256, 256, 256, 3328] + - [62, 9939.0] + - - [256, 3584, 1, 128, 256, 256, 256, 128] + - [83, 8033.0] + - - [4288, 256, 1, 128, 4288, 4288, 4288, 128] + - [101, 8304.0] + - - [448, 1856, 1, 3328, 448, 448, 448, 3328] + - [62, 9404.0] + - - [2368, 256, 1, 128, 2368, 2368, 2368, 128] + - [48, 6139.0] + - - [256, 1856, 1, 256, 256, 256, 256, 256] + - [83, 7064.0] + - - [256, 2944, 1, 128, 256, 256, 256, 128] + - [50, 6431.0] + - - [1408, 256, 1, 3328, 1408, 1408, 1408, 3328] + - [96, 7583.0] + - - [2368, 448, 1, 256, 2368, 2368, 2368, 256] + - [50, 8705.0] + - - [4288, 256, 1, 3328, 4288, 4288, 4288, 3328] + - [50, 10703.0] + - - [1856, 704, 1, 128, 1856, 1856, 1856, 128] + - [81, 8516.0] + - - [4288, 128, 1, 128, 4288, 4288, 4288, 128] + - [48, 6445.0] + - - [6784, 64, 1, 1280, 6784, 6784, 6784, 1280] + - [73, 8477.0] + - - [3584, 128, 1, 128, 3584, 3584, 3584, 128] + - [81, 6181.0] + - - [256, 2368, 1, 256, 256, 256, 256, 256] + - [50, 7355.0] + - - [2944, 448, 1, 1280, 2944, 2944, 2944, 1280] + - [96, 9905.0] + - - [448, 1408, 1, 1280, 448, 448, 448, 1280] + - [63, 8799.0] + - - [448, 1856, 1, 1280, 448, 448, 448, 1280] + - [62, 9061.0] + - - [1856, 256, 1, 128, 1856, 1856, 1856, 128] + - [72, 5189.0] + - - [128, 2368, 1, 256, 128, 128, 128, 256] + - [83, 5535.0] + - - [5888, 64, 1, 1280, 5888, 5888, 5888, 1280] + - [50, 7490.0] + - - [1024, 448, 1, 1280, 1024, 1024, 1024, 1280] + - [73, 8894.0] + - - [128, 5056, 1, 256, 128, 128, 128, 256] + - [104, 7649.0] + - - [1856, 704, 1, 1280, 1856, 1856, 1856, 1280] + - [73, 9697.0] + - - [448, 2944, 1, 128, 448, 448, 448, 128] + - [94, 7646.0] + - - [1408, 256, 1, 256, 1408, 1408, 1408, 256] + - [96, 5647.0] + - - [2368, 448, 1, 3328, 2368, 2368, 2368, 3328] + - [73, 10348.0] + - - [128, 5888, 1, 3328, 128, 128, 128, 3328] + - [61, 9864.0] + - - [64, 5056, 1, 128, 64, 64, 64, 128] + - [82, 4142.0] + - - [64, 6784, 1, 128, 64, 64, 64, 128] + - [101, 4548.0] + - - [448, 704, 1, 128, 448, 448, 448, 128] + - [81, 5368.0] + - - [1408, 448, 1, 256, 1408, 1408, 1408, 256] + - [73, 7877.0] + - - [1408, 704, 1, 128, 1408, 1408, 1408, 128] + - [50, 7217.0] + - - [2368, 256, 1, 3328, 2368, 2368, 2368, 3328] + - [51, 10202.0] + - - [5888, 128, 1, 1280, 5888, 5888, 5888, 1280] + - [73, 9506.0] + - - [256, 3584, 1, 1280, 256, 256, 256, 1280] + - [62, 10065.0] + - - [256, 1408, 1, 128, 256, 256, 256, 128] + - [106, 4532.0] + - - [256, 4288, 1, 128, 256, 256, 256, 128] + - [61, 8207.0] + - - [5888, 128, 1, 128, 5888, 5888, 5888, 128] + - [96, 6653.0] + - - [1408, 448, 1, 3328, 1408, 1408, 1408, 3328] + - [96, 10190.0] + - - [704, 1024, 1, 1280, 704, 704, 704, 1280] + - [50, 9196.0] + - - [1856, 256, 1, 3328, 1856, 1856, 1856, 3328] + - [50, 9706.0] + - - [64, 5888, 1, 1280, 64, 64, 64, 1280] + - [63, 6904.0] + - - [6784, 64, 1, 128, 6784, 6784, 6784, 128] + - [61, 6273.0] + - - [704, 704, 1, 1280, 704, 704, 704, 1280] + - [51, 8022.0] + - - [128, 2368, 1, 1280, 128, 128, 128, 1280] + - [108, 8147.0] + - - [3584, 256, 1, 1280, 3584, 3584, 3584, 1280] + - [97, 10170.0] + - - [128, 4288, 1, 3328, 128, 128, 128, 3328] + - [51, 8982.0] + - - [3584, 128, 1, 1280, 3584, 3584, 3584, 1280] + - [96, 8960.0] + - - [5056, 128, 1, 1280, 5056, 5056, 5056, 1280] + - [105, 10242.0] + - - [256, 4288, 1, 256, 256, 256, 256, 256] + - [83, 8169.0] + - - [1024, 448, 1, 256, 1024, 1024, 1024, 256] + - [73, 7788.0] + - - [2944, 128, 1, 1280, 2944, 2944, 2944, 1280] + - [104, 7563.0] + - - [128, 2368, 1, 128, 128, 128, 128, 128] + - [103, 5187.0] + - - [256, 2944, 1, 1280, 256, 256, 256, 1280] + - [61, 9607.0] + - - [2560, 128, 1, 2560, 2560, 2560, 2560, 2560] + - [55, 9241.0] + - - [704, 1024, 1, 3328, 704, 704, 704, 3328] + - [96, 9466.0] + - - [128, 6784, 1, 256, 128, 128, 128, 256] + - [83, 8856.0] + - - [256, 1856, 1, 3328, 256, 256, 256, 3328] + - [50, 9785.0] + - - [6784, 128, 1, 128, 6784, 6784, 6784, 128] + - [96, 6895.0] + - - [128, 3584, 1, 128, 128, 128, 128, 128] + - [83, 5045.0] + - - [704, 1408, 1, 256, 704, 704, 704, 256] + - [73, 8186.0] + - - [4096, 128, 1, 4096, 4096, 4096, 4096, 4096] + - [51, 8909.0] + - - [5888, 64, 1, 128, 5888, 5888, 5888, 128] + - [50, 4417.0] + - - [5056, 128, 1, 256, 5056, 5056, 5056, 256] + - [63, 8011.0] + - - [6784, 128, 1, 1280, 6784, 6784, 6784, 1280] + - [110, 9706.0] + - - [1856, 448, 1, 256, 1856, 1856, 1856, 256] + - [50, 7797.0] + - - [1024, 704, 1, 3328, 1024, 1024, 1024, 3328] + - [73, 9484.0] + - - [128, 4288, 1, 1280, 128, 128, 128, 1280] + - [83, 8528.0] + - - [448, 704, 1, 3328, 448, 448, 448, 3328] + - [48, 8217.0] + - - [1856, 704, 1, 3328, 1856, 1856, 1856, 3328] + - [50, 9995.0] + - - [512, 1500, 1, 2560, 512, 512, 512, 2560] + - [61, 9973.0] + - - [3136, 64, 128, 64, 3136, 3136, 3136, 64] + - [79, 5390.0] + - - [3136, 64, 128, 256, 3136, 3136, 3136, 256] + - [54, 8796.0] + - - [3136, 64, 256, 64, 3136, 3136, 3136, 64] + - [47, 5262.0] + - - [3136, 64, 256, 256, 3136, 3136, 3136, 256] + - [99, 8757.0] + - - [1024, 512, 1, 2048, 1024, 1024, 1024, 2048] + - [51, 8691.0] + - - [4096, 256, 1, 2048, 4096, 4096, 4096, 2048] + - [50, 10316.0] + - - [2048, 256, 1, 4096, 2048, 2048, 2048, 4096] + - [51, 8729.0] + - - [512, 768, 1, 2048, 512, 512, 512, 2048] + - [87, 7901.0] + - - [2048, 256, 1, 1024, 2048, 2048, 2048, 1024] + - [84, 8122.0] + - - [2048, 200, 1, 512, 2048, 2048, 2048, 512] + - [73, 5809.0] + - - [4096, 200, 1, 1024, 4096, 4096, 4096, 1024] + - [96, 7721.0] + - - [2048, 200, 1, 4096, 2048, 2048, 2048, 4096] + - [74, 6882.0] + - - [2048, 512, 1, 1024, 2048, 2048, 2048, 1024] + - [96, 9763.0] + - - [1024, 1024, 1, 512, 1024, 1024, 1024, 512] + - [50, 9539.0] + - - [2048, 512, 1, 4096, 2048, 2048, 2048, 4096] + - [61, 10250.0] + - - [1024, 1024, 1, 4096, 1024, 1024, 1024, 4096] + - [83, 10222.0] + - - [4096, 200, 1, 2048, 4096, 4096, 4096, 2048] + - [96, 7963.0] + - - [2048, 200, 1, 1024, 2048, 2048, 2048, 1024] + - [74, 6390.0] + - - [1024, 768, 1, 512, 1024, 1024, 1024, 512] + - [96, 9390.0] + - - [2048, 200, 1, 2048, 2048, 2048, 2048, 2048] + - [97, 6851.0] + - - [2048, 256, 1, 2048, 2048, 2048, 2048, 2048] + - [97, 8668.0] + - - [512, 768, 1, 512, 512, 512, 512, 512] + - [73, 7649.0] + - - [4096, 256, 1, 4096, 4096, 4096, 4096, 4096] + - [50, 10250.0] + - - [1024, 512, 1, 512, 1024, 1024, 1024, 512] + - [61, 7596.0] + - - [1024, 1024, 1, 2048, 1024, 1024, 1024, 2048] + - [61, 9949.0] + - - [4096, 256, 1, 1024, 4096, 4096, 4096, 1024] + - [73, 9851.0] + - - [512, 768, 1, 1024, 512, 512, 512, 1024] + - [83, 7504.0] + - - [1024, 512, 1, 4096, 1024, 1024, 1024, 4096] + - [51, 8677.0] + - - [4096, 200, 1, 4096, 4096, 4096, 4096, 4096] + - [96, 7841.0] + - - [2048, 256, 1, 512, 2048, 2048, 2048, 512] + - [97, 7687.0] + - - [1024, 1024, 1, 1024, 1024, 1024, 1024, 1024] + - [83, 9774.0] + - - [4096, 192, 1, 2048, 4096, 4096, 4096, 2048] + - [83, 10146.0] + - - [5329, 64, 64, 160, 5329, 5329, 5329, 160] + - [49, 7509.0] + - - [1225, 64, 64, 384, 1225, 1225, 1225, 384] + - [77, 9200.0] + - - [4096, 320, 1, 1280, 4096, 4096, 4096, 1280] + - [50, 11130.0] + - - [4096, 192, 1, 1280, 4096, 4096, 4096, 1280] + - [50, 9927.0] + - - [1225, 96, 64, 384, 1225, 1225, 1225, 384] + - [55, 8009.0] + - - [4096, 320, 1, 2048, 4096, 4096, 4096, 2048] + - [50, 11234.0] + - - [4096, 256, 1, 1536, 4096, 4096, 4096, 1536] + - [73, 10022.0] + - - [64, 147, 432, 148, 64, 64, 64, 148] + - [72, 7877.0] + - - [64, 123, 528, 123, 64, 64, 64, 123] + - [48, 8397.0] + - - [64, 111, 576, 112, 64, 64, 64, 112] + - [94, 7646.0] + - - [64, 77, 816, 77, 64, 64, 64, 77] + - [72, 5713.0] + - - [64, 92, 688, 92, 64, 64, 64, 92] + - [94, 6585.0] + - - [64, 159, 400, 159, 64, 64, 64, 159] + - [52, 7375.0] + - - [64, 85, 752, 84, 64, 64, 64, 84] + - [48, 6243.0] + - - [64, 122, 528, 123, 64, 64, 64, 123] + - [75, 7464.0] + - - [64, 93, 688, 92, 64, 64, 64, 92] + - [101, 6600.0] + - - [64, 102, 624, 99, 64, 64, 64, 99] + - [72, 6434.0] + - - [64, 133, 480, 133, 64, 64, 64, 133] + - [94, 7118.0] + - - [64, 232, 272, 232, 64, 64, 64, 232] + - [94, 8925.0] + - - [64, 162, 400, 159, 64, 64, 64, 159] + - [94, 8582.0] + - - [64, 78, 816, 78, 64, 64, 64, 78] + - [94, 5997.0] + - - [64, 99, 624, 99, 64, 64, 64, 99] + - [94, 7281.0] + - - [64, 101, 624, 102, 64, 64, 64, 102] + - [94, 7231.0] + - - [64, 111, 576, 111, 64, 64, 64, 111] + - [52, 7194.0] + - - [64, 134, 480, 134, 64, 64, 64, 134] + - [72, 7277.0] + - - [64, 135, 480, 132, 64, 64, 64, 132] + - [94, 7430.0] + - - [64, 134, 480, 132, 64, 64, 64, 132] + - [94, 7385.0] + - - [64, 134, 480, 135, 64, 64, 64, 135] + - [72, 7380.0] + - - [64, 162, 400, 162, 64, 64, 64, 162] + - [48, 8400.0] + - - [64, 102, 624, 102, 64, 64, 64, 102] + - [52, 6468.0] + - - [64, 135, 480, 133, 64, 64, 64, 133] + - [72, 7248.0] + - - [64, 148, 432, 143, 64, 64, 64, 143] + - [72, 7816.0] + - - [64, 100, 624, 100, 64, 64, 64, 100] + - [81, 7484.0] + - - [64, 65, 992, 65, 64, 64, 64, 65] + - [81, 5113.0] + - - [64, 122, 528, 122, 64, 64, 64, 122] + - [52, 7405.0] + - - [64, 228, 272, 228, 64, 64, 64, 228] + - [94, 8169.0] + - - [64, 112, 576, 111, 64, 64, 64, 111] + - [52, 6923.0] + - - [64, 143, 432, 143, 64, 64, 64, 143] + - [94, 7665.0] + - - [64, 135, 480, 135, 64, 64, 64, 135] + - [94, 7336.0] + - - [64, 232, 272, 228, 64, 64, 64, 228] + - [52, 8139.0] + - - [64, 193, 320, 193, 64, 64, 64, 193] + - [94, 7891.0] + - - [64, 71, 896, 71, 64, 64, 64, 71] + - [101, 5529.0] + - - [64, 84, 752, 84, 64, 64, 64, 84] + - [59, 6594.0] + - - [64, 132, 480, 132, 64, 64, 64, 132] + - [94, 7179.0] + - - [64, 85, 752, 85, 64, 64, 64, 85] + - [72, 6634.0] + - - [64, 102, 624, 100, 64, 64, 64, 100] + - [48, 6478.0] + - - [64, 78, 816, 77, 64, 64, 64, 77] + - [59, 6133.0] + - - [64, 112, 576, 112, 64, 64, 64, 112] + - [48, 8377.0] + - - [64, 148, 432, 148, 64, 64, 64, 148] + - [72, 8002.0] + - - [64, 159, 400, 160, 64, 64, 64, 160] + - [94, 8603.0] + - - [64, 102, 624, 101, 64, 64, 64, 101] + - [72, 7591.0] + - - [64, 101, 624, 101, 64, 64, 64, 101] + - [59, 7550.0] + - - [64, 160, 400, 160, 64, 64, 64, 160] + - [72, 8651.0] + - - [64, 93, 688, 93, 64, 64, 64, 93] + - [72, 7092.0] + - - [64, 147, 432, 147, 64, 64, 64, 147] + - [94, 7701.0] + - - [64, 100, 624, 102, 64, 64, 64, 102] + - [72, 7538.0] + - - [64, 177, 352, 177, 64, 64, 64, 177] + - [98, 8075.0] + - - [500, 1024, 1, 512, 500, 500, 500, 512] + - [73, 7692.0] + - - [512, 1024, 1, 512, 512, 512, 512, 512] + - [62, 8095.0] + - - [200, 2048, 1, 512, 200, 200, 200, 512] + - [104, 5822.0] + - - [512, 2000, 1, 1024, 512, 512, 512, 1024] + - [83, 9440.0] + - - [512, 2048, 1, 512, 512, 512, 512, 512] + - [73, 9373.0] + - - [200, 2000, 1, 100, 200, 200, 200, 100] + - [58, 3584.0] + - - [200, 2000, 1, 1024, 200, 200, 200, 1024] + - [83, 6113.0] + - - [500, 1024, 1, 2048, 500, 500, 500, 2048] + - [62, 8186.0] + - - [512, 2048, 1, 100, 512, 512, 512, 100] + - [61, 6696.0] + - - [512, 2048, 1, 2000, 512, 512, 512, 2000] + - [73, 10128.0] + - - [200, 2000, 1, 10, 200, 200, 200, 10] + - [70, 735.0] + - - [500, 2048, 1, 1024, 500, 500, 500, 1024] + - [83, 9393.0] + - - [500, 2000, 1, 10, 500, 500, 500, 10] + - [100, 1205.0] + - - [500, 2048, 1, 100, 500, 500, 500, 100] + - [94, 6132.0] + - - [512, 1024, 1, 500, 512, 512, 512, 500] + - [73, 7550.0] + - - [200, 2000, 1, 2000, 200, 200, 200, 2000] + - [74, 6581.0] + - - [500, 2048, 1, 2000, 500, 500, 500, 2000] + - [50, 9868.0] + - - [512, 2048, 1, 1024, 512, 512, 512, 1024] + - [104, 9693.0] + - - [512, 1024, 1, 100, 512, 512, 512, 100] + - [61, 4864.0] + - - [256, 2000, 1, 10, 256, 256, 256, 10] + - [47, 1255.0] + - - [512, 2000, 1, 100, 512, 512, 512, 100] + - [83, 6360.0] + - - [512, 2000, 1, 2048, 512, 512, 512, 2048] + - [61, 9776.0] + - - [500, 1024, 1, 500, 500, 500, 500, 500] + - [74, 7191.0] + - - [256, 2000, 1, 100, 256, 256, 256, 100] + - [83, 4369.0] + - - [512, 1024, 1, 2048, 512, 512, 512, 2048] + - [62, 8456.0] + - - [500, 2048, 1, 2048, 500, 500, 500, 2048] + - [61, 9722.0] + - - [200, 2048, 1, 10, 200, 200, 200, 10] + - [95, 737.0] + - - [500, 2000, 1, 512, 500, 500, 500, 512] + - [83, 8886.0] + - - [500, 1024, 1, 1024, 500, 500, 500, 1024] + - [61, 7674.0] + - - [200, 2000, 1, 500, 200, 200, 200, 500] + - [51, 5656.0] + - - [256, 2048, 1, 100, 256, 256, 256, 100] + - [73, 4775.0] + - - [500, 2000, 1, 1024, 500, 500, 500, 1024] + - [61, 9215.0] + - - [256, 2048, 1, 1024, 256, 256, 256, 1024] + - [84, 8030.0] + - - [200, 2048, 1, 1024, 200, 200, 200, 1024] + - [108, 6334.0] + - - [512, 2048, 1, 500, 512, 512, 512, 500] + - [73, 9526.0] + - - [512, 2000, 1, 10, 512, 512, 512, 10] + - [100, 1306.0] + - - [500, 1024, 1, 2000, 500, 500, 500, 2000] + - [74, 8388.0] + - - [512, 2000, 1, 512, 512, 512, 512, 512] + - [73, 9457.0] + - - [500, 2000, 1, 2000, 500, 500, 500, 2000] + - [96, 9622.0] + - - [500, 1024, 1, 10, 500, 500, 500, 10] + - [70, 818.0] + - - [256, 2048, 1, 10, 256, 256, 256, 10] + - [82, 895.0] + - - [256, 2048, 1, 500, 256, 256, 256, 500] + - [84, 7546.0] + - - [256, 2048, 1, 2048, 256, 256, 256, 2048] + - [105, 8376.0] + - - [256, 2000, 1, 512, 256, 256, 256, 512] + - [104, 7290.0] + - - [512, 1024, 1, 2000, 512, 512, 512, 2000] + - [74, 8728.0] + - - [256, 2000, 1, 2000, 256, 256, 256, 2000] + - [62, 8400.0] + - - [256, 2048, 1, 2000, 256, 256, 256, 2000] + - [74, 8724.0] + - - [200, 2048, 1, 100, 200, 200, 200, 100] + - [48, 4561.0] + - - [200, 2000, 1, 2048, 200, 200, 200, 2048] + - [84, 6411.0] + - - [500, 2048, 1, 512, 500, 500, 500, 512] + - [83, 9121.0] + - - [500, 2000, 1, 500, 500, 500, 500, 500] + - [73, 8986.0] + - - [200, 2048, 1, 2048, 200, 200, 200, 2048] + - [105, 6554.0] + - - [200, 2048, 1, 500, 200, 200, 200, 500] + - [74, 5821.0] + - - [512, 2000, 1, 500, 512, 512, 512, 500] + - [73, 9239.0] + - - [200, 2048, 1, 2000, 200, 200, 200, 2000] + - [62, 6752.0] + - - [500, 1024, 1, 100, 500, 500, 500, 100] + - [59, 4776.0] + - - [512, 1024, 1, 10, 512, 512, 512, 10] + - [88, 886.0] + - - [512, 1024, 1, 1024, 512, 512, 512, 1024] + - [62, 8037.0] + - - [500, 2048, 1, 10, 500, 500, 500, 10] + - [57, 1196.0] + - - [200, 2000, 1, 512, 200, 200, 200, 512] + - [104, 5623.0] + - - [256, 2000, 1, 500, 256, 256, 256, 500] + - [51, 7244.0] + - - [256, 2048, 1, 512, 256, 256, 256, 512] + - [104, 7498.0] + - - [256, 2000, 1, 2048, 256, 256, 256, 2048] + - [84, 8209.0] + - - [500, 2048, 1, 500, 500, 500, 500, 500] + - [73, 9101.0] + - - [256, 2000, 1, 1024, 256, 256, 256, 1024] + - [83, 7818.0] + - - [500, 2000, 1, 2048, 500, 500, 500, 2048] + - [104, 9537.0] + - - [512, 2000, 1, 2000, 512, 512, 512, 2000] + - [96, 9847.0] + - - [512, 2048, 1, 2048, 512, 512, 512, 2048] + - [83, 9992.0] + - - [512, 2048, 1, 10, 512, 512, 512, 10] + - [94, 1736.0] + - - [500, 2000, 1, 100, 500, 500, 500, 100] + - [74, 6477.0] + - - [1024, 1131, 1, 1024, 1024, 1024, 1024, 1024] + - [83, 9536.0] + - - [1024, 1102, 1, 1024, 1024, 1024, 1024, 1024] + - [62, 9302.0] + - - [1024, 774, 1, 1024, 1024, 1024, 1024, 1024] + - [104, 8522.0] + - - [4096, 128, 1, 2048, 4096, 4096, 4096, 2048] + - [62, 8725.0] + - - [4096, 128, 1, 3072, 4096, 4096, 4096, 3072] + - [51, 8877.0] + - - [1024, 1120, 1, 1024, 1024, 1024, 1024, 1024] + - [83, 9477.0] + - - [1024, 1015, 1, 1024, 1024, 1024, 1024, 1024] + - [83, 9647.0] + - - [1024, 992, 1, 1024, 1024, 1024, 1024, 1024] + - [83, 9486.0] + - - [1024, 950, 1, 1024, 1024, 1024, 1024, 1024] + - [104, 10148.0] + - - [1024, 1088, 1, 1024, 1024, 1024, 1024, 1024] + - [83, 10353.0] + - - [64, 128, 96, 128, 64, 64, 64, 128] + - [114, 3660.0] + - - [768, 1024, 1, 3072, 768, 768, 768, 3072] + - [61, 10192.0] + - - [768, 512, 1, 3072, 768, 768, 768, 3072] + - [54, 8076.0] + - - [64, 256, 192, 256, 64, 64, 64, 256] + - [86, 7252.0] + - - [64, 128, 384, 128, 64, 64, 64, 128] + - [106, 8207.0] + - - [64, 256, 96, 256, 64, 64, 64, 256] + - [63, 8207.0] + - - [6272, 112, 1, 512, 6272, 6272, 6272, 512] + - [96, 7954.0] + - - [2048, 320, 1, 1280, 2048, 2048, 2048, 1280] + - [99, 10017.0] + - - [5329, 64, 1, 448, 5329, 5329, 5329, 448] + - [83, 5335.0] + - - [784, 64, 32, 192, 784, 784, 784, 192] + - [61, 8292.0] + - - [6272, 64, 1, 480, 6272, 6272, 6272, 480] + - [83, 6976.0] + - - [6272, 64, 1, 512, 6272, 6272, 6272, 512] + - [65, 7345.0] + - - [6272, 160, 1, 528, 6272, 6272, 6272, 528] + - [73, 8040.0] + - - [289, 160, 32, 768, 289, 289, 289, 768] + - [83, 6861.0] + - - [5329, 64, 32, 160, 5329, 5329, 5329, 160] + - [50, 7597.0] + - - [5329, 96, 1, 576, 5329, 5329, 5329, 576] + - [73, 6398.0] + - - [1225, 64, 32, 288, 1225, 1225, 1225, 288] + - [96, 10557.0] + - - [289, 192, 32, 768, 289, 289, 289, 768] + - [104, 8133.0] + - - [2048, 448, 1, 1280, 2048, 2048, 2048, 1280] + - [50, 10196.0] + - - [3136, 64, 32, 64, 3136, 3136, 3136, 64] + - [94, 10075.0] + - - [6272, 128, 1, 528, 6272, 6272, 6272, 528] + - [50, 10155.0] + - - [6272, 96, 1, 480, 6272, 6272, 6272, 480] + - [96, 6931.0] + - - [2048, 448, 1, 2048, 2048, 2048, 2048, 2048] + - [73, 10112.0] + - - [784, 96, 32, 192, 784, 784, 784, 192] + - [72, 6881.0] + - - [1001, 512, 1, 4096, 1001, 1001, 1001, 4096] + - [62, 8402.0] + - - [2048, 192, 1, 1280, 2048, 2048, 2048, 1280] + - [104, 7696.0] + - - [1225, 64, 32, 256, 1225, 1225, 1225, 256] + - [96, 9000.0] + - - [2048, 256, 1, 1536, 2048, 2048, 2048, 1536] + - [97, 8456.0] + - - [6272, 128, 1, 512, 6272, 6272, 6272, 512] + - [77, 9363.0] + - - [1568, 384, 1, 832, 1568, 1568, 1568, 832] + - [96, 9216.0] + - - [1568, 256, 1, 832, 1568, 1568, 1568, 832] + - [59, 7576.0] + - - [1568, 192, 1, 832, 1568, 1568, 1568, 832] + - [50, 7549.0] + - - [289, 192, 32, 1024, 289, 289, 289, 1024] + - [104, 8248.0] + - - [1225, 64, 32, 384, 1225, 1225, 1225, 384] + - [73, 10617.0] + - - [2048, 320, 1, 2048, 2048, 2048, 2048, 2048] + - [54, 10476.0] + - - [2048, 384, 1, 1536, 2048, 2048, 2048, 1536] + - [96, 10038.0] + - - [5041, 96, 1, 576, 5041, 5041, 5041, 576] + - [54, 6665.0] + - - [6272, 192, 1, 480, 6272, 6272, 6272, 480] + - [61, 9570.0] + - - [5041, 192, 1, 720, 5041, 5041, 5041, 720] + - [73, 9829.0] + - - [289, 128, 32, 768, 289, 289, 289, 768] + - [110, 6378.0] + - - [12544, 64, 1, 147, 12544, 12544, 12544, 147] + - [61, 7285.0] + - - [6272, 160, 1, 512, 6272, 6272, 6272, 512] + - [96, 8028.0] + - - [1225, 64, 32, 192, 1225, 1225, 1225, 192] + - [77, 9082.0] + - - [784, 64, 32, 256, 784, 784, 784, 256] + - [108, 8284.0] + - - [6272, 144, 1, 512, 6272, 6272, 6272, 512] + - [54, 7259.0] + - - [8192, 192, 1, 1280, 8192, 8192, 8192, 1280] + - [73, 10666.0] + - - [8192, 192, 1, 2048, 8192, 8192, 8192, 2048] + - [50, 10918.0] + - - [65, 6400, 1, 1024, 65, 65, 65, 1024] + - [104, 5013.0] + - - [512, 1290, 1, 2048, 512, 512, 512, 2048] + - [83, 8516.0] + - - [512, 2205, 1, 2048, 512, 512, 512, 2048] + - [104, 10696.0] + - - [64, 512, 16, 512, 64, 64, 64, 512] + - [63, 7255.0] + - - [512, 600, 1, 2048, 512, 512, 512, 2048] + - [110, 8278.0] + - - [512, 644, 1, 512, 512, 512, 512, 512] + - [73, 5797.0] + - - [512, 644, 1, 2048, 512, 512, 512, 2048] + - [104, 6777.0] + - - [512, 668, 1, 2048, 512, 512, 512, 2048] + - [104, 6859.0] + - - [512, 714, 1, 512, 512, 512, 512, 512] + - [73, 6345.0] + - - [512, 714, 1, 2048, 512, 512, 512, 2048] + - [73, 7339.0] + - - [512, 720, 1, 512, 512, 512, 512, 512] + - [104, 6442.0] + - - [512, 720, 1, 2048, 512, 512, 512, 2048] + - [96, 7451.0] + - - [512, 722, 1, 2048, 512, 512, 512, 2048] + - [61, 7595.0] + - - [512, 781, 1, 512, 512, 512, 512, 512] + - [104, 6917.0] + - - [512, 781, 1, 2048, 512, 512, 512, 2048] + - [61, 7985.0] + - - [512, 848, 1, 2048, 512, 512, 512, 2048] + - [61, 8663.0] + - - [512, 872, 1, 2048, 512, 512, 512, 2048] + - [61, 8833.0] + - - [512, 936, 1, 512, 512, 512, 512, 512] + - [104, 8045.0] + - - [512, 936, 1, 2048, 512, 512, 512, 2048] + - [61, 9459.0] + - - [512, 980, 1, 512, 512, 512, 512, 512] + - [104, 7148.0] + - - [512, 980, 1, 2048, 512, 512, 512, 2048] + - [62, 8147.0] + - - [512, 1139, 1, 2048, 512, 512, 512, 2048] + - [105, 9214.0] + - - [512, 1184, 1, 2048, 512, 512, 512, 2048] + - [62, 9641.0] + - - [512, 1186, 1, 2048, 512, 512, 512, 2048] + - [62, 9675.0] + - - [512, 1232, 1, 512, 512, 512, 512, 512] + - [62, 8589.0] + - - [512, 1232, 1, 2048, 512, 512, 512, 2048] + - [62, 9999.0] + - - [512, 1279, 1, 2048, 512, 512, 512, 2048] + - [62, 10288.0] + - - [512, 1290, 1, 512, 512, 512, 512, 512] + - [96, 7703.0] + - - [512, 1327, 1, 2048, 512, 512, 512, 2048] + - [61, 8700.0] + - - [512, 1331, 1, 2048, 512, 512, 512, 2048] + - [61, 8837.0] + - - [512, 1341, 1, 2048, 512, 512, 512, 2048] + - [61, 8787.0] + - - [512, 1350, 1, 512, 512, 512, 512, 512] + - [104, 8154.0] + - - [512, 1350, 1, 2048, 512, 512, 512, 2048] + - [61, 8845.0] + - - [512, 1359, 1, 2048, 512, 512, 512, 2048] + - [83, 8951.0] + - - [512, 1391, 1, 2048, 512, 512, 512, 2048] + - [83, 9345.0] + - - [512, 1424, 1, 512, 512, 512, 512, 512] + - [104, 8480.0] + - - [512, 1424, 1, 2048, 512, 512, 512, 2048] + - [83, 9336.0] + - - [512, 1458, 1, 512, 512, 512, 512, 512] + - [104, 8876.0] + - - [512, 1458, 1, 2048, 512, 512, 512, 2048] + - [61, 9740.0] + - - [512, 1462, 1, 512, 512, 512, 512, 512] + - [73, 8559.0] + - - [512, 1462, 1, 2048, 512, 512, 512, 2048] + - [61, 9442.0] + - - [512, 1467, 1, 2048, 512, 512, 512, 2048] + - [104, 9511.0] + - - [512, 1472, 1, 2048, 512, 512, 512, 2048] + - [61, 9628.0] + - - [512, 1520, 1, 512, 512, 512, 512, 512] + - [104, 9060.0] + - - [512, 1520, 1, 2048, 512, 512, 512, 2048] + - [104, 9973.0] + - - [512, 1596, 1, 512, 512, 512, 512, 512] + - [61, 9314.0] + - - [512, 1596, 1, 2048, 512, 512, 512, 2048] + - [61, 10353.0] + - - [512, 1599, 1, 512, 512, 512, 512, 512] + - [104, 9241.0] + - - [512, 1599, 1, 2048, 512, 512, 512, 2048] + - [83, 10317.0] + - - [512, 1615, 1, 512, 512, 512, 512, 512] + - [73, 8842.0] + - - [512, 1615, 1, 2048, 512, 512, 512, 2048] + - [62, 9269.0] + - - [512, 1680, 1, 512, 512, 512, 512, 512] + - [84, 8710.0] + - - [512, 1680, 1, 2048, 512, 512, 512, 2048] + - [62, 9557.0] + - - [512, 1709, 1, 2048, 512, 512, 512, 2048] + - [62, 9734.0] + - - [512, 1890, 1, 512, 512, 512, 512, 512] + - [110, 9565.0] + - - [512, 1902, 1, 2048, 512, 512, 512, 2048] + - [62, 10644.0] + - - [512, 1917, 1, 512, 512, 512, 512, 512] + - [105, 9638.0] + - - [512, 1917, 1, 2048, 512, 512, 512, 2048] + - [84, 10769.0] + - - [512, 2076, 1, 2048, 512, 512, 512, 2048] + - [83, 10032.0] + - - [512, 2195, 1, 2048, 512, 512, 512, 2048] + - [61, 10593.0] + - - [512, 2205, 1, 512, 512, 512, 512, 512] + - [104, 9679.0] + - - [2048, 198, 1, 512, 2048, 2048, 2048, 512] + - [96, 5888.0] + - - [2048, 207, 1, 512, 2048, 2048, 2048, 512] + - [96, 6118.0] + - - [2048, 208, 1, 512, 2048, 2048, 2048, 512] + - [73, 6092.0] + - - [2048, 245, 1, 512, 2048, 2048, 2048, 512] + - [73, 7069.0] + - - [2048, 246, 1, 512, 2048, 2048, 2048, 512] + - [50, 7087.0] + - - [2048, 264, 1, 512, 2048, 2048, 2048, 512] + - [96, 7664.0] + - - [2048, 401, 1, 512, 2048, 2048, 2048, 512] + - [96, 8356.0] + - - [2048, 439, 1, 512, 2048, 2048, 2048, 512] + - [96, 9019.0] + - - [2048, 443, 1, 512, 2048, 2048, 2048, 512] + - [73, 9133.0] + - - [2048, 446, 1, 512, 2048, 2048, 2048, 512] + - [96, 9195.0] + - - [2048, 465, 1, 512, 2048, 2048, 2048, 512] + - [96, 8545.0] + - - [2048, 468, 1, 512, 2048, 2048, 2048, 512] + - [73, 8649.0] + - - [2048, 493, 1, 512, 2048, 2048, 2048, 512] + - [73, 9038.0] + - - [2048, 495, 1, 512, 2048, 2048, 2048, 512] + - [96, 9065.0] + - - [2048, 511, 1, 512, 2048, 2048, 2048, 512] + - [73, 9270.0] + - - [2048, 512, 1, 512, 2048, 2048, 2048, 512] + - [73, 9419.0] + - - [2048, 540, 1, 512, 2048, 2048, 2048, 512] + - [83, 9057.0] + - - [2048, 550, 1, 512, 2048, 2048, 2048, 512] + - [50, 8922.0] + - - [2048, 560, 1, 512, 2048, 2048, 2048, 512] + - [50, 9112.0] + - - [2048, 600, 1, 512, 2048, 2048, 2048, 512] + - [73, 9667.0] + - - [64, 64, 496, 64, 64, 64, 64, 64] + - [103, 5368.0] + - - [64, 65, 496, 64, 64, 64, 64, 64] + - [94, 4116.0] + - - [64, 65, 496, 65, 64, 64, 64, 65] + - [48, 3899.0] + - - [64, 70, 216, 70, 64, 64, 64, 70] + - [60, 3431.0] + - - [64, 71, 216, 71, 64, 64, 64, 71] + - [48, 3303.0] + - - [64, 78, 248, 77, 64, 64, 64, 77] + - [94, 3835.0] + - - [64, 80, 152, 80, 64, 64, 64, 80] + - [72, 3801.0] + - - [64, 93, 344, 93, 64, 64, 64, 93] + - [94, 4823.0] + - - [64, 102, 312, 102, 64, 64, 64, 102] + - [53, 5112.0] + - - [64, 122, 264, 122, 64, 64, 64, 122] + - [78, 5752.0] + - - [64, 122, 264, 123, 64, 64, 64, 123] + - [64, 6254.0] + - - [64, 123, 264, 123, 64, 64, 64, 123] + - [64, 6043.0] + - - [64, 512, 96, 512, 64, 64, 64, 512] + - [85, 7770.0] + - - [64, 512, 128, 512, 64, 64, 64, 512] + - [91, 7153.0] + - - [64, 128, 512, 128, 64, 64, 64, 128] + - [106, 8247.0] + - - [64, 512, 64, 512, 64, 64, 64, 512] + - [85, 8489.0] + - - [2048, 512, 1, 2048, 2048, 2048, 2048, 2048] + - [50, 10190.0] + - - [512, 1600, 1, 32, 512, 512, 512, 32] + - [82, 2945.0] + - - [512, 1600, 1, 512, 512, 512, 512, 512] + - [104, 9664.0] + - - [560, 1600, 1, 1024, 560, 560, 560, 1024] + - [61, 8264.0] + - - [1024, 512, 1, 3072, 1024, 1024, 1024, 3072] + - [51, 8576.0] + - - [64, 192, 64, 1280, 64, 64, 64, 1280] + - [56, 6526.0] + - - [64, 320, 64, 1280, 64, 64, 64, 1280] + - [64, 7366.0] + - - [64, 384, 64, 1280, 64, 64, 64, 1280] + - [86, 7147.0] + - - [64, 448, 64, 1280, 64, 64, 64, 1280] + - [91, 7413.0] + - - [64, 192, 64, 2048, 64, 64, 64, 2048] + - [63, 6438.0] + - - [64, 320, 64, 2048, 64, 64, 64, 2048] + - [69, 6644.0] + - - [64, 384, 64, 2048, 64, 64, 64, 2048] + - [91, 6538.0] + - - [64, 448, 64, 2048, 64, 64, 64, 2048] + - [113, 6577.0] + - - [1225, 64, 64, 192, 1225, 1225, 1225, 192] + - [50, 10506.0] + - - [1225, 64, 64, 256, 1225, 1225, 1225, 256] + - [99, 9974.0] + - - [1225, 64, 64, 288, 1225, 1225, 1225, 288] + - [50, 10511.0] + - - [5329, 80, 64, 64, 5329, 5329, 5329, 64] + - [94, 5601.0] + - - [3136, 64, 64, 64, 3136, 3136, 3136, 64] + - [73, 10276.0] + - - [3136, 64, 64, 256, 3136, 3136, 3136, 256] + - [77, 8923.0] + - - [64, 192, 32, 1280, 64, 64, 64, 1280] + - [63, 7320.0] + - - [64, 320, 32, 1280, 64, 64, 64, 1280] + - [63, 9233.0] + - - [64, 384, 32, 1280, 64, 64, 64, 1280] + - [106, 7717.0] + - - [64, 448, 32, 1280, 64, 64, 64, 1280] + - [92, 5015.0] + - - [64, 192, 32, 2048, 64, 64, 64, 2048] + - [85, 7256.0] + - - [64, 320, 32, 2048, 64, 64, 64, 2048] + - [63, 8168.0] + - - [64, 384, 32, 2048, 64, 64, 64, 2048] + - [63, 7346.0] + - - [64, 448, 32, 2048, 64, 64, 64, 2048] + - [112, 6902.0] + - - [5329, 80, 32, 64, 5329, 5329, 5329, 64] + - [96, 6281.0] + - - [3136, 64, 32, 256, 3136, 3136, 3136, 256] + - [77, 9611.0] + - - [196, 256, 32, 1024, 196, 196, 196, 1024] + - [104, 8409.0] + - - [256, 4096, 1, 4, 256, 256, 256, 4] + - [70, 797.0] + - - [960, 1024, 1, 1024, 960, 960, 960, 1024] + - [61, 9150.0] + - - [768, 768, 1, 768, 768, 768, 768, 768] + - [90, 9114.0] + - - [768, 768, 1, 384, 768, 768, 768, 384] + - [73, 8159.0] + - - [100, 128, 120, 512, 100, 100, 100, 512] + - [90, 7963.0] + - - [100, 128, 139, 512, 100, 100, 100, 512] + - [104, 8383.0] + - - [100, 128, 160, 512, 100, 100, 100, 512] + - [90, 8343.0] + - - [22500, 64, 1, 147, 22500, 22500, 22500, 147] + - [50, 8467.0] + - - [1024, 960, 1, 1024, 1024, 1024, 1024, 1024] + - [61, 10748.0] + - - [1024, 616, 1, 1024, 1024, 1024, 1024, 1024] + - [110, 9329.0] + - - [64, 128, 128, 128, 64, 64, 64, 128] + - [109, 4537.0] + - - [64, 128, 160, 128, 64, 64, 64, 128] + - [89, 5162.0] + - - [1024, 1024, 1, 2, 1024, 1024, 1024, 2] + - [93, 308.0] + - - [64, 128, 624, 128, 64, 64, 64, 128] + - [85, 7422.0] + - - [1024, 780, 1, 1024, 1024, 1024, 1024, 1024] + - [51, 8947.0] + - - [64, 128, 640, 128, 64, 64, 64, 128] + - [106, 7410.0] + - - [1024, 800, 1, 1024, 1024, 1024, 1024, 1024] + - [51, 9118.0] + - - [64, 128, 656, 128, 64, 64, 64, 128] + - [63, 7393.0] + - - [1024, 820, 1, 1024, 1024, 1024, 1024, 1024] + - [62, 9397.0] + - - [64, 512, 80, 512, 64, 64, 64, 512] + - [64, 7377.0] + - - [1024, 385, 1, 1024, 1024, 1024, 1024, 1024] + - [73, 7611.0] + - - [1024, 462, 1, 1024, 1024, 1024, 1024, 1024] + - [51, 7460.0] + - - [64, 128, 144, 128, 64, 64, 64, 128] + - [115, 3261.0] + - - [1024, 960, 1, 64, 1024, 1024, 1024, 64] + - [104, 5332.0] + - - [64, 512, 256, 512, 64, 64, 64, 512] + - [68, 7338.0] + - - [64, 512, 40, 512, 64, 64, 64, 512] + - [85, 7723.0] + - - [96, 1024, 64, 1024, 96, 96, 96, 1024] + - [90, 8132.0] + - - [96, 1024, 128, 1024, 96, 96, 96, 1024] + - [110, 8262.0] + - - [64, 1024, 256, 1024, 64, 64, 64, 1024] + - [112, 7367.0] + - - [64, 1024, 32, 1024, 64, 64, 64, 1024] + - [91, 6980.0] + - - [64, 1024, 64, 1024, 64, 64, 64, 1024] + - [68, 7166.0] + - - [64, 1024, 128, 1024, 64, 64, 64, 1024] + - [68, 7297.0] + - - [64, 128, 1024, 128, 64, 64, 64, 128] + - [68, 5662.0] + - - [1024, 864, 1, 1024, 1024, 1024, 1024, 1024] + - [62, 9714.0] + - - [1024, 864, 1, 512, 1024, 1024, 1024, 512] + - [73, 8879.0] + - - [256, 3456, 1, 128, 256, 256, 256, 128] + - [104, 6982.0] + - - [256, 4096, 1, 128, 256, 256, 256, 128] + - [50, 7342.0] + - - [480, 864, 1, 1024, 480, 480, 480, 1024] + - [83, 7841.0] + - - [512, 864, 1, 256, 512, 512, 512, 256] + - [104, 6236.0] + - - [64, 128, 1280, 128, 64, 64, 64, 128] + - [91, 5409.0] + - - [64, 128, 1312, 128, 64, 64, 64, 128] + - [68, 5358.0] + - - [64, 512, 192, 512, 64, 64, 64, 512] + - [68, 7294.0] + - - [256, 4096, 1, 1, 256, 256, 256, 1] + - [71, 206.0] + - - [64, 128, 2048, 128, 64, 64, 64, 128] + - [91, 5373.0] + - - [64, 128, 1536, 128, 64, 64, 64, 128] + - [91, 5359.0] + - - [64, 128, 192, 128, 64, 64, 64, 128] + - [66, 6729.0] + - - [64, 384, 144, 384, 64, 64, 64, 384] + - [63, 7933.0] + - - [64, 512, 48, 512, 64, 64, 64, 512] + - [63, 8182.0] + - - [64, 128, 256, 128, 64, 64, 64, 128] + - [106, 8051.0] + - - [64, 384, 192, 384, 64, 64, 64, 384] + - [66, 7311.0] + - - [950, 512, 2, 2048, 950, 950, 950, 2048] + - [61, 9399.0] + - - [3400, 256, 1, 1024, 3400, 3400, 3400, 1024] + - [51, 9458.0] + - - [3800, 256, 1, 1024, 3800, 3800, 3800, 1024] + - [84, 10515.0] + - - [850, 512, 2, 2048, 850, 850, 850, 2048] + - [105, 9651.0] + - - [805, 512, 2, 2048, 805, 805, 805, 2048] + - [62, 9196.0] + - - [864, 512, 2, 2048, 864, 864, 864, 2048] + - [84, 9776.0] + - - [950, 256, 2, 2048, 950, 950, 950, 2048] + - [75, 8836.0] + - - [888, 512, 2, 2048, 888, 888, 888, 2048] + - [84, 10023.0] + - - [51520, 64, 2, 256, 51520, 51520, 51520, 256] + - [65, 9963.0] + - - [46464, 64, 2, 256, 46464, 46464, 46464, 256] + - [65, 10583.0] + - - [49152, 64, 2, 256, 49152, 49152, 49152, 256] + - [65, 8369.0] + - - [1900, 512, 1, 1024, 1900, 1900, 1900, 1024] + - [51, 10781.0] + - - [1700, 512, 1, 1024, 1700, 1700, 1700, 1024] + - [84, 9284.0] + - - [1610, 512, 1, 1024, 1610, 1610, 1610, 1024] + - [51, 8776.0] + - - [1536, 512, 1, 1024, 1536, 1536, 1536, 1024] + - [73, 9771.0] + - - [1728, 512, 1, 1024, 1728, 1728, 1728, 1024] + - [84, 9394.0] + - - [1024, 1024, 1, 320, 1024, 1024, 1024, 320] + - [50, 9173.0] + - - [51520, 64, 2, 64, 51520, 51520, 51520, 64] + - [72, 10314.0] + - - [55296, 64, 2, 64, 55296, 55296, 55296, 64] + - [104, 10584.0] + - - [49152, 64, 2, 64, 49152, 49152, 49152, 64] + - [87, 8795.0] + - - [54400, 64, 2, 64, 54400, 54400, 54400, 64] + - [83, 10486.0] + - - [42240, 64, 2, 256, 42240, 42240, 42240, 256] + - [54, 10730.0] + - - [672, 512, 2, 2048, 672, 672, 672, 2048] + - [61, 8946.0] + - - [54400, 64, 2, 256, 54400, 54400, 54400, 256] + - [65, 9951.0] + - - [56832, 64, 2, 256, 56832, 56832, 56832, 256] + - [65, 9391.0] + - - [55296, 64, 2, 256, 55296, 55296, 55296, 256] + - [54, 9482.0] + - - [60800, 64, 2, 64, 60800, 60800, 60800, 64] + - [73, 10842.0] + - - [660, 512, 2, 2048, 660, 660, 660, 2048] + - [61, 8879.0] + - - [768, 512, 2, 2048, 768, 768, 768, 2048] + - [61, 10076.0] + - - [43008, 64, 2, 256, 43008, 43008, 43008, 256] + - [54, 10635.0] + - - [864, 256, 2, 2048, 864, 864, 864, 2048] + - [61, 9083.0] + - - [726, 512, 2, 2048, 726, 726, 726, 2048] + - [61, 9451.0] + - - [768, 256, 2, 2048, 768, 768, 768, 2048] + - [54, 7915.0] + - - [45632, 64, 2, 256, 45632, 45632, 45632, 256] + - [65, 10612.0] + - - [713, 512, 2, 2048, 713, 713, 713, 2048] + - [83, 9402.0] + - - [805, 256, 2, 2048, 805, 805, 805, 2048] + - [61, 8164.0] + - - [60800, 64, 2, 256, 60800, 60800, 60800, 256] + - [54, 9486.0] + - - [850, 256, 2, 2048, 850, 850, 850, 2048] + - [50, 8877.0] + - - [1024, 1024, 1, 81, 1024, 1024, 1024, 81] + - [73, 5625.0] + - - [96, 1024, 160, 1024, 96, 96, 96, 1024] + - [87, 8288.0] + - - [96, 1024, 40, 1024, 96, 96, 96, 1024] + - [110, 8046.0] + - - [96, 1024, 80, 1024, 96, 96, 96, 1024] + - [90, 8208.0] + - - [96, 1024, 96, 1024, 96, 96, 96, 1024] + - [90, 8218.0] + - - [96, 1024, 24, 1024, 96, 96, 96, 1024] + - [90, 8120.0] + - - [96, 1024, 48, 1024, 96, 96, 96, 1024] + - [108, 8051.0] + - - [96, 1024, 16, 1024, 96, 96, 96, 1024] + - [61, 8232.0] + - - [96, 1024, 32, 1024, 96, 96, 96, 1024] + - [90, 8074.0] + - - [64, 512, 320, 512, 64, 64, 64, 512] + - [91, 7386.0] + - - [64, 1024, 512, 1024, 64, 64, 64, 1024] + - [91, 7370.0] + - - [1024, 80, 1, 30522, 1024, 1024, 1024, 30522] + - [121, 5548.0] + - - [1024, 120, 1, 30522, 1024, 1024, 1024, 30522] + - [122, 8261.0] + - - [1024, 77, 1, 30522, 1024, 1024, 1024, 30522] + - [119, 5348.0] + - - [1024, 200, 1, 30522, 1024, 1024, 1024, 30522] + - [116, 7938.0] + - - [1024, 160, 1, 30522, 1024, 1024, 1024, 30522] + - [116, 8571.0] + - - [1024, 180, 1, 30522, 1024, 1024, 1024, 30522] + - [116, 9625.0] + - - [1024, 160, 1, 30528, 1024, 1024, 1024, 30528] + - [118, 8562.0] + - - [1024, 240, 1, 30528, 1024, 1024, 1024, 30528] + - [118, 9483.0] + - - [2560, 109, 1, 29000, 2560, 2560, 2560, 29000] + - [120, 9590.0] + - - [2560, 121, 1, 29000, 2560, 2560, 2560, 29000] + - [120, 10610.0] + - - [2560, 65, 1, 29000, 2560, 2560, 2560, 29000] + - [117, 5785.0] + - - [2560, 66, 1, 29000, 2560, 2560, 2560, 29000] + - [120, 5877.0] + - - [2560, 67, 1, 29000, 2560, 2560, 2560, 29000] + - [120, 5969.0] + - - [2560, 69, 1, 29000, 2560, 2560, 2560, 29000] + - [117, 6139.0] + - - [2560, 70, 1, 29000, 2560, 2560, 2560, 29000] + - [120, 6224.0] + - - [2560, 71, 1, 29000, 2560, 2560, 2560, 29000] + - [122, 6312.0] + - - [2560, 73, 1, 29000, 2560, 2560, 2560, 29000] + - [117, 6484.0] + - - [2560, 74, 1, 29000, 2560, 2560, 2560, 29000] + - [117, 6577.0] + - - [2560, 75, 1, 29000, 2560, 2560, 2560, 29000] + - [117, 6664.0] + - - [2560, 77, 1, 29000, 2560, 2560, 2560, 29000] + - [122, 6843.0] + - - [2560, 78, 1, 29000, 2560, 2560, 2560, 29000] + - [122, 6920.0] + - - [2560, 80, 1, 29000, 2560, 2560, 2560, 29000] + - [122, 7099.0] + - - [2560, 81, 1, 29000, 2560, 2560, 2560, 29000] + - [117, 7179.0] + - - [2560, 82, 1, 29000, 2560, 2560, 2560, 29000] + - [122, 7272.0] + - - [2560, 83, 1, 29000, 2560, 2560, 2560, 29000] + - [122, 7364.0] + - - [2560, 84, 1, 29000, 2560, 2560, 2560, 29000] + - [120, 7449.0] + - - [2560, 88, 1, 29000, 2560, 2560, 2560, 29000] + - [120, 7793.0] + - - [2560, 89, 1, 29000, 2560, 2560, 2560, 29000] + - [120, 7879.0] + - - [2560, 90, 1, 29000, 2560, 2560, 2560, 29000] + - [117, 7961.0] + - - [2560, 92, 1, 29000, 2560, 2560, 2560, 29000] + - [120, 8143.0] + - - [2560, 95, 1, 29000, 2560, 2560, 2560, 29000] + - [120, 8391.0] + - - [2560, 98, 1, 29000, 2560, 2560, 2560, 29000] + - [120, 8672.0] + - - [512, 200, 1, 32, 512, 512, 512, 32] + - [127, 1071.0] + - - [1024, 200, 1, 1, 1024, 1024, 1024, 1] + - [178, 108.0] + - - [512, 200, 1, 1, 512, 512, 512, 1] + - [152, 66.0] + - - [768, 320, 1, 768, 768, 768, 768, 768] + - [128, 4483.0] + - - [768, 160, 1, 768, 768, 768, 768, 768] + - [153, 3985.0] + - - [1024, 120, 1, 1024, 1024, 1024, 1024, 1024] + - [128, 3348.0] + - - [1024, 160, 1, 1024, 1024, 1024, 1024, 1024] + - [126, 4362.0] + - - [2368, 64, 1, 3328, 2368, 2368, 2368, 3328] + - [155, 4529.0] + - - [64, 3584, 1, 1280, 64, 64, 64, 1280] + - [141, 4517.0] + - - [1408, 64, 1, 128, 1408, 1408, 1408, 128] + - [153, 1667.0] + - - [1408, 64, 1, 1280, 1408, 1408, 1408, 1280] + - [126, 3200.0] + - - [4096, 32, 1, 4096, 4096, 4096, 4096, 4096] + - [126, 3762.0] + - - [3072, 64, 1, 1024, 3072, 3072, 3072, 1024] + - [126, 4077.0] + - - [2944, 64, 1, 256, 2944, 2944, 2944, 256] + - [126, 3295.0] + - - [448, 448, 1, 3328, 448, 448, 448, 3328] + - [179, 4523.0] + - - [1024, 256, 1, 3328, 1024, 1024, 1024, 3328] + - [179, 4362.0] + - - [6144, 32, 1, 2560, 6144, 6144, 6144, 2560] + - [166, 4276.0] + - - [1856, 64, 1, 1280, 1856, 1856, 1856, 1280] + - [126, 4161.0] + - - [704, 128, 1, 1280, 704, 704, 704, 1280] + - [179, 3200.0] + - - [4288, 64, 1, 3328, 4288, 4288, 4288, 3328] + - [192, 4458.0] + - - [64, 3584, 1, 3328, 64, 64, 64, 3328] + - [141, 4623.0] + - - [1760, 128, 1, 1760, 1760, 1760, 1760, 1760] + - [193, 4386.0] + - - [704, 256, 1, 128, 704, 704, 704, 128] + - [144, 2950.0] + - - [128, 1408, 1, 128, 128, 128, 128, 128] + - [126, 2481.0] + - - [1024, 256, 1, 256, 1024, 1024, 1024, 256] + - [153, 3562.0] + - - [448, 448, 1, 256, 448, 448, 448, 256] + - [168, 3670.0] + - - [7680, 32, 1, 2560, 7680, 7680, 7680, 2560] + - [126, 4529.0] + - - [128, 1024, 1, 3328, 128, 128, 128, 3328] + - [128, 3834.0] + - - [64, 1856, 1, 1280, 64, 64, 64, 1280] + - [140, 4161.0] + - - [256, 1024, 1, 256, 256, 256, 256, 256] + - [140, 3573.0] + - - [1024, 128, 1, 1280, 1024, 1024, 1024, 1280] + - [128, 3684.0] + - - [3072, 32, 1, 1024, 3072, 3072, 3072, 1024] + - [140, 3349.0] + - - [448, 256, 1, 3328, 448, 448, 448, 3328] + - [153, 4353.0] + - - [128, 1024, 1, 128, 128, 128, 128, 128] + - [126, 2051.0] + - - [128, 704, 1, 1280, 128, 128, 128, 1280] + - [192, 3176.0] + - - [1856, 128, 1, 3328, 1856, 1856, 1856, 3328] + - [168, 4596.0] + - - [35, 8457, 1, 1760, 35, 35, 35, 1760] + - [155, 2628.0] + - - [64, 2944, 1, 128, 64, 64, 64, 128] + - [166, 2577.0] + - - [8448, 32, 1, 2816, 8448, 8448, 8448, 2816] + - [153, 4254.0] + - - [1408, 128, 1, 1280, 1408, 1408, 1408, 1280] + - [166, 4070.0] + - - [128, 1856, 1, 1280, 128, 128, 128, 1280] + - [141, 4751.0] + - - [256, 448, 1, 256, 256, 256, 256, 256] + - [140, 2839.0] + - - [2048, 128, 1, 2048, 2048, 2048, 2048, 2048] + - [166, 4292.0] + - - [128, 1856, 1, 128, 128, 128, 128, 128] + - [196, 3456.0] + - - [64, 1408, 1, 3328, 64, 64, 64, 3328] + - [192, 3516.0] + - - [128, 1408, 1, 256, 128, 128, 128, 256] + - [140, 3733.0] + - - [35, 8457, 1, 2560, 35, 35, 35, 2560] + - [140, 2682.0] + - - [4288, 64, 1, 128, 4288, 4288, 4288, 128] + - [179, 3818.0] + - - [256, 448, 1, 3328, 256, 256, 256, 3328] + - [126, 4454.0] + - - [64, 2368, 1, 1280, 64, 64, 64, 1280] + - [141, 4417.0] + - - [2368, 64, 1, 256, 2368, 2368, 2368, 256] + - [126, 3796.0] + - - [1024, 128, 1, 128, 1024, 1024, 1024, 128] + - [155, 2041.0] + - - [704, 128, 1, 3328, 704, 704, 704, 3328] + - [153, 3445.0] + - - [4288, 64, 1, 1280, 4288, 4288, 4288, 1280] + - [140, 4305.0] + - - [2560, 64, 1, 2560, 2560, 2560, 2560, 2560] + - [168, 4658.0] + - - [1408, 128, 1, 128, 1408, 1408, 1408, 128] + - [140, 2552.0] + - - [128, 1024, 1, 1280, 128, 128, 128, 1280] + - [155, 3658.0] + - - [2944, 64, 1, 128, 2944, 2944, 2944, 128] + - [166, 3397.0] + - - [1024, 128, 1, 3328, 1024, 1024, 1024, 3328] + - [168, 3794.0] + - - [704, 128, 1, 256, 704, 704, 704, 256] + - [153, 2240.0] + - - [448, 256, 1, 1280, 448, 448, 448, 1280] + - [153, 4009.0] + - - [64, 4288, 1, 3328, 64, 64, 64, 3328] + - [140, 4638.0] + - - [2944, 64, 1, 3328, 2944, 2944, 2944, 3328] + - [140, 4215.0] + - - [1856, 128, 1, 1280, 1856, 1856, 1856, 1280] + - [155, 4455.0] + - - [64, 3584, 1, 256, 64, 64, 64, 256] + - [141, 3520.0] + - - [3584, 64, 1, 128, 3584, 3584, 3584, 128] + - [140, 2862.0] + - - [256, 1024, 1, 1280, 256, 256, 256, 1280] + - [126, 4254.0] + - - [64, 4288, 1, 128, 64, 64, 64, 128] + - [166, 3071.0] + - - [3584, 64, 1, 1280, 3584, 3584, 3584, 1280] + - [180, 4256.0] + - - [1408, 128, 1, 3328, 1408, 1408, 1408, 3328] + - [153, 4146.0] + - - [64, 2944, 1, 3328, 64, 64, 64, 3328] + - [140, 4285.0] + - - [64, 1856, 1, 256, 64, 64, 64, 256] + - [140, 2853.0] + - - [128, 1500, 1, 1280, 128, 128, 128, 1280] + - [192, 4197.0] + - - [35, 8457, 1, 4096, 35, 35, 35, 4096] + - [192, 2629.0] + - - [256, 704, 1, 256, 256, 256, 256, 256] + - [126, 3621.0] + - - [2368, 64, 1, 128, 2368, 2368, 2368, 128] + - [128, 3439.0] + - - [256, 1024, 1, 128, 256, 256, 256, 128] + - [126, 3804.0] + - - [64, 1408, 1, 128, 64, 64, 64, 128] + - [166, 2298.0] + - - [704, 256, 1, 3328, 704, 704, 704, 3328] + - [153, 4153.0] + - - [35, 8457, 1, 2048, 35, 35, 35, 2048] + - [140, 2590.0] + - - [64, 2944, 1, 256, 64, 64, 64, 256] + - [192, 3242.0] + - - [448, 256, 1, 128, 448, 448, 448, 128] + - [166, 1984.0] + - - [64, 1408, 1, 1280, 64, 64, 64, 1280] + - [140, 3167.0] + - - [1408, 128, 1, 256, 1408, 1408, 1408, 256] + - [192, 3151.0] + - - [64, 2944, 1, 1280, 64, 64, 64, 1280] + - [166, 4120.0] + - - [128, 704, 1, 128, 128, 128, 128, 128] + - [192, 1580.0] + - - [256, 448, 1, 1280, 256, 256, 256, 1280] + - [140, 4031.0] + - - [704, 256, 1, 1280, 704, 704, 704, 1280] + - [179, 3979.0] + - - [64, 2368, 1, 3328, 64, 64, 64, 3328] + - [168, 4464.0] + - - [1856, 64, 1, 128, 1856, 1856, 1856, 128] + - [166, 2089.0] + - - [4096, 64, 1, 4096, 4096, 4096, 4096, 4096] + - [140, 4319.0] + - - [704, 128, 1, 128, 704, 704, 704, 128] + - [179, 2344.0] + - - [256, 704, 1, 3328, 256, 256, 256, 3328] + - [179, 4097.0] + - - [256, 448, 1, 128, 256, 256, 256, 128] + - [126, 1932.0] + - - [64, 3584, 1, 128, 64, 64, 64, 128] + - [128, 2884.0] + - - [1024, 128, 1, 256, 1024, 1024, 1024, 256] + - [141, 2689.0] + - - [2944, 64, 1, 1280, 2944, 2944, 2944, 1280] + - [126, 3969.0] + - - [128, 1408, 1, 3328, 128, 128, 128, 3328] + - [153, 4119.0] + - - [1408, 64, 1, 256, 1408, 1408, 1408, 256] + - [140, 2185.0] + - - [64, 1856, 1, 128, 64, 64, 64, 128] + - [192, 2038.0] + - - [64, 2368, 1, 256, 64, 64, 64, 256] + - [168, 3119.0] + - - [1856, 128, 1, 128, 1856, 1856, 1856, 128] + - [128, 3023.0] + - - [2368, 64, 1, 1280, 2368, 2368, 2368, 1280] + - [126, 4265.0] + - - [4288, 64, 1, 256, 4288, 4288, 4288, 256] + - [166, 3523.0] + - - [64, 4288, 1, 1280, 64, 64, 64, 1280] + - [192, 4414.0] + - - [1408, 64, 1, 3328, 1408, 1408, 1408, 3328] + - [179, 3453.0] + - - [1024, 256, 1, 128, 1024, 1024, 1024, 128] + - [179, 2980.0] + - - [256, 704, 1, 128, 256, 256, 256, 128] + - [153, 2486.0] + - - [448, 448, 1, 1280, 448, 448, 448, 1280] + - [179, 4407.0] + - - [1024, 256, 1, 1280, 1024, 1024, 1024, 1280] + - [153, 4269.0] + - - [128, 1024, 1, 256, 128, 128, 128, 256] + - [168, 2719.0] + - - [3584, 64, 1, 3328, 3584, 3584, 3584, 3328] + - [180, 4415.0] + - - [256, 1024, 1, 3328, 256, 256, 256, 3328] + - [153, 4400.0] + - - [1856, 64, 1, 3328, 1856, 1856, 1856, 3328] + - [179, 4515.0] + - - [448, 256, 1, 256, 448, 448, 448, 256] + - [140, 2807.0] + - - [4608, 32, 1, 1536, 4608, 4608, 4608, 1536] + - [140, 3921.0] + - - [128, 704, 1, 256, 128, 128, 128, 256] + - [168, 2330.0] + - - [3584, 64, 1, 256, 3584, 3584, 3584, 256] + - [128, 3495.0] + - - [64, 1856, 1, 3328, 64, 64, 64, 3328] + - [166, 4470.0] + - - [128, 704, 1, 3328, 128, 128, 128, 3328] + - [179, 3435.0] + - - [128, 1856, 1, 256, 128, 128, 128, 256] + - [168, 3708.0] + - - [64, 4288, 1, 256, 64, 64, 64, 256] + - [166, 3674.0] + - - [1856, 64, 1, 256, 1856, 1856, 1856, 256] + - [144, 3128.0] + - - [2560, 32, 1, 2560, 2560, 2560, 2560, 2560] + - [184, 3661.0] + - - [256, 704, 1, 1280, 256, 256, 256, 1280] + - [140, 3953.0] + - - [64, 2368, 1, 128, 64, 64, 64, 128] + - [192, 2320.0] + - - [176, 1500, 1, 1408, 176, 176, 176, 1408] + - [192, 4201.0] + - - [1856, 128, 1, 256, 1856, 1856, 1856, 256] + - [168, 3801.0] + - - [2048, 64, 1, 2048, 2048, 2048, 2048, 2048] + - [155, 3717.0] + - - [64, 1408, 1, 256, 64, 64, 64, 256] + - [140, 2168.0] + - - [128, 1408, 1, 1280, 128, 128, 128, 1280] + - [166, 3937.0] + - - [128, 1856, 1, 3328, 128, 128, 128, 3328] + - [168, 4727.0] + - - [1760, 64, 1, 1760, 1760, 1760, 1760, 1760] + - [153, 4048.0] + - - [448, 448, 1, 128, 448, 448, 448, 128] + - [124, 3260.0] + - - [704, 256, 1, 256, 704, 704, 704, 256] + - [153, 3130.0] + - - [1024, 256, 1, 1024, 1024, 1024, 1024, 1024] + - [153, 4201.0] + - - [512, 200, 1, 512, 512, 512, 512, 512] + - [140, 3070.0] + - - [1024, 200, 1, 1024, 1024, 1024, 1024, 1024] + - [153, 3780.0] + - - [512, 256, 1, 1024, 512, 512, 512, 1024] + - [193, 3610.0] + - - [1024, 256, 1, 2048, 1024, 1024, 1024, 2048] + - [179, 4341.0] + - - [1024, 200, 1, 4096, 1024, 1024, 1024, 4096] + - [179, 3908.0] + - - [1024, 200, 1, 512, 1024, 1024, 1024, 512] + - [126, 3816.0] + - - [512, 200, 1, 1024, 512, 512, 512, 1024] + - [192, 3769.0] + - - [512, 256, 1, 512, 512, 512, 512, 512] + - [168, 3631.0] + - - [1024, 256, 1, 4096, 1024, 1024, 1024, 4096] + - [179, 4367.0] + - - [1024, 200, 1, 2048, 1024, 1024, 1024, 2048] + - [192, 3913.0] + - - [1024, 256, 1, 512, 1024, 1024, 1024, 512] + - [140, 3987.0] + - - [512, 200, 1, 2048, 512, 512, 512, 2048] + - [126, 3752.0] + - - [64, 32, 1984, 32, 64, 64, 64, 32] + - [140, 3888.0] + - - [64, 38, 1680, 38, 64, 64, 64, 38] + - [192, 3084.0] + - - [64, 59, 1088, 59, 64, 64, 64, 59] + - [166, 4584.0] + - - [64, 54, 1184, 54, 64, 64, 64, 54] + - [153, 4302.0] + - - [64, 49, 1296, 49, 64, 64, 64, 49] + - [126, 3827.0] + - - [64, 45, 1424, 45, 64, 64, 64, 45] + - [166, 3605.0] + - - [64, 35, 1808, 35, 64, 64, 64, 35] + - [179, 2776.0] + - - [64, 41, 1552, 41, 64, 64, 64, 41] + - [140, 3212.0] + - - [512, 512, 1, 1024, 512, 512, 512, 1024] + - [153, 4401.0] + - - [512, 512, 1, 2000, 512, 512, 512, 2000] + - [179, 4308.0] + - - [100, 1024, 1, 2048, 100, 100, 100, 2048] + - [193, 2910.0] + - - [100, 2000, 1, 1024, 100, 100, 100, 1024] + - [166, 3205.0] + - - [128, 2000, 1, 100, 128, 128, 128, 100] + - [179, 2623.0] + - - [64, 2000, 1, 1024, 64, 64, 64, 1024] + - [193, 3426.0] + - - [100, 1024, 1, 1024, 100, 100, 100, 1024] + - [141, 2774.0] + - - [128, 1024, 1, 512, 128, 128, 128, 512] + - [193, 3226.0] + - - [512, 500, 1, 2000, 512, 512, 512, 2000] + - [153, 4250.0] + - - [500, 512, 1, 100, 500, 500, 500, 100] + - [126, 2623.0] + - - [100, 1024, 1, 500, 100, 100, 100, 500] + - [193, 2485.0] + - - [128, 2000, 1, 512, 128, 128, 128, 512] + - [192, 3885.0] + - - [256, 1024, 1, 100, 256, 256, 256, 100] + - [153, 2777.0] + - - [200, 500, 1, 1024, 200, 200, 200, 1024] + - [138, 2936.0] + - - [100, 2000, 1, 512, 100, 100, 100, 512] + - [166, 3028.0] + - - [200, 512, 1, 100, 200, 200, 200, 100] + - [151, 1438.0] + - - [64, 2048, 1, 10, 64, 64, 64, 10] + - [149, 392.0] + - - [64, 2048, 1, 500, 64, 64, 64, 500] + - [155, 3163.0] + - - [512, 512, 1, 512, 512, 512, 512, 512] + - [126, 4011.0] + - - [500, 500, 1, 2000, 500, 500, 500, 2000] + - [153, 4147.0] + - - [256, 500, 1, 10, 256, 256, 256, 10] + - [152, 282.0] + - - [512, 500, 1, 512, 512, 512, 512, 512] + - [153, 3901.0] + - - [128, 1024, 1, 2000, 128, 128, 128, 2000] + - [128, 3908.0] + - - [100, 2000, 1, 2048, 100, 100, 100, 2048] + - [140, 3282.0] + - - [256, 512, 1, 10, 256, 256, 256, 10] + - [192, 293.0] + - - [64, 2000, 1, 2048, 64, 64, 64, 2048] + - [166, 3656.0] + - - [64, 2048, 1, 512, 64, 64, 64, 512] + - [141, 3220.0] + - - [64, 2000, 1, 10, 64, 64, 64, 10] + - [125, 287.0] + - - [128, 1024, 1, 500, 128, 128, 128, 500] + - [193, 3178.0] + - - [200, 512, 1, 1024, 200, 200, 200, 1024] + - [190, 3003.0] + - - [128, 2048, 1, 10, 128, 128, 128, 10] + - [179, 535.0] + - - [64, 2048, 1, 100, 64, 64, 64, 100] + - [126, 1820.0] + - - [64, 2000, 1, 100, 64, 64, 64, 100] + - [180, 1667.0] + - - [200, 500, 1, 100, 200, 200, 200, 100] + - [177, 1377.0] + - - [500, 500, 1, 500, 500, 500, 500, 500] + - [153, 3765.0] + - - [128, 2048, 1, 512, 128, 128, 128, 512] + - [192, 3973.0] + - - [100, 2048, 1, 500, 100, 100, 100, 500] + - [179, 3086.0] + - - [500, 500, 1, 2048, 500, 500, 500, 2048] + - [179, 4116.0] + - - [128, 2000, 1, 2000, 128, 128, 128, 2000] + - [179, 4222.0] + - - [256, 500, 1, 1024, 256, 256, 256, 1024] + - [168, 3514.0] + - - [64, 2048, 1, 2000, 64, 64, 64, 2000] + - [180, 3775.0] + - - [100, 2048, 1, 1024, 100, 100, 100, 1024] + - [192, 3263.0] + - - [128, 1024, 1, 100, 128, 128, 128, 100] + - [168, 1752.0] + - - [256, 1024, 1, 2048, 256, 256, 256, 2048] + - [153, 4314.0] + - - [500, 512, 1, 512, 500, 500, 500, 512] + - [153, 3880.0] + - - [256, 500, 1, 2000, 256, 256, 256, 2000] + - [155, 3716.0] + - - [256, 512, 1, 100, 256, 256, 256, 100] + - [155, 1725.0] + - - [128, 2000, 1, 500, 128, 128, 128, 500] + - [179, 3846.0] + - - [200, 512, 1, 2048, 200, 200, 200, 2048] + - [164, 3191.0] + - - [64, 2048, 1, 2048, 64, 64, 64, 2048] + - [168, 3724.0] + - - [200, 1024, 1, 2048, 200, 200, 200, 2048] + - [166, 3383.0] + - - [512, 512, 1, 10, 512, 512, 512, 10] + - [188, 627.0] + - - [512, 500, 1, 10, 512, 512, 512, 10] + - [179, 514.0] + - - [200, 512, 1, 10, 200, 200, 200, 10] + - [129, 236.0] + - - [500, 500, 1, 1024, 500, 500, 500, 1024] + - [179, 3994.0] + - - [256, 1024, 1, 512, 256, 256, 256, 512] + - [153, 3992.0] + - - [256, 500, 1, 512, 256, 256, 256, 512] + - [193, 3139.0] + - - [200, 500, 1, 2048, 200, 200, 200, 2048] + - [138, 3139.0] + - - [100, 2000, 1, 10, 100, 100, 100, 10] + - [153, 398.0] + - - [100, 2048, 1, 2048, 100, 100, 100, 2048] + - [166, 3358.0] + - - [128, 1024, 1, 2048, 128, 128, 128, 2048] + - [168, 3704.0] + - - [100, 2000, 1, 500, 100, 100, 100, 500] + - [179, 3014.0] + - - [100, 2048, 1, 100, 100, 100, 100, 100] + - [153, 2065.0] + - - [100, 1024, 1, 10, 100, 100, 100, 10] + - [159, 225.0] + - - [100, 1024, 1, 2000, 100, 100, 100, 2000] + - [128, 2943.0] + - - [256, 512, 1, 500, 256, 256, 256, 500] + - [155, 3188.0] + - - [100, 2000, 1, 100, 100, 100, 100, 100] + - [153, 2045.0] + - - [128, 1024, 1, 10, 128, 128, 128, 10] + - [184, 301.0] + - - [100, 2048, 1, 10, 100, 100, 100, 10] + - [157, 402.0] + - - [512, 500, 1, 100, 512, 512, 512, 100] + - [140, 2639.0] + - - [128, 2000, 1, 1024, 128, 128, 128, 1024] + - [140, 4088.0] + - - [200, 1024, 1, 500, 200, 200, 200, 500] + - [126, 3084.0] + - - [256, 512, 1, 2000, 256, 256, 256, 2000] + - [128, 3770.0] + - - [256, 1024, 1, 2000, 256, 256, 256, 2000] + - [179, 4331.0] + - - [200, 512, 1, 500, 200, 200, 200, 500] + - [177, 2689.0] + - - [64, 2000, 1, 512, 64, 64, 64, 512] + - [140, 3060.0] + - - [200, 1024, 1, 100, 200, 200, 200, 100] + - [126, 2107.0] + - - [200, 1024, 1, 1024, 200, 200, 200, 1024] + - [192, 3269.0] + - - [500, 512, 1, 2000, 500, 500, 500, 2000] + - [153, 4222.0] + - - [200, 500, 1, 512, 200, 200, 200, 512] + - [190, 2583.0] + - - [256, 512, 1, 512, 256, 256, 256, 512] + - [141, 3245.0] + - - [512, 512, 1, 500, 512, 512, 512, 500] + - [153, 3962.0] + - - [100, 1024, 1, 512, 100, 100, 100, 512] + - [193, 2494.0] + - - [128, 1024, 1, 1024, 128, 128, 128, 1024] + - [193, 3568.0] + - - [200, 512, 1, 2000, 200, 200, 200, 2000] + - [124, 3241.0] + - - [256, 1024, 1, 500, 256, 256, 256, 500] + - [126, 3979.0] + - - [200, 1024, 1, 512, 200, 200, 200, 512] + - [179, 3123.0] + - - [256, 500, 1, 500, 256, 256, 256, 500] + - [141, 3137.0] + - - [256, 500, 1, 2048, 256, 256, 256, 2048] + - [128, 3690.0] + - - [512, 500, 1, 1024, 512, 512, 512, 1024] + - [153, 4115.0] + - - [256, 512, 1, 1024, 256, 256, 256, 1024] + - [168, 3581.0] + - - [128, 2048, 1, 1024, 128, 128, 128, 1024] + - [140, 4344.0] + - - [500, 512, 1, 500, 500, 500, 500, 500] + - [179, 4150.0] + - - [200, 500, 1, 500, 200, 200, 200, 500] + - [124, 3016.0] + - - [64, 2000, 1, 2000, 64, 64, 64, 2000] + - [126, 3795.0] + - - [128, 2000, 1, 2048, 128, 128, 128, 2048] + - [140, 4306.0] + - - [256, 1024, 1, 10, 256, 256, 256, 10] + - [141, 868.0] + - - [256, 1024, 1, 1024, 256, 256, 256, 1024] + - [166, 4200.0] + - - [500, 500, 1, 10, 500, 500, 500, 10] + - [125, 483.0] + - - [256, 500, 1, 100, 256, 256, 256, 100] + - [153, 1730.0] + - - [256, 512, 1, 2048, 256, 256, 256, 2048] + - [128, 3753.0] + - - [200, 1024, 1, 2000, 200, 200, 200, 2000] + - [179, 3465.0] + - - [100, 2048, 1, 512, 100, 100, 100, 512] + - [192, 3312.0] + - - [512, 500, 1, 2048, 512, 512, 512, 2048] + - [153, 4201.0] + - - [128, 2048, 1, 2000, 128, 128, 128, 2000] + - [153, 4289.0] + - - [500, 512, 1, 2048, 500, 500, 500, 2048] + - [179, 4302.0] + - - [200, 500, 1, 2000, 200, 200, 200, 2000] + - [124, 3223.0] + - - [500, 512, 1, 1024, 500, 500, 500, 1024] + - [126, 4101.0] + - - [100, 1024, 1, 100, 100, 100, 100, 100] + - [141, 1969.0] + - - [64, 2000, 1, 500, 64, 64, 64, 500] + - [128, 3463.0] + - - [128, 2048, 1, 2048, 128, 128, 128, 2048] + - [166, 4337.0] + - - [128, 2000, 1, 10, 128, 128, 128, 10] + - [184, 914.0] + - - [500, 512, 1, 10, 500, 500, 500, 10] + - [125, 970.0] + - - [200, 512, 1, 512, 200, 200, 200, 512] + - [138, 2936.0] + - - [512, 500, 1, 500, 512, 512, 512, 500] + - [153, 4197.0] + - - [512, 512, 1, 100, 512, 512, 512, 100] + - [192, 2754.0] + - - [500, 500, 1, 512, 500, 500, 500, 512] + - [153, 3769.0] + - - [128, 2048, 1, 500, 128, 128, 128, 500] + - [179, 3943.0] + - - [200, 500, 1, 10, 200, 200, 200, 10] + - [123, 242.0] + - - [100, 2048, 1, 2000, 100, 100, 100, 2000] + - [140, 3405.0] + - - [200, 1024, 1, 10, 200, 200, 200, 10] + - [126, 430.0] + - - [64, 2048, 1, 1024, 64, 64, 64, 1024] + - [193, 3554.0] + - - [100, 2000, 1, 2000, 100, 100, 100, 2000] + - [126, 3317.0] + - - [500, 500, 1, 100, 500, 500, 500, 100] + - [179, 2588.0] + - - [128, 2048, 1, 100, 128, 128, 128, 100] + - [179, 2731.0] + - - [4096, 64, 1, 2048, 4096, 4096, 4096, 2048] + - [140, 4321.0] + - - [4096, 91, 1, 2048, 4096, 4096, 4096, 2048] + - [166, 4424.0] + - - [4096, 86, 1, 3072, 4096, 4096, 4096, 3072] + - [126, 4262.0] + - - [4096, 49, 1, 2048, 4096, 4096, 4096, 2048] + - [179, 3282.0] + - - [4096, 91, 1, 3072, 4096, 4096, 4096, 3072] + - [166, 4488.0] + - - [4096, 64, 1, 3072, 4096, 4096, 4096, 3072] + - [140, 4312.0] + - - [4096, 63, 1, 3072, 4096, 4096, 4096, 3072] + - [192, 4198.0] + - - [4096, 96, 1, 2048, 4096, 4096, 4096, 2048] + - [179, 4670.0] + - - [4096, 32, 1, 2048, 4096, 4096, 4096, 2048] + - [153, 3657.0] + - - [4096, 49, 1, 3072, 4096, 4096, 4096, 3072] + - [140, 3256.0] + - - [1024, 96, 1, 1024, 1024, 1024, 1024, 1024] + - [126, 3408.0] + - - [4096, 86, 1, 2048, 4096, 4096, 4096, 2048] + - [153, 4191.0] + - - [4096, 96, 1, 3072, 4096, 4096, 4096, 3072] + - [179, 4717.0] + - - [4096, 35, 1, 3072, 4096, 4096, 4096, 3072] + - [179, 2352.0] + - - [4096, 50, 1, 2048, 4096, 4096, 4096, 2048] + - [126, 3368.0] + - - [36548, 32, 1, 1024, 36548, 36548, 36548, 1024] + - [166, 4829.0] + - - [4096, 32, 1, 3072, 4096, 4096, 4096, 3072] + - [126, 3691.0] + - - [1024, 243, 1, 1024, 1024, 1024, 1024, 1024] + - [126, 4127.0] + - - [4096, 50, 1, 3072, 4096, 4096, 4096, 3072] + - [126, 3312.0] + - - [1024, 128, 1, 1024, 1024, 1024, 1024, 1024] + - [128, 3606.0] + - - [1024, 216, 1, 1024, 1024, 1024, 1024, 1024] + - [179, 4046.0] + - - [4096, 35, 1, 2048, 4096, 4096, 4096, 2048] + - [126, 2314.0] + - - [4096, 63, 1, 2048, 4096, 4096, 4096, 2048] + - [153, 4113.0] + - - [289, 256, 1, 1568, 289, 289, 289, 1568] + - [192, 3213.0] + - - [3025, 64, 1, 363, 3025, 3025, 3025, 363] + - [153, 3445.0] + - - [784, 32, 32, 192, 784, 784, 784, 192] + - [140, 4473.0] + - - [289, 256, 1, 2016, 289, 289, 289, 2016] + - [131, 3212.0] + - - [21609, 32, 1, 288, 21609, 21609, 21609, 288] + - [153, 4495.0] + - - [1225, 192, 1, 1728, 1225, 1225, 1225, 1728] + - [179, 4433.0] + - - [784, 96, 1, 800, 784, 784, 784, 800] + - [192, 2832.0] + - - [1225, 64, 1, 1200, 1225, 1225, 1225, 1200] + - [131, 3189.0] + - - [729, 192, 1, 1600, 729, 729, 729, 1600] + - [128, 4005.0] + - - [6272, 32, 1, 528, 6272, 6272, 6272, 528] + - [126, 3826.0] + - - [1568, 160, 1, 832, 1568, 1568, 1568, 832] + - [179, 3985.0] + - - [289, 256, 1, 1792, 289, 289, 289, 1792] + - [158, 3190.0] + - - [784, 32, 32, 256, 784, 784, 784, 256] + - [126, 4386.0] + - - [6272, 32, 1, 512, 6272, 6272, 6272, 512] + - [179, 3829.0] + - - [289, 384, 1, 3456, 289, 289, 289, 3456] + - [126, 4209.0] + - - [289, 384, 1, 2592, 289, 289, 289, 2592] + - [192, 4182.0] + - - [1225, 32, 32, 192, 1225, 1225, 1225, 192] + - [166, 4595.0] + - - [1568, 128, 1, 832, 1568, 1568, 1568, 832] + - [179, 4413.0] + - - [1225, 48, 32, 288, 1225, 1225, 1225, 288] + - [179, 3734.0] + - - [1001, 128, 1, 2048, 1001, 1001, 1001, 2048] + - [128, 3799.0] + - - [2048, 174, 1, 512, 2048, 2048, 2048, 512] + - [179, 4001.0] + - - [2048, 189, 1, 512, 2048, 2048, 2048, 512] + - [192, 4316.0] + - - [64, 35, 904, 35, 64, 64, 64, 35] + - [179, 2334.0] + - - [64, 103, 16, 103, 64, 64, 64, 103] + - [123, 1692.0] + - - [64, 104, 16, 103, 64, 64, 64, 103] + - [126, 2109.0] + - - [64, 123, 16, 112, 64, 64, 64, 112] + - [155, 1837.0] + - - [64, 123, 16, 123, 64, 64, 64, 123] + - [152, 2060.0] + - - [512, 540, 1, 512, 512, 512, 512, 512] + - [140, 4154.0] + - - [512, 540, 1, 2048, 512, 512, 512, 2048] + - [126, 4496.0] + - - [512, 550, 1, 512, 512, 512, 512, 512] + - [128, 3888.0] + - - [512, 550, 1, 2048, 512, 512, 512, 2048] + - [153, 4248.0] + - - [512, 560, 1, 512, 512, 512, 512, 512] + - [155, 3959.0] + - - [512, 560, 1, 2048, 512, 512, 512, 2048] + - [128, 4238.0] + - - [2048, 160, 1, 512, 2048, 2048, 2048, 512] + - [192, 4376.0] + - - [2048, 184, 1, 512, 2048, 2048, 2048, 512] + - [192, 4242.0] + - - [512, 160, 1, 2048, 512, 512, 512, 2048] + - [158, 3564.0] + - - [512, 174, 1, 2048, 512, 512, 512, 2048] + - [126, 3263.0] + - - [512, 182, 1, 512, 512, 512, 512, 512] + - [166, 2771.0] + - - [512, 184, 1, 512, 512, 512, 512, 512] + - [126, 2817.0] + - - [512, 184, 1, 2048, 512, 512, 512, 2048] + - [126, 3476.0] + - - [512, 189, 1, 512, 512, 512, 512, 512] + - [166, 2887.0] + - - [512, 189, 1, 2048, 512, 512, 512, 2048] + - [126, 3553.0] + - - [512, 198, 1, 2048, 512, 512, 512, 2048] + - [179, 3706.0] + - - [512, 206, 1, 512, 512, 512, 512, 512] + - [192, 3154.0] + - - [512, 207, 1, 2048, 512, 512, 512, 2048] + - [179, 3865.0] + - - [512, 208, 1, 512, 512, 512, 512, 512] + - [192, 3152.0] + - - [512, 208, 1, 2048, 512, 512, 512, 2048] + - [153, 3875.0] + - - [512, 224, 1, 512, 512, 512, 512, 512] + - [166, 3446.0] + - - [512, 245, 1, 2048, 512, 512, 512, 2048] + - [128, 3605.0] + - - [512, 246, 1, 512, 512, 512, 512, 512] + - [193, 3112.0] + - - [512, 246, 1, 2048, 512, 512, 512, 2048] + - [128, 3590.0] + - - [512, 264, 1, 512, 512, 512, 512, 512] + - [193, 3283.0] + - - [512, 264, 1, 2048, 512, 512, 512, 2048] + - [128, 3853.0] + - - [512, 401, 1, 2048, 512, 512, 512, 2048] + - [128, 3921.0] + - - [512, 439, 1, 2048, 512, 512, 512, 2048] + - [128, 4285.0] + - - [512, 443, 1, 2048, 512, 512, 512, 2048] + - [193, 4310.0] + - - [512, 446, 1, 2048, 512, 512, 512, 2048] + - [168, 4337.0] + - - [512, 455, 1, 512, 512, 512, 512, 512] + - [126, 3986.0] + - - [512, 465, 1, 512, 512, 512, 512, 512] + - [179, 4074.0] + - - [512, 465, 1, 2048, 512, 512, 512, 2048] + - [126, 4453.0] + - - [512, 468, 1, 512, 512, 512, 512, 512] + - [153, 4089.0] + - - [512, 468, 1, 2048, 512, 512, 512, 2048] + - [126, 4447.0] + - - [512, 476, 1, 512, 512, 512, 512, 512] + - [153, 4154.0] + - - [512, 493, 1, 512, 512, 512, 512, 512] + - [153, 3828.0] + - - [512, 493, 1, 2048, 512, 512, 512, 2048] + - [179, 4143.0] + - - [512, 495, 1, 2048, 512, 512, 512, 2048] + - [153, 4191.0] + - - [512, 511, 1, 2048, 512, 512, 512, 2048] + - [179, 4328.0] + - - [512, 512, 1, 2048, 512, 512, 512, 2048] + - [179, 4338.0] + - - [64, 59, 512, 59, 64, 64, 64, 59] + - [184, 3463.0] + - - [64, 59, 544, 59, 64, 64, 64, 59] + - [158, 3738.0] + - - [256, 1024, 1, 1, 256, 256, 256, 1] + - [178, 62.0] + - - [257, 1024, 1, 4096, 257, 257, 257, 4096] + - [128, 3930.0] + - - [512, 215, 1, 2048, 512, 512, 512, 2048] + - [126, 4033.0] + - - [512, 256, 1, 2048, 512, 512, 512, 2048] + - [128, 3785.0] + - - [560, 200, 1, 1024, 560, 560, 560, 1024] + - [128, 3053.0] + - - [768, 215, 1, 2048, 768, 768, 768, 2048] + - [179, 3704.0] + - - [768, 256, 1, 2048, 768, 768, 768, 2048] + - [179, 4416.0] + - - [32, 33, 1600, 33, 32, 32, 32, 33] + - [138, 1943.0] + - - [512, 512, 1, 64, 512, 512, 512, 64] + - [153, 2261.0] + - - [1225, 32, 64, 192, 1225, 1225, 1225, 192] + - [179, 4948.0] + - - [1225, 48, 64, 192, 1225, 1225, 1225, 192] + - [126, 3796.0] + - - [1225, 48, 64, 256, 1225, 1225, 1225, 256] + - [166, 3858.0] + - - [1225, 48, 64, 288, 1225, 1225, 1225, 288] + - [126, 3820.0] + - - [49, 2048, 64, 512, 49, 49, 49, 512] + - [140, 4111.0] + - - [49, 512, 64, 2048, 49, 49, 49, 2048] + - [140, 4068.0] + - - [1225, 48, 32, 192, 1225, 1225, 1225, 192] + - [153, 3727.0] + - - [1225, 48, 32, 256, 1225, 1225, 1225, 256] + - [126, 3741.0] + - - [49, 2048, 32, 512, 49, 49, 49, 512] + - [140, 4086.0] + - - [49, 512, 32, 2048, 49, 49, 49, 2048] + - [140, 3919.0] + - - [384, 384, 1, 384, 384, 384, 384, 384] + - [193, 3852.0] + - - [100, 128, 18, 512, 100, 100, 100, 512] + - [153, 3368.0] + - - [100, 128, 19, 512, 100, 100, 100, 512] + - [141, 3478.0] + - - [1444, 128, 1, 576, 1444, 1444, 1444, 576] + - [153, 3710.0] + - - [361, 512, 1, 2304, 361, 361, 361, 2304] + - [179, 4111.0] + - - [480, 512, 1, 512, 480, 480, 480, 512] + - [153, 3725.0] + - - [512, 480, 1, 512, 512, 512, 512, 512] + - [140, 4222.0] + - - [1024, 308, 1, 1024, 1024, 1024, 1024, 1024] + - [193, 4496.0] + - - [1024, 180, 1, 1024, 1024, 1024, 1024, 1024] + - [179, 3959.0] + - - [64, 32, 4608, 32, 64, 64, 64, 32] + - [131, 4052.0] + - - [64, 34, 4736, 34, 64, 64, 64, 34] + - [140, 2716.0] + - - [64, 35, 4608, 32, 64, 64, 64, 32] + - [140, 2905.0] + - - [64, 35, 4608, 35, 64, 64, 64, 35] + - [140, 2766.0] + - - [256, 864, 1, 128, 256, 256, 256, 128] + - [140, 3419.0] + - - [49, 2048, 64, 1024, 49, 49, 49, 1024] + - [140, 4129.0] + - - [49, 1024, 64, 2048, 49, 49, 49, 2048] + - [140, 4117.0] + - - [49, 2048, 32, 1024, 49, 49, 49, 1024] + - [192, 4102.0] + - - [49, 1024, 32, 2048, 49, 49, 49, 2048] + - [166, 4069.0] + - - [3136, 64, 1, 576, 3136, 3136, 3136, 576] + - [153, 4198.0] + - - [784, 128, 1, 1152, 784, 784, 784, 1152] + - [153, 3768.0] + - - [49, 2048, 128, 512, 49, 49, 49, 512] + - [166, 4135.0] + - - [49, 2048, 256, 512, 49, 49, 49, 512] + - [166, 4142.0] + - - [49, 512, 128, 2048, 49, 49, 49, 2048] + - [166, 4116.0] + - - [49, 512, 256, 2048, 49, 49, 49, 2048] + - [192, 4144.0] + - - [1024, 128, 1, 2, 1024, 1024, 1024, 2] + - [152, 142.0] + - - [1024, 96, 1, 2, 1024, 1024, 1024, 2] + - [152, 117.0] + - - [1909283, 40, 1, 40, 1909283, 1909283, 1909283, 40] + - [180, 2809.0] + - - [3818566, 40, 1, 40, 3818566, 3818566, 3818566, 40] + - [184, 2787.0] + - - [2560, 35, 1, 29000, 2560, 2560, 2560, 29000] + - [126, 2599.0] + - - [2560, 36, 1, 29000, 2560, 2560, 2560, 29000] + - [126, 2680.0] + - - [2560, 39, 1, 29000, 2560, 2560, 2560, 29000] + - [126, 2910.0] + - - [2560, 40, 1, 29000, 2560, 2560, 2560, 29000] + - [126, 2986.0] + - - [2560, 42, 1, 29000, 2560, 2560, 2560, 29000] + - [126, 3135.0] + - - [2560, 43, 1, 29000, 2560, 2560, 2560, 29000] + - [126, 3208.0] + - - [2560, 44, 1, 29000, 2560, 2560, 2560, 29000] + - [126, 3286.0] + - - [2560, 46, 1, 29000, 2560, 2560, 2560, 29000] + - [126, 3441.0] + - - [2560, 48, 1, 29000, 2560, 2560, 2560, 29000] + - [126, 3586.0] + - - [2560, 49, 1, 29000, 2560, 2560, 2560, 29000] + - [126, 3662.0] + - - [2560, 50, 1, 29000, 2560, 2560, 2560, 29000] + - [126, 3736.0] + - - [2560, 51, 1, 29000, 2560, 2560, 2560, 29000] + - [140, 3803.0] + - - [2560, 53, 1, 29000, 2560, 2560, 2560, 29000] + - [140, 3936.0] + - - [2560, 54, 1, 29000, 2560, 2560, 2560, 29000] + - [140, 4023.0] + - - [2560, 55, 1, 29000, 2560, 2560, 2560, 29000] + - [126, 4082.0] + - - [2560, 56, 1, 29000, 2560, 2560, 2560, 29000] + - [126, 4174.0] + - - [2560, 57, 1, 29000, 2560, 2560, 2560, 29000] + - [126, 4255.0] + - - [2560, 58, 1, 29000, 2560, 2560, 2560, 29000] + - [126, 4344.0] + - - [2560, 59, 1, 29000, 2560, 2560, 2560, 29000] + - [126, 4421.0] + - - [2560, 61, 1, 29000, 2560, 2560, 2560, 29000] + - [140, 4518.0] + - - [2560, 63, 1, 29000, 2560, 2560, 2560, 29000] + - [126, 4686.0] + - - [1760, 32, 1, 1760, 1760, 1760, 1760, 1760] + - [218, 3039.0] + - - [3584, 4, 1, 1280, 3584, 3584, 3584, 1280] + - [212, 1069.0] + - - [2560, 16, 1, 2560, 2560, 2560, 2560, 2560] + - [229, 2908.0] + - - [2944, 4, 1, 256, 2944, 2944, 2944, 256] + - [201, 511.0] + - - [5056, 4, 1, 3328, 5056, 5056, 5056, 3328] + - [216, 1623.0] + - - [1760, 16, 1, 1760, 1760, 1760, 1760, 1760] + - [224, 2223.0] + - - [2368, 4, 1, 1280, 2368, 2368, 2368, 1280] + - [210, 763.0] + - - [6784, 4, 1, 1280, 6784, 6784, 6784, 1280] + - [227, 1617.0] + - - [8448, 4, 1, 2816, 8448, 8448, 8448, 2816] + - [216, 1214.0] + - - [1856, 4, 1, 1280, 1856, 1856, 1856, 1280] + - [237, 643.0] + - - [4608, 1, 1, 1536, 4608, 4608, 4608, 1536] + - [238, 302.0] + - - [7680, 4, 1, 2560, 7680, 7680, 7680, 2560] + - [234, 933.0] + - - [8448, 16, 1, 2816, 8448, 8448, 8448, 2816] + - [207, 3170.0] + - - [3072, 2, 1, 1024, 3072, 3072, 3072, 1024] + - [238, 450.0] + - - [2368, 4, 1, 256, 2368, 2368, 2368, 256] + - [204, 454.0] + - - [7680, 1, 1, 2560, 7680, 7680, 7680, 2560] + - [234, 229.0] + - - [4608, 2, 1, 1536, 4608, 4608, 4608, 1536] + - [213, 608.0] + - - [4608, 4, 1, 1536, 4608, 4608, 4608, 1536] + - [236, 1185.0] + - - [3072, 1, 1, 128, 3072, 3072, 3072, 128] + - [201, 108.0] + - - [2048, 32, 1, 2048, 2048, 2048, 2048, 2048] + - [235, 2841.0] + - - [4288, 4, 1, 256, 4288, 4288, 4288, 256] + - [227, 737.0] + - - [3584, 4, 1, 3328, 3584, 3584, 3584, 3328] + - [213, 1292.0] + - - [5888, 4, 1, 1280, 5888, 5888, 5888, 1280] + - [236, 1330.0] + - - [2048, 16, 1, 2048, 2048, 2048, 2048, 2048] + - [234, 2089.0] + - - [5888, 4, 1, 128, 5888, 5888, 5888, 128] + - [204, 471.0] + - - [8448, 1, 1, 2816, 8448, 8448, 8448, 2816] + - [241, 291.0] + - - [1408, 4, 1, 256, 1408, 1408, 1408, 256] + - [241, 286.0] + - - [6144, 4, 1, 2560, 6144, 6144, 6144, 2560] + - [236, 1675.0] + - - [3072, 1, 1, 1024, 3072, 3072, 3072, 1024] + - [220, 193.0] + - - [5056, 4, 1, 1280, 5056, 5056, 5056, 1280] + - [236, 1157.0] + - - [3072, 16, 1, 1024, 3072, 3072, 3072, 1024] + - [206, 2339.0] + - - [1408, 4, 1, 3328, 1408, 1408, 1408, 3328] + - [237, 505.0] + - - [6144, 1, 1, 2560, 6144, 6144, 6144, 2560] + - [205, 398.0] + - - [6144, 16, 1, 2560, 6144, 6144, 6144, 2560] + - [235, 3521.0] + - - [4096, 16, 1, 4096, 4096, 4096, 4096, 4096] + - [206, 2349.0] + - - [1408, 4, 1, 128, 1408, 1408, 1408, 128] + - [208, 125.0] + - - [1856, 4, 1, 256, 1856, 1856, 1856, 256] + - [228, 376.0] + - - [6784, 4, 1, 128, 6784, 6784, 6784, 128] + - [205, 531.0] + - - [2944, 4, 1, 128, 2944, 2944, 2944, 128] + - [219, 265.0] + - - [5888, 4, 1, 3328, 5888, 5888, 5888, 3328] + - [234, 809.0] + - - [5056, 4, 1, 128, 5056, 5056, 5056, 128] + - [231, 406.0] + - - [3072, 4, 1, 1024, 3072, 3072, 3072, 1024] + - [212, 768.0] + - - [2944, 4, 1, 3328, 2944, 2944, 2944, 3328] + - [225, 1059.0] + - - [2368, 4, 1, 128, 2368, 2368, 2368, 128] + - [233, 203.0] + - - [1856, 4, 1, 128, 1856, 1856, 1856, 128] + - [216, 165.0] + - - [7680, 2, 1, 2560, 7680, 7680, 7680, 2560] + - [234, 473.0] + - - [7680, 16, 1, 2560, 7680, 7680, 7680, 2560] + - [202, 1881.0] + - - [4224, 1, 1, 128, 4224, 4224, 4224, 128] + - [214, 85.0] + - - [8448, 2, 1, 2816, 8448, 8448, 8448, 2816] + - [227, 561.0] + - - [1408, 4, 1, 1280, 1408, 1408, 1408, 1280] + - [237, 491.0] + - - [6784, 4, 1, 256, 6784, 6784, 6784, 256] + - [216, 1069.0] + - - [4288, 4, 1, 128, 4288, 4288, 4288, 128] + - [216, 370.0] + - - [1856, 4, 1, 3328, 1856, 1856, 1856, 3328] + - [211, 662.0] + - - [3584, 4, 1, 256, 3584, 3584, 3584, 256] + - [216, 473.0] + - - [2368, 4, 1, 3328, 2368, 2368, 2368, 3328] + - [244, 898.0] + - - [6784, 4, 1, 3328, 6784, 6784, 6784, 3328] + - [228, 586.0] + - - [4288, 4, 1, 1280, 4288, 4288, 4288, 1280] + - [236, 1015.0] + - - [3584, 4, 1, 128, 3584, 3584, 3584, 128] + - [201, 466.0] + - - [5056, 4, 1, 256, 5056, 5056, 5056, 256] + - [233, 624.0] + - - [4288, 4, 1, 3328, 4288, 4288, 4288, 3328] + - [225, 1522.0] + - - [4608, 16, 1, 1536, 4608, 4608, 4608, 1536] + - [232, 3098.0] + - - [6144, 2, 1, 2560, 6144, 6144, 6144, 2560] + - [205, 844.0] + - - [2944, 4, 1, 1280, 2944, 2944, 2944, 1280] + - [213, 798.0] + - - [5888, 4, 1, 256, 5888, 5888, 5888, 256] + - [201, 903.0] + - - [4096, 29, 1, 2048, 4096, 4096, 4096, 2048] + - [223, 2989.0] + - - [4096, 25, 1, 2048, 4096, 4096, 4096, 2048] + - [223, 2583.0] + - - [4096, 29, 1, 3072, 4096, 4096, 4096, 3072] + - [242, 3039.0] + - - [4096, 24, 1, 2048, 4096, 4096, 4096, 2048] + - [242, 3477.0] + - - [36548, 1, 1, 1024, 36548, 36548, 36548, 1024] + - [231, 205.0] + - - [4096, 27, 1, 2048, 4096, 4096, 4096, 2048] + - [223, 2869.0] + - - [4096, 1, 1, 2048, 4096, 4096, 4096, 2048] + - [236, 352.0] + - - [4096, 24, 1, 3072, 4096, 4096, 4096, 3072] + - [223, 3545.0] + - - [4096, 27, 1, 3072, 4096, 4096, 4096, 3072] + - [223, 2832.0] + - - [36548, 25, 1, 1024, 36548, 36548, 36548, 1024] + - [232, 3145.0] + - - [4096, 1, 1, 3072, 4096, 4096, 4096, 3072] + - [225, 350.0] + - - [4096, 25, 1, 3072, 4096, 4096, 4096, 3072] + - [223, 2705.0] + - - [36548, 24, 1, 1024, 36548, 36548, 36548, 1024] + - [235, 3236.0] + - - [6272, 16, 1, 480, 6272, 6272, 6272, 480] + - [235, 3182.0] + - - [1568, 32, 1, 832, 1568, 1568, 1568, 832] + - [209, 2639.0] + - - [1568, 48, 1, 832, 1568, 1568, 1568, 832] + - [223, 3195.0] + - - [6272, 24, 1, 512, 6272, 6272, 6272, 512] + - [209, 3368.0] + - - [2048, 1, 1, 512, 2048, 2048, 2048, 512] + - [224, 132.0] + - - [2048, 2, 1, 2, 2048, 2048, 2048, 2] + - [157, 5.0] + - - [2048, 2, 1, 2048, 2048, 2048, 2048, 2048] + - [230, 372.0] + - - [2560, 4, 1, 2, 2560, 2560, 2560, 2] + - [178, 13.0] + - - [2560, 4, 1, 2560, 2560, 2560, 2560, 2560] + - [243, 953.0] + - - [12288, 12, 2, 256, 12288, 12288, 12288, 256] + - [207, 2651.0] + - - [12288, 3, 2, 256, 12288, 12288, 12288, 256] + - [213, 997.0] + - - [51520, 12, 2, 256, 51520, 51520, 51520, 256] + - [240, 3024.0] + - - [51520, 3, 2, 256, 51520, 51520, 51520, 256] + - [216, 1053.0] + - - [15200, 12, 2, 256, 15200, 15200, 15200, 256] + - [218, 2784.0] + - - [15200, 3, 2, 256, 15200, 15200, 15200, 256] + - [205, 1280.0] + - - [3456, 3, 2, 256, 3456, 3456, 3456, 256] + - [205, 785.0] + - - [13600, 12, 2, 256, 13600, 13600, 13600, 256] + - [203, 2692.0] + - - [12880, 3, 2, 256, 12880, 12880, 12880, 256] + - [239, 987.0] + - - [3400, 3, 2, 256, 3400, 3400, 3400, 256] + - [205, 569.0] + - - [12880, 12, 2, 256, 12880, 12880, 12880, 256] + - [203, 2640.0] + - - [13824, 12, 2, 256, 13824, 13824, 13824, 256] + - [222, 2722.0] + - - [13824, 3, 2, 256, 13824, 13824, 13824, 256] + - [233, 1003.0] + - - [13600, 3, 2, 256, 13600, 13600, 13600, 256] + - [215, 1047.0] + - - [3456, 12, 2, 256, 3456, 3456, 3456, 256] + - [207, 1643.0] + - - [3800, 3, 2, 256, 3800, 3800, 3800, 256] + - [221, 593.0] + - - [3400, 12, 2, 256, 3400, 3400, 3400, 256] + - [222, 1629.0] + - - [3800, 12, 2, 256, 3800, 3800, 3800, 256] + - [207, 1790.0] + - - [55296, 3, 2, 256, 55296, 55296, 55296, 256] + - [215, 758.0] + - - [3220, 3, 2, 256, 3220, 3220, 3220, 256] + - [205, 707.0] + - - [3072, 3, 2, 256, 3072, 3072, 3072, 256] + - [216, 822.0] + - - [3220, 12, 2, 256, 3220, 3220, 3220, 256] + - [222, 1959.0] + - - [3072, 12, 2, 256, 3072, 3072, 3072, 256] + - [235, 2215.0] + - - [54400, 3, 2, 256, 54400, 54400, 54400, 256] + - [239, 821.0] + - - [60800, 12, 2, 256, 60800, 60800, 60800, 256] + - [217, 2589.0] + - - [60800, 3, 2, 256, 60800, 60800, 60800, 256] + - [215, 613.0] + - - [1909283, 11, 1, 11, 1909283, 1909283, 1909283, 11] + - [226, 1139.0] + - - [3818566, 11, 1, 11, 3818566, 3818566, 3818566, 11] + - [226, 974.0] + - - [2048, 8, 1, 2, 2048, 2048, 2048, 2] + - [201, 20.0] + - - [2048, 8, 1, 2048, 2048, 2048, 2048, 2048] + - [211, 1450.0] + - - [2560, 2, 1, 2, 2560, 2560, 2560, 2] + - [201, 7.0] + - - [2560, 2, 1, 2560, 2560, 2560, 2560, 2560] + - [211, 471.0] + - - [2560, 27, 1, 29000, 2560, 2560, 2560, 29000] + - [209, 2929.0] + - - [4, 1856, 1, 3328, 4, 4, 4, 3328] + - [142, 491.0] + - - [35, 1500, 1, 2560, 35, 35, 35, 2560] + - [260, 1554.0] + - - [4, 2368, 1, 1280, 4, 4, 4, 1280] + - [137, 458.0] + - - [4, 3584, 1, 128, 4, 4, 4, 128] + - [252, 273.0] + - - [4, 1408, 1, 3328, 4, 4, 4, 3328] + - [169, 387.0] + - - [4, 6784, 1, 3328, 4, 4, 4, 3328] + - [163, 553.0] + - - [4, 4288, 1, 128, 4, 4, 4, 128] + - [252, 434.0] + - - [4, 6784, 1, 1280, 4, 4, 4, 1280] + - [259, 597.0] + - - [4, 5056, 1, 256, 4, 4, 4, 256] + - [247, 417.0] + - - [4, 2944, 1, 3328, 4, 4, 4, 3328] + - [163, 539.0] + - - [4, 5056, 1, 1280, 4, 4, 4, 1280] + - [247, 597.0] + - - [35, 1500, 1, 2048, 35, 35, 35, 2048] + - [248, 1536.0] + - - [4, 2368, 1, 3328, 4, 4, 4, 3328] + - [189, 528.0] + - - [4, 1856, 1, 256, 4, 4, 4, 256] + - [189, 249.0] + - - [4, 2944, 1, 256, 4, 4, 4, 256] + - [137, 320.0] + - - [4, 6784, 1, 128, 4, 4, 4, 128] + - [259, 370.0] + - - [4, 3584, 1, 1280, 4, 4, 4, 1280] + - [252, 547.0] + - - [4, 5888, 1, 256, 4, 4, 4, 256] + - [247, 423.0] + - - [4, 5888, 1, 3328, 4, 4, 4, 3328] + - [142, 436.0] + - - [4, 6784, 1, 256, 4, 4, 4, 256] + - [142, 406.0] + - - [4, 1408, 1, 1280, 4, 4, 4, 1280] + - [142, 339.0] + - - [4, 3584, 1, 256, 4, 4, 4, 256] + - [247, 358.0] + - - [4, 2944, 1, 1280, 4, 4, 4, 1280] + - [137, 489.0] + - - [4, 1408, 1, 256, 4, 4, 4, 256] + - [142, 189.0] + - - [4, 4288, 1, 3328, 4, 4, 4, 3328] + - [252, 600.0] + - - [4, 2368, 1, 128, 4, 4, 4, 128] + - [247, 191.0] + - - [4, 5888, 1, 1280, 4, 4, 4, 1280] + - [247, 592.0] + - - [4, 1856, 1, 1280, 4, 4, 4, 1280] + - [169, 416.0] + - - [4, 1856, 1, 128, 4, 4, 4, 128] + - [252, 164.0] + - - [4, 2944, 1, 128, 4, 4, 4, 128] + - [247, 226.0] + - - [4, 4288, 1, 1280, 4, 4, 4, 1280] + - [247, 559.0] + - - [4, 5056, 1, 3328, 4, 4, 4, 3328] + - [252, 652.0] + - - [4, 5056, 1, 128, 4, 4, 4, 128] + - [247, 488.0] + - - [4, 4288, 1, 256, 4, 4, 4, 256] + - [260, 461.0] + - - [4, 3584, 1, 3328, 4, 4, 4, 3328] + - [247, 614.0] + - - [4, 2368, 1, 256, 4, 4, 4, 256] + - [259, 286.0] + - - [4, 5888, 1, 128, 4, 4, 4, 128] + - [259, 354.0] + - - [4, 1408, 1, 128, 4, 4, 4, 128] + - [247, 122.0] + - - [16, 2000, 1, 2048, 16, 16, 16, 2048] + - [169, 1624.0] + - - [2, 2048, 1, 2000, 2, 2, 2, 2000] + - [251, 215.0] + - - [32, 2000, 1, 2048, 32, 32, 32, 2048] + - [254, 2273.0] + - - [10, 2000, 1, 1024, 10, 10, 10, 1024] + - [189, 961.0] + - - [2, 2000, 1, 100, 2, 2, 2, 100] + - [129, 70.0] + - - [10, 2000, 1, 512, 10, 10, 10, 512] + - [137, 824.0] + - - [32, 2000, 1, 500, 32, 32, 32, 500] + - [246, 2222.0] + - - [32, 2000, 1, 1024, 32, 32, 32, 1024] + - [139, 2202.0] + - - [4, 2048, 1, 500, 4, 4, 4, 500] + - [251, 376.0] + - - [16, 2000, 1, 500, 16, 16, 16, 500] + - [137, 1258.0] + - - [4, 2048, 1, 100, 4, 4, 4, 100] + - [123, 140.0] + - - [16, 2000, 1, 100, 16, 16, 16, 100] + - [137, 548.0] + - - [4, 2000, 1, 10, 4, 4, 4, 10] + - [245, 40.0] + - - [10, 2000, 1, 10, 10, 10, 10, 10] + - [143, 49.0] + - - [2, 2048, 1, 512, 2, 2, 2, 512] + - [252, 169.0] + - - [10, 2048, 1, 100, 10, 10, 10, 100] + - [137, 344.0] + - - [8, 2048, 1, 100, 8, 8, 8, 100] + - [129, 277.0] + - - [2, 2048, 1, 1024, 2, 2, 2, 1024] + - [247, 207.0] + - - [16, 2000, 1, 1024, 16, 16, 16, 1024] + - [189, 1533.0] + - - [10, 2000, 1, 2000, 10, 10, 10, 2000] + - [137, 1041.0] + - - [8, 2000, 1, 500, 8, 8, 8, 500] + - [137, 638.0] + - - [16, 2000, 1, 2000, 16, 16, 16, 2000] + - [189, 1668.0] + - - [10, 2048, 1, 2048, 10, 10, 10, 2048] + - [169, 1038.0] + - - [8, 2000, 1, 512, 8, 8, 8, 512] + - [247, 659.0] + - - [2, 2000, 1, 2048, 2, 2, 2, 2048] + - [247, 213.0] + - - [16, 2048, 1, 500, 16, 16, 16, 500] + - [123, 1296.0] + - - [8, 2048, 1, 1024, 8, 8, 8, 1024] + - [247, 822.0] + - - [2, 2000, 1, 500, 2, 2, 2, 500] + - [123, 157.0] + - - [32, 2048, 1, 100, 32, 32, 32, 100] + - [245, 1050.0] + - - [10, 2048, 1, 500, 10, 10, 10, 500] + - [137, 810.0] + - - [4, 2000, 1, 2048, 4, 4, 4, 2048] + - [137, 408.0] + - - [8, 2000, 1, 1024, 8, 8, 8, 1024] + - [252, 797.0] + - - [32, 2048, 1, 512, 32, 32, 32, 512] + - [250, 1965.0] + - - [32, 2048, 1, 1024, 32, 32, 32, 1024] + - [249, 2203.0] + - - [32, 2048, 1, 500, 32, 32, 32, 500] + - [256, 1981.0] + - - [10, 2048, 1, 1024, 10, 10, 10, 1024] + - [137, 986.0] + - - [8, 2048, 1, 2048, 8, 8, 8, 2048] + - [142, 830.0] + - - [16, 2048, 1, 2048, 16, 16, 16, 2048] + - [169, 1661.0] + - - [8, 2000, 1, 10, 8, 8, 8, 10] + - [129, 40.0] + - - [4, 2000, 1, 2000, 4, 4, 4, 2000] + - [251, 424.0] + - - [8, 2048, 1, 512, 8, 8, 8, 512] + - [247, 681.0] + - - [8, 2000, 1, 2048, 8, 8, 8, 2048] + - [142, 808.0] + - - [32, 2048, 1, 2000, 32, 32, 32, 2000] + - [246, 2394.0] + - - [16, 2000, 1, 10, 16, 16, 16, 10] + - [127, 80.0] + - - [8, 2048, 1, 2000, 8, 8, 8, 2000] + - [245, 856.0] + - - [4, 2048, 1, 2048, 4, 4, 4, 2048] + - [142, 416.0] + - - [10, 2048, 1, 2000, 10, 10, 10, 2000] + - [137, 1061.0] + - - [8, 2000, 1, 100, 8, 8, 8, 100] + - [163, 289.0] + - - [2, 2000, 1, 2000, 2, 2, 2, 2000] + - [176, 222.0] + - - [16, 2048, 1, 1024, 16, 16, 16, 1024] + - [189, 1583.0] + - - [32, 2000, 1, 2000, 32, 32, 32, 2000] + - [246, 2331.0] + - - [32, 2048, 1, 2048, 32, 32, 32, 2048] + - [261, 2390.0] + - - [2, 2048, 1, 10, 2, 2, 2, 10] + - [142, 21.0] + - - [4, 2048, 1, 512, 4, 4, 4, 512] + - [259, 335.0] + - - [4, 2048, 1, 10, 4, 4, 4, 10] + - [258, 20.0] + - - [16, 2048, 1, 100, 16, 16, 16, 100] + - [137, 559.0] + - - [4, 2000, 1, 500, 4, 4, 4, 500] + - [189, 317.0] + - - [10, 2000, 1, 500, 10, 10, 10, 500] + - [253, 873.0] + - - [32, 2000, 1, 512, 32, 32, 32, 512] + - [255, 2056.0] + - - [2, 2000, 1, 1024, 2, 2, 2, 1024] + - [259, 230.0] + - - [2, 2000, 1, 512, 2, 2, 2, 512] + - [247, 209.0] + - - [4, 2048, 1, 1024, 4, 4, 4, 1024] + - [247, 407.0] + - - [8, 2048, 1, 500, 8, 8, 8, 500] + - [189, 645.0] + - - [4, 2048, 1, 2000, 4, 4, 4, 2000] + - [163, 430.0] + - - [8, 2000, 1, 2000, 8, 8, 8, 2000] + - [176, 838.0] + - - [4, 2000, 1, 1024, 4, 4, 4, 1024] + - [252, 462.0] + - - [32, 2000, 1, 100, 32, 32, 32, 100] + - [246, 1584.0] + - - [2, 2048, 1, 100, 2, 2, 2, 100] + - [123, 122.0] + - - [8, 2048, 1, 10, 8, 8, 8, 10] + - [257, 41.0] + - - [2, 2048, 1, 2048, 2, 2, 2, 2048] + - [137, 218.0] + - - [10, 2000, 1, 2048, 10, 10, 10, 2048] + - [194, 1013.0] + - - [16, 2048, 1, 2000, 16, 16, 16, 2000] + - [189, 1708.0] + - - [10, 2048, 1, 512, 10, 10, 10, 512] + - [137, 824.0] + - - [16, 2048, 1, 512, 16, 16, 16, 512] + - [137, 1340.0] + - - [2, 2000, 1, 10, 2, 2, 2, 10] + - [123, 10.0] + - - [4, 2000, 1, 100, 4, 4, 4, 100] + - [137, 143.0] + - - [16, 2000, 1, 512, 16, 16, 16, 512] + - [137, 1307.0] + - - [32, 2048, 1, 10, 32, 32, 32, 10] + - [125, 356.0] + - - [10, 2048, 1, 10, 10, 10, 10, 10] + - [142, 118.0] + - - [4, 2000, 1, 512, 4, 4, 4, 512] + - [247, 417.0] + - - [16, 2048, 1, 10, 16, 16, 16, 10] + - [127, 207.0] + - - [32, 2000, 1, 10, 32, 32, 32, 10] + - [127, 372.0] + - - [10, 2000, 1, 100, 10, 10, 10, 100] + - [129, 565.0] + - - [2, 2048, 1, 500, 2, 2, 2, 500] + - [137, 202.0] + - - [1024, 1, 1, 500000, 1024, 1024, 1024, 500000] + - [262, 147.0] + - - [1024, 16, 1, 500000, 1024, 1024, 1024, 500000] + - [264, 2328.0] + - - [1024, 2, 1, 500000, 1024, 1024, 1024, 500000] + - [268, 295.0] + - - [512, 1, 1, 500000, 512, 512, 512, 500000] + - [269, 117.0] + - - [1024, 8, 1, 500000, 1024, 1024, 1024, 500000] + - [268, 1176.0] + - - [1024, 4, 1, 500000, 1024, 1024, 1024, 500000] + - [268, 584.0] + - - [512, 16, 1, 500000, 512, 512, 512, 500000] + - [263, 1825.0] + - - [512, 2, 1, 500000, 512, 512, 512, 500000] + - [265, 233.0] + - - [512, 8, 1, 500000, 512, 512, 512, 500000] + - [265, 916.0] + - - [512, 4, 1, 500000, 512, 512, 512, 500000] + - [269, 464.0] + - - [1024, 20, 1, 30522, 1024, 1024, 1024, 30522] + - [267, 2351.0] + - - [49, 512, 1, 4608, 49, 49, 49, 4608] + - [266, 2552.0] + - - [64, 512, 1, 1, 64, 64, 64, 1] + - [128, 18.0] + - - [1024, 32, 1, 2, 1024, 1024, 1024, 2] + - [132, 46.0] + - - [1024, 32, 1, 1024, 1024, 1024, 1024, 1024] + - [157, 1796.0] + - - [768, 32, 1, 768, 768, 768, 768, 768] + - [183, 1398.0] + - - [768, 32, 1, 2, 768, 768, 768, 2] + - [125, 13.0] + - - [768, 64, 1, 768, 768, 768, 768, 768] + - [152, 2070.0] + - - [768, 64, 1, 2, 768, 768, 768, 2] + - [152, 26.0] + - - [1024, 20, 1, 1024, 1024, 1024, 1024, 1024] + - [157, 1161.0] + - - [1024, 80, 1, 1024, 1024, 1024, 1024, 1024] + - [179, 2813.0] + - - [32, 200, 1, 1, 32, 32, 32, 1] + - [123, 2.0] + - - [1024, 4, 1, 1024, 1024, 1024, 1024, 1024] + - [188, 268.0] + - - [1024, 4, 1, 2, 1024, 1024, 1024, 2] + - [123, 2.0] + - - [768, 16, 1, 768, 768, 768, 768, 768] + - [199, 755.0] + - - [768, 16, 1, 2, 768, 768, 768, 2] + - [125, 7.0] + - - [768, 8, 1, 768, 768, 768, 768, 768] + - [161, 370.0] + - - [1024, 6, 1, 1024, 1024, 1024, 1024, 1024] + - [188, 404.0] + - - [1024, 6, 1, 2, 1024, 1024, 1024, 2] + - [123, 3.0] + - - [1024, 8, 1, 1024, 1024, 1024, 1024, 1024] + - [161, 531.0] + - - [4, 704, 1, 1280, 4, 4, 4, 1280] + - [169, 193.0] + - - [512, 4, 1, 512, 512, 512, 512, 512] + - [143, 109.0] + - - [64, 4, 1, 256, 64, 64, 64, 256] + - [129, 9.0] + - - [64, 704, 1, 128, 64, 64, 64, 128] + - [191, 930.0] + - - [448, 64, 1, 1280, 448, 448, 448, 1280] + - [183, 1833.0] + - - [128, 4, 1, 1280, 128, 128, 128, 1280] + - [130, 38.0] + - - [128, 256, 1, 256, 128, 128, 128, 256] + - [191, 1067.0] + - - [64, 1024, 1, 1280, 64, 64, 64, 1280] + - [166, 2962.0] + - - [64, 704, 1, 1280, 64, 64, 64, 1280] + - [190, 2128.0] + - - [64, 64, 1, 1280, 64, 64, 64, 1280] + - [188, 313.0] + - - [1024, 64, 1, 128, 1024, 1024, 1024, 128] + - [145, 1362.0] + - - [64, 1024, 1, 3328, 64, 64, 64, 3328] + - [146, 3008.0] + - - [128, 1, 1, 1408, 128, 128, 128, 1408] + - [130, 10.0] + - - [1024, 64, 1, 1280, 1024, 1024, 1024, 1280] + - [172, 2706.0] + - - [704, 4, 1, 1280, 704, 704, 704, 1280] + - [148, 199.0] + - - [64, 256, 1, 128, 64, 64, 64, 128] + - [142, 636.0] + - - [256, 256, 1, 3328, 256, 256, 256, 3328] + - [158, 3009.0] + - - [64, 1024, 1, 128, 64, 64, 64, 128] + - [192, 1263.0] + - - [128, 256, 1, 3328, 128, 128, 128, 3328] + - [171, 1883.0] + - - [64, 448, 1, 1280, 64, 64, 64, 1280] + - [191, 1980.0] + - - [448, 4, 1, 256, 448, 448, 448, 256] + - [159, 89.0] + - - [256, 4, 1, 1280, 256, 256, 256, 1280] + - [161, 77.0] + - - [512, 32, 1, 512, 512, 512, 512, 512] + - [130, 846.0] + - - [64, 64, 1, 3328, 64, 64, 64, 3328] + - [135, 383.0] + - - [512, 1, 1, 512, 512, 512, 512, 512] + - [130, 27.0] + - - [704, 64, 1, 3328, 704, 704, 704, 3328] + - [178, 2278.0] + - - [256, 4, 1, 256, 256, 256, 256, 256] + - [156, 54.0] + - - [256, 64, 1, 1280, 256, 256, 256, 1280] + - [135, 1309.0] + - - [1024, 4, 1, 256, 1024, 1024, 1024, 256] + - [167, 166.0] + - - [4, 704, 1, 256, 4, 4, 4, 256] + - [185, 110.0] + - - [704, 64, 1, 1280, 704, 704, 704, 1280] + - [178, 2087.0] + - - [128, 448, 1, 256, 128, 128, 128, 256] + - [192, 1642.0] + - - [128, 256, 1, 1280, 128, 128, 128, 1280] + - [184, 1794.0] + - - [448, 64, 1, 3328, 448, 448, 448, 3328] + - [143, 2182.0] + - - [256, 128, 1, 128, 256, 256, 256, 128] + - [127, 701.0] + - - [4, 448, 1, 128, 4, 4, 4, 128] + - [163, 41.0] + - - [64, 128, 1, 3328, 64, 64, 64, 3328] + - [188, 820.0] + - - [128, 128, 1, 3328, 128, 128, 128, 3328] + - [130, 1412.0] + - - [256, 128, 1, 256, 256, 256, 256, 256] + - [127, 1446.0] + - - [64, 1, 1, 1216, 64, 64, 64, 1216] + - [129, 5.0] + - - [1024, 4, 1, 3328, 1024, 1024, 1024, 3328] + - [135, 357.0] + - - [4, 4, 1, 256, 4, 4, 4, 256] + - [125, 1.0] + - - [256, 64, 1, 256, 256, 256, 256, 256] + - [130, 597.0] + - - [256, 128, 1, 1280, 256, 256, 256, 1280] + - [139, 1733.0] + - - [128, 64, 1, 1280, 128, 128, 128, 1280] + - [132, 632.0] + - - [4, 448, 1, 3328, 4, 4, 4, 3328] + - [157, 154.0] + - - [64, 1024, 1, 256, 64, 64, 64, 256] + - [192, 1824.0] + - - [64, 704, 1, 256, 64, 64, 64, 256] + - [165, 1323.0] + - - [704, 64, 1, 128, 704, 704, 704, 128] + - [125, 930.0] + - - [448, 4, 1, 1280, 448, 448, 448, 1280] + - [161, 134.0] + - - [1024, 2, 1, 512, 1024, 1024, 1024, 512] + - [148, 106.0] + - - [256, 64, 1, 3328, 256, 256, 256, 3328] + - [130, 1377.0] + - - [448, 128, 1, 256, 448, 448, 448, 256] + - [190, 1592.0] + - - [448, 64, 1, 128, 448, 448, 448, 128] + - [171, 642.0] + - - [4, 448, 1, 256, 4, 4, 4, 256] + - [135, 85.0] + - - [64, 704, 1, 3328, 64, 64, 64, 3328] + - [139, 2249.0] + - - [256, 256, 1, 256, 256, 256, 256, 256] + - [140, 1808.0] + - - [4, 1024, 1, 3328, 4, 4, 4, 3328] + - [142, 307.0] + - - [4, 704, 1, 128, 4, 4, 4, 128] + - [137, 63.0] + - - [64, 128, 1, 128, 64, 64, 64, 128] + - [165, 192.0] + - - [704, 4, 1, 128, 704, 704, 704, 128] + - [132, 62.0] + - - [64, 448, 1, 3328, 64, 64, 64, 3328] + - [157, 2117.0] + - - [448, 4, 1, 3328, 448, 448, 448, 3328] + - [135, 165.0] + - - [256, 4, 1, 3328, 256, 256, 256, 3328] + - [135, 96.0] + - - [4, 256, 1, 256, 4, 4, 4, 256] + - [139, 38.0] + - - [4, 64, 1, 1280, 4, 4, 4, 1280] + - [130, 19.0] + - - [4, 4, 1, 128, 4, 4, 4, 128] + - [197, 0.36] + - - [4, 128, 1, 256, 4, 4, 4, 256] + - [137, 19.0] + - - [448, 128, 1, 3328, 448, 448, 448, 3328] + - [126, 2729.0] + - - [64, 448, 1, 256, 64, 64, 64, 256] + - [159, 1176.0] + - - [64, 256, 1, 1280, 64, 64, 64, 1280] + - [130, 1228.0] + - - [1024, 32, 1, 512, 1024, 1024, 1024, 512] + - [183, 1415.0] + - - [64, 4, 1, 128, 64, 64, 64, 128] + - [129, 6.0] + - - [256, 64, 1, 128, 256, 256, 256, 128] + - [173, 394.0] + - - [64, 64, 1, 256, 64, 64, 64, 256] + - [195, 166.0] + - - [4, 704, 1, 3328, 4, 4, 4, 3328] + - [129, 222.0] + - - [4, 4, 1, 1280, 4, 4, 4, 1280] + - [123, 1.0] + - - [128, 128, 1, 128, 128, 128, 128, 128] + - [132, 388.0] + - - [1024, 4, 1, 128, 1024, 1024, 1024, 128] + - [157, 95.0] + - - [4, 64, 1, 128, 4, 4, 4, 128] + - [125, 6.0] + - - [64, 128, 1, 1280, 64, 64, 64, 1280] + - [199, 716.0] + - - [128, 128, 1, 1280, 128, 128, 128, 1280] + - [142, 1269.0] + - - [512, 2, 1, 512, 512, 512, 512, 512] + - [129, 56.0] + - - [64, 128, 1, 256, 64, 64, 64, 256] + - [195, 330.0] + - - [1024, 4, 1, 1280, 1024, 1024, 1024, 1280] + - [135, 289.0] + - - [35, 700, 1, 2048, 35, 35, 35, 2048] + - [142, 1243.0] + - - [704, 64, 1, 256, 704, 704, 704, 256] + - [168, 1442.0] + - - [128, 448, 1, 1280, 128, 128, 128, 1280] + - [190, 2667.0] + - - [128, 64, 1, 3328, 128, 128, 128, 3328] + - [188, 773.0] + - - [448, 64, 1, 256, 448, 448, 448, 256] + - [191, 1031.0] + - - [1024, 16, 1, 512, 1024, 1024, 1024, 512] + - [157, 832.0] + - - [4, 256, 1, 128, 4, 4, 4, 128] + - [137, 23.0] + - - [512, 16, 1, 512, 512, 512, 512, 512] + - [169, 441.0] + - - [1024, 64, 1, 256, 1024, 1024, 1024, 256] + - [126, 1678.0] + - - [4, 4, 1, 3328, 4, 4, 4, 3328] + - [123, 1.0] + - - [4, 1024, 1, 1280, 4, 4, 4, 1280] + - [194, 273.0] + - - [704, 4, 1, 256, 704, 704, 704, 256] + - [130, 102.0] + - - [128, 64, 1, 256, 128, 128, 128, 256] + - [195, 311.0] + - - [128, 4, 1, 3328, 128, 128, 128, 3328] + - [135, 48.0] + - - [128, 4, 1, 128, 128, 128, 128, 128] + - [123, 17.0] + - - [128, 1, 1, 1024, 128, 128, 128, 1024] + - [129, 9.0] + - - [4, 128, 1, 3328, 4, 4, 4, 3328] + - [135, 48.0] + - - [256, 256, 1, 128, 256, 256, 256, 128] + - [140, 1188.0] + - - [704, 4, 1, 3328, 704, 704, 704, 3328] + - [161, 233.0] + - - [448, 128, 1, 1280, 448, 448, 448, 1280] + - [138, 2502.0] + - - [1024, 64, 1, 3328, 1024, 1024, 1024, 3328] + - [131, 3086.0] + - - [256, 4, 1, 128, 256, 256, 256, 128] + - [125, 22.0] + - - [4, 1024, 1, 128, 4, 4, 4, 128] + - [189, 90.0] + - - [64, 256, 1, 3328, 64, 64, 64, 3328] + - [157, 1423.0] + - - [448, 128, 1, 128, 448, 448, 448, 128] + - [140, 1129.0] + - - [128, 256, 1, 128, 128, 128, 128, 128] + - [159, 711.0] + - - [128, 4, 1, 256, 128, 128, 128, 256] + - [142, 19.0] + - - [256, 256, 1, 1280, 256, 256, 256, 1280] + - [192, 2981.0] + - - [256, 128, 1, 3328, 256, 256, 256, 3328] + - [130, 1885.0] + - - [4, 448, 1, 1280, 4, 4, 4, 1280] + - [143, 131.0] + - - [448, 4, 1, 128, 448, 448, 448, 128] + - [137, 39.0] + - - [4, 256, 1, 3328, 4, 4, 4, 3328] + - [134, 94.0] + - - [4, 128, 1, 128, 4, 4, 4, 128] + - [123, 17.0] + - - [4, 256, 1, 1280, 4, 4, 4, 1280] + - [143, 78.0] + - - [64, 4, 1, 3328, 64, 64, 64, 3328] + - [134, 24.0] + - - [4, 64, 1, 3328, 4, 4, 4, 3328] + - [134, 24.0] + - - [35, 700, 1, 2560, 35, 35, 35, 2560] + - [129, 1264.0] + - - [4, 1024, 1, 256, 4, 4, 4, 256] + - [123, 149.0] + - - [64, 256, 1, 256, 64, 64, 64, 256] + - [139, 628.0] + - - [1024, 4, 1, 512, 1024, 1024, 1024, 512] + - [135, 208.0] + - - [4, 64, 1, 256, 4, 4, 4, 256] + - [123, 11.0] + - - [128, 448, 1, 128, 128, 128, 128, 128] + - [178, 1119.0] + - - [64, 448, 1, 128, 64, 64, 64, 128] + - [139, 644.0] + - - [128, 448, 1, 3328, 128, 128, 128, 3328] + - [126, 2723.0] + - - [4, 128, 1, 1280, 4, 4, 4, 1280] + - [134, 38.0] + - - [128, 64, 1, 128, 128, 128, 128, 128] + - [171, 187.0] + - - [64, 64, 1, 128, 64, 64, 64, 128] + - [183, 162.0] + - - [64, 4, 1, 1280, 64, 64, 64, 1280] + - [135, 20.0] + - - [1024, 1, 1, 512, 1024, 1024, 1024, 512] + - [129, 52.0] + - - [128, 128, 1, 256, 128, 128, 128, 256] + - [143, 620.0] + - - [64, 12, 5040, 12, 64, 64, 64, 12] + - [153, 1522.0] + - - [64, 17, 3632, 17, 64, 64, 64, 17] + - [140, 2299.0] + - - [64, 19, 3264, 19, 64, 64, 64, 19] + - [126, 2423.0] + - - [64, 9, 6544, 9, 64, 64, 64, 9] + - [140, 1067.0] + - - [64, 7, 8192, 7, 64, 64, 64, 7] + - [192, 767.0] + - - [64, 16, 3840, 16, 64, 64, 64, 16] + - [192, 2400.0] + - - [64, 8, 7280, 8, 64, 64, 64, 8] + - [166, 989.0] + - - [64, 27, 2336, 27, 64, 64, 64, 27] + - [192, 3923.0] + - - [64, 11, 5456, 11, 64, 64, 64, 11] + - [126, 1435.0] + - - [64, 21, 2976, 21, 64, 64, 64, 21] + - [126, 3063.0] + - - [64, 10, 5952, 10, 64, 64, 64, 10] + - [140, 1222.0] + - - [64, 14, 4368, 14, 64, 64, 64, 14] + - [179, 2058.0] + - - [64, 25, 2512, 25, 64, 64, 64, 25] + - [126, 3638.0] + - - [64, 13, 4672, 13, 64, 64, 64, 13] + - [126, 1827.0] + - - [64, 15, 4096, 15, 64, 64, 64, 15] + - [153, 2156.0] + - - [64, 29, 2176, 29, 64, 64, 64, 29] + - [179, 4259.0] + - - [64, 18, 3440, 18, 64, 64, 64, 18] + - [153, 2570.0] + - - [64, 23, 2720, 23, 64, 64, 64, 23] + - [153, 3426.0] + - - [8, 500, 1, 512, 8, 8, 8, 512] + - [195, 290.0] + - - [32, 512, 1, 512, 32, 32, 32, 512] + - [171, 1209.0] + - - [8, 512, 1, 500, 8, 8, 8, 500] + - [130, 212.0] + - - [8, 500, 1, 1024, 8, 8, 8, 1024] + - [171, 279.0] + - - [64, 1024, 1, 100, 64, 64, 64, 100] + - [125, 1047.0] + - - [64, 1024, 1, 500, 64, 64, 64, 500] + - [166, 2282.0] + - - [64, 1024, 1, 1024, 64, 64, 64, 1024] + - [166, 2941.0] + - - [2, 500, 1, 2048, 2, 2, 2, 2048] + - [195, 81.0] + - - [16, 512, 1, 10, 16, 16, 16, 10] + - [127, 21.0] + - - [8, 512, 1, 10, 8, 8, 8, 10] + - [123, 10.0] + - - [16, 500, 1, 2048, 16, 16, 16, 2048] + - [195, 636.0] + - - [10, 100, 1, 500, 10, 10, 10, 500] + - [129, 52.0] + - - [16, 100, 1, 10, 16, 16, 16, 10] + - [136, 7.0] + - - [2, 100, 1, 2000, 2, 2, 2, 2000] + - [134, 17.0] + - - [256, 100, 1, 2048, 256, 256, 256, 2048] + - [130, 1796.0] + - - [2, 512, 1, 512, 2, 2, 2, 512] + - [143, 54.0] + - - [2, 100, 1, 10, 2, 2, 2, 10] + - [132, 0.5] + - - [200, 100, 1, 100, 200, 200, 200, 100] + - [137, 364.0] + - - [500, 100, 1, 100, 500, 500, 500, 100] + - [178, 817.0] + - - [4, 100, 1, 10, 4, 4, 4, 10] + - [123, 1.0] + - - [32, 100, 1, 512, 32, 32, 32, 512] + - [142, 172.0] + - - [16, 1024, 1, 512, 16, 16, 16, 512] + - [169, 824.0] + - - [4, 1024, 1, 1024, 4, 4, 4, 1024] + - [169, 256.0] + - - [4, 512, 1, 10, 4, 4, 4, 10] + - [123, 5.0] + - - [128, 100, 1, 10, 128, 128, 128, 10] + - [132, 32.0] + - - [4, 512, 1, 2048, 4, 4, 4, 2048] + - [143, 164.0] + - - [10, 1024, 1, 2000, 10, 10, 10, 2000] + - [181, 712.0] + - - [256, 100, 1, 100, 256, 256, 256, 100] + - [152, 452.0] + - - [64, 1024, 1, 2048, 64, 64, 64, 2048] + - [172, 2874.0] + - - [16, 1024, 1, 100, 16, 16, 16, 100] + - [186, 297.0] + - - [32, 1024, 1, 1024, 32, 32, 32, 1024] + - [189, 1686.0] + - - [8, 100, 1, 500, 8, 8, 8, 500] + - [195, 42.0] + - - [10, 512, 1, 512, 10, 10, 10, 512] + - [171, 270.0] + - - [8, 500, 1, 10, 8, 8, 8, 10] + - [123, 10.0] + - - [16, 1024, 1, 10, 16, 16, 16, 10] + - [152, 40.0] + - - [16, 512, 1, 2048, 16, 16, 16, 2048] + - [169, 645.0] + - - [128, 512, 1, 2048, 128, 128, 128, 2048] + - [144, 2878.0] + - - [128, 512, 1, 100, 128, 128, 128, 100] + - [152, 1021.0] + - - [64, 500, 1, 2048, 64, 64, 64, 2048] + - [139, 1766.0] + - - [500, 100, 1, 10, 500, 500, 500, 10] + - [170, 210.0] + - - [64, 100, 1, 2048, 64, 64, 64, 2048] + - [135, 543.0] + - - [64, 100, 1, 10, 64, 64, 64, 10] + - [129, 17.0] + - - [16, 512, 1, 500, 16, 16, 16, 500] + - [157, 423.0] + - - [200, 100, 1, 2000, 200, 200, 200, 2000] + - [157, 1415.0] + - - [2, 100, 1, 512, 2, 2, 2, 512] + - [129, 11.0] + - - [32, 512, 1, 100, 32, 32, 32, 100] + - [189, 306.0] + - - [16, 512, 1, 1024, 16, 16, 16, 1024] + - [195, 570.0] + - - [4, 1024, 1, 512, 4, 4, 4, 512] + - [169, 208.0] + - - [2, 500, 1, 500, 2, 2, 2, 500] + - [143, 52.0] + - - [32, 100, 1, 100, 32, 32, 32, 100] + - [163, 60.0] + - - [100, 500, 1, 2000, 100, 100, 100, 2000] + - [126, 2222.0] + - - [10, 512, 1, 10, 10, 10, 10, 10] + - [125, 13.0] + - - [100, 500, 1, 2048, 100, 100, 100, 2048] + - [144, 2188.0] + - - [2, 100, 1, 1024, 2, 2, 2, 1024] + - [130, 14.0] + - - [32, 512, 1, 1024, 32, 32, 32, 1024] + - [143, 1138.0] + - - [256, 100, 1, 1024, 256, 256, 256, 1024] + - [130, 1570.0] + - - [128, 100, 1, 100, 128, 128, 128, 100] + - [189, 245.0] + - - [32, 512, 1, 10, 32, 32, 32, 10] + - [129, 41.0] + - - [128, 100, 1, 1024, 128, 128, 128, 1024] + - [195, 899.0] + - - [16, 500, 1, 2000, 16, 16, 16, 2000] + - [130, 613.0] + - - [64, 500, 1, 500, 64, 64, 64, 500] + - [157, 1322.0] + - - [128, 512, 1, 1024, 128, 128, 128, 1024] + - [140, 2933.0] + - - [128, 512, 1, 2000, 128, 128, 128, 2000] + - [126, 3020.0] + - - [2, 512, 1, 10, 2, 2, 2, 10] + - [127, 3.0] + - - [10, 512, 1, 500, 10, 10, 10, 500] + - [169, 324.0] + - - [4, 1024, 1, 2000, 4, 4, 4, 2000] + - [129, 287.0] + - - [256, 100, 1, 2000, 256, 256, 256, 2000] + - [130, 1796.0] + - - [100, 100, 1, 10, 100, 100, 100, 10] + - [132, 25.0] + - - [128, 512, 1, 10, 128, 128, 128, 10] + - [154, 161.0] + - - [256, 100, 1, 500, 256, 256, 256, 500] + - [157, 1250.0] + - - [64, 100, 1, 512, 64, 64, 64, 512] + - [183, 450.0] + - - [64, 512, 1, 500, 64, 64, 64, 500] + - [165, 1679.0] + - - [8, 100, 1, 512, 8, 8, 8, 512] + - [171, 57.0] + - - [32, 100, 1, 500, 32, 32, 32, 500] + - [157, 220.0] + - - [32, 500, 1, 2048, 32, 32, 32, 2048] + - [143, 1275.0] + - - [128, 500, 1, 2000, 128, 128, 128, 2000] + - [126, 2893.0] + - - [8, 1024, 1, 10, 8, 8, 8, 10] + - [132, 21.0] + - - [2, 500, 1, 100, 2, 2, 2, 100] + - [132, 19.0] + - - [10, 500, 1, 512, 10, 10, 10, 512] + - [171, 267.0] + - - [32, 500, 1, 500, 32, 32, 32, 500] + - [157, 826.0] + - - [100, 500, 1, 100, 100, 100, 100, 100] + - [125, 796.0] + - - [10, 1024, 1, 512, 10, 10, 10, 512] + - [194, 520.0] + - - [512, 100, 1, 512, 512, 512, 512, 512] + - [153, 1928.0] + - - [4, 500, 1, 500, 4, 4, 4, 500] + - [130, 104.0] + - - [64, 100, 1, 1024, 64, 64, 64, 1024] + - [187, 486.0] + - - [2, 500, 1, 2000, 2, 2, 2, 2000] + - [130, 84.0] + - - [32, 512, 1, 2048, 32, 32, 32, 2048] + - [195, 1412.0] + - - [10, 100, 1, 2000, 10, 10, 10, 2000] + - [160, 93.0] + - - [4, 100, 1, 512, 4, 4, 4, 512] + - [161, 27.0] + - - [2, 512, 1, 2048, 2, 2, 2, 2048] + - [142, 89.0] + - - [100, 100, 1, 2000, 100, 100, 100, 2000] + - [130, 850.0] + - - [10, 500, 1, 500, 10, 10, 10, 500] + - [157, 339.0] + - - [2, 100, 1, 2048, 2, 2, 2, 2048] + - [130, 17.0] + - - [32, 100, 1, 2048, 32, 32, 32, 2048] + - [188, 273.0] + - - [16, 100, 1, 1024, 16, 16, 16, 1024] + - [142, 130.0] + - - [2, 500, 1, 10, 2, 2, 2, 10] + - [135, 5.0] + - - [500, 100, 1, 2048, 500, 500, 500, 2048] + - [172, 2287.0] + - - [16, 1024, 1, 2000, 16, 16, 16, 2000] + - [129, 1160.0] + - - [10, 1024, 1, 1024, 10, 10, 10, 1024] + - [194, 642.0] + - - [500, 100, 1, 512, 500, 500, 500, 512] + - [166, 1775.0] + - - [32, 512, 1, 500, 32, 32, 32, 500] + - [195, 1116.0] + - - [100, 500, 1, 512, 100, 100, 100, 512] + - [184, 1942.0] + - - [8, 500, 1, 2000, 8, 8, 8, 2000] + - [157, 306.0] + - - [4, 100, 1, 1024, 4, 4, 4, 1024] + - [130, 28.0] + - - [2, 500, 1, 1024, 2, 2, 2, 1024] + - [195, 71.0] + - - [100, 500, 1, 1024, 100, 100, 100, 1024] + - [192, 2213.0] + - - [32, 100, 1, 1024, 32, 32, 32, 1024] + - [188, 266.0] + - - [64, 100, 1, 2000, 64, 64, 64, 2000] + - [188, 599.0] + - - [64, 500, 1, 10, 64, 64, 64, 10] + - [159, 174.0] + - - [64, 500, 1, 512, 64, 64, 64, 512] + - [165, 1754.0] + - - [10, 100, 1, 1024, 10, 10, 10, 1024] + - [148, 82.0] + - - [16, 512, 1, 100, 16, 16, 16, 100] + - [124, 175.0] + - - [4, 100, 1, 2000, 4, 4, 4, 2000] + - [135, 34.0] + - - [2, 512, 1, 1024, 2, 2, 2, 1024] + - [171, 71.0] + - - [64, 512, 1, 1024, 64, 64, 64, 1024] + - [191, 1731.0] + - - [512, 100, 1, 2048, 512, 512, 512, 2048] + - [184, 2254.0] + - - [32, 100, 1, 2000, 32, 32, 32, 2000] + - [135, 269.0] + - - [4, 512, 1, 500, 4, 4, 4, 500] + - [123, 106.0] + - - [4, 500, 1, 1024, 4, 4, 4, 1024] + - [143, 137.0] + - - [32, 100, 1, 10, 32, 32, 32, 10] + - [140, 13.0] + - - [10, 1024, 1, 2048, 10, 10, 10, 2048] + - [169, 793.0] + - - [8, 500, 1, 100, 8, 8, 8, 100] + - [125, 75.0] + - - [200, 100, 1, 1024, 200, 200, 200, 1024] + - [130, 1228.0] + - - [16, 100, 1, 100, 16, 16, 16, 100] + - [181, 30.0] + - - [8, 1024, 1, 2000, 8, 8, 8, 2000] + - [129, 571.0] + - - [4, 512, 1, 100, 4, 4, 4, 100] + - [191, 38.0] + - - [16, 500, 1, 100, 16, 16, 16, 100] + - [200, 185.0] + - - [8, 1024, 1, 2048, 8, 8, 8, 2048] + - [194, 586.0] + - - [16, 1024, 1, 2048, 16, 16, 16, 2048] + - [142, 1181.0] + - - [64, 512, 1, 100, 64, 64, 64, 100] + - [181, 602.0] + - - [2, 100, 1, 500, 2, 2, 2, 500] + - [129, 10.0] + - - [2, 500, 1, 512, 2, 2, 2, 512] + - [143, 54.0] + - - [128, 500, 1, 1024, 128, 128, 128, 1024] + - [140, 2840.0] + - - [10, 100, 1, 10, 10, 10, 10, 10] + - [130, 3.0] + - - [64, 1024, 1, 10, 64, 64, 64, 10] + - [154, 165.0] + - - [500, 100, 1, 500, 500, 500, 500, 500] + - [184, 1673.0] + - - [2, 512, 1, 100, 2, 2, 2, 100] + - [125, 19.0] + - - [16, 100, 1, 500, 16, 16, 16, 500] + - [161, 104.0] + - - [128, 100, 1, 500, 128, 128, 128, 500] + - [132, 814.0] + - - [512, 100, 1, 1024, 512, 512, 512, 1024] + - [192, 2248.0] + - - [16, 100, 1, 2000, 16, 16, 16, 2000] + - [143, 139.0] + - - [10, 512, 1, 100, 10, 10, 10, 100] + - [134, 129.0] + - - [8, 512, 1, 100, 8, 8, 8, 100] + - [162, 97.0] + - - [128, 100, 1, 2000, 128, 128, 128, 2000] + - [157, 1014.0] + - - [2, 1024, 1, 2000, 2, 2, 2, 2000] + - [169, 154.0] + - - [100, 512, 1, 512, 100, 100, 100, 512] + - [140, 1875.0] + - - [32, 1024, 1, 2000, 32, 32, 32, 2000] + - [130, 1799.0] + - - [128, 500, 1, 100, 128, 128, 128, 100] + - [165, 1013.0] + - - [100, 100, 1, 100, 100, 100, 100, 100] + - [169, 185.0] + - - [8, 512, 1, 1024, 8, 8, 8, 1024] + - [171, 280.0] + - - [200, 100, 1, 500, 200, 200, 200, 500] + - [183, 973.0] + - - [2, 1024, 1, 2048, 2, 2, 2, 2048] + - [169, 147.0] + - - [512, 100, 1, 2000, 512, 512, 512, 2000] + - [184, 2247.0] + - - [16, 512, 1, 2000, 16, 16, 16, 2000] + - [130, 625.0] + - - [64, 500, 1, 1024, 64, 64, 64, 1024] + - [191, 1674.0] + - - [10, 512, 1, 1024, 10, 10, 10, 1024] + - [143, 351.0] + - - [512, 100, 1, 100, 512, 512, 512, 100] + - [152, 842.0] + - - [8, 100, 1, 1024, 8, 8, 8, 1024] + - [135, 56.0] + - - [10, 100, 1, 100, 10, 10, 10, 100] + - [127, 18.0] + - - [10, 500, 1, 2000, 10, 10, 10, 2000] + - [173, 399.0] + - - [500, 100, 1, 2000, 500, 500, 500, 2000] + - [144, 2296.0] + - - [100, 512, 1, 2000, 100, 100, 100, 2000] + - [179, 2402.0] + - - [64, 1024, 1, 512, 64, 64, 64, 512] + - [172, 2601.0] + - - [32, 500, 1, 100, 32, 32, 32, 100] + - [157, 286.0] + - - [10, 100, 1, 2048, 10, 10, 10, 2048] + - [175, 85.0] + - - [64, 100, 1, 100, 64, 64, 64, 100] + - [137, 116.0] + - - [2, 1024, 1, 100, 2, 2, 2, 100] + - [127, 37.0] + - - [64, 500, 1, 2000, 64, 64, 64, 2000] + - [165, 1788.0] + - - [8, 512, 1, 512, 8, 8, 8, 512] + - [195, 291.0] + - - [8, 512, 1, 2048, 8, 8, 8, 2048] + - [143, 354.0] + - - [100, 100, 1, 1024, 100, 100, 100, 1024] + - [195, 842.0] + - - [8, 100, 1, 2000, 8, 8, 8, 2000] + - [160, 75.0] + - - [2, 1024, 1, 1024, 2, 2, 2, 1024] + - [137, 143.0] + - - [16, 512, 1, 512, 16, 16, 16, 512] + - [169, 442.0] + - - [32, 500, 1, 512, 32, 32, 32, 512] + - [171, 838.0] + - - [32, 500, 1, 1024, 32, 32, 32, 1024] + - [195, 1092.0] + - - [32, 500, 1, 10, 32, 32, 32, 10] + - [125, 42.0] + - - [4, 1024, 1, 500, 4, 4, 4, 500] + - [169, 198.0] + - - [256, 100, 1, 512, 256, 256, 256, 512] + - [171, 1263.0] + - - [8, 1024, 1, 500, 8, 8, 8, 500] + - [129, 399.0] + - - [4, 1024, 1, 100, 4, 4, 4, 100] + - [137, 74.0] + - - [100, 500, 1, 500, 100, 100, 100, 500] + - [192, 1751.0] + - - [2, 1024, 1, 500, 2, 2, 2, 500] + - [129, 131.0] + - - [64, 100, 1, 500, 64, 64, 64, 500] + - [195, 344.0] + - - [2, 512, 1, 500, 2, 2, 2, 500] + - [130, 52.0] + - - [10, 1024, 1, 500, 10, 10, 10, 500] + - [194, 501.0] + - - [128, 500, 1, 512, 128, 128, 128, 512] + - [138, 2588.0] + - - [10, 500, 1, 2048, 10, 10, 10, 2048] + - [195, 441.0] + - - [128, 512, 1, 512, 128, 128, 128, 512] + - [164, 2693.0] + - - [64, 512, 1, 10, 64, 64, 64, 10] + - [145, 184.0] + - - [32, 500, 1, 2000, 32, 32, 32, 2000] + - [161, 1330.0] + - - [100, 100, 1, 2048, 100, 100, 100, 2048] + - [171, 885.0] + - - [200, 100, 1, 512, 200, 200, 200, 512] + - [163, 1286.0] + - - [200, 100, 1, 2048, 200, 200, 200, 2048] + - [183, 1527.0] + - - [8, 100, 1, 10, 8, 8, 8, 10] + - [194, 5.0] + - - [100, 100, 1, 500, 100, 100, 100, 500] + - [171, 706.0] + - - [100, 500, 1, 10, 100, 100, 100, 10] + - [127, 269.0] + - - [10, 500, 1, 1024, 10, 10, 10, 1024] + - [195, 413.0] + - - [256, 100, 1, 10, 256, 256, 256, 10] + - [178, 142.0] + - - [10, 512, 1, 2048, 10, 10, 10, 2048] + - [195, 452.0] + - - [2, 1024, 1, 512, 2, 2, 2, 512] + - [142, 136.0] + - - [4, 500, 1, 2048, 4, 4, 4, 2048] + - [143, 177.0] + - - [100, 512, 1, 100, 100, 100, 100, 100] + - [176, 1320.0] + - - [16, 500, 1, 512, 16, 16, 16, 512] + - [195, 571.0] + - - [10, 1024, 1, 100, 10, 10, 10, 100] + - [157, 308.0] + - - [8, 1024, 1, 100, 8, 8, 8, 100] + - [137, 253.0] + - - [64, 1024, 1, 2000, 64, 64, 64, 2000] + - [184, 3038.0] + - - [10, 100, 1, 512, 10, 10, 10, 512] + - [129, 53.0] + - - [4, 500, 1, 2000, 4, 4, 4, 2000] + - [130, 152.0] + - - [4, 100, 1, 100, 4, 4, 4, 100] + - [123, 7.0] + - - [32, 1024, 1, 512, 32, 32, 32, 512] + - [139, 1415.0] + - - [8, 512, 1, 2000, 8, 8, 8, 2000] + - [157, 312.0] + - - [100, 100, 1, 512, 100, 100, 100, 512] + - [195, 540.0] + - - [2, 512, 1, 2000, 2, 2, 2, 2000] + - [130, 78.0] + - - [16, 500, 1, 10, 16, 16, 16, 10] + - [132, 44.0] + - - [10, 500, 1, 100, 10, 10, 10, 100] + - [152, 150.0] + - - [4, 100, 1, 500, 4, 4, 4, 500] + - [129, 21.0] + - - [64, 500, 1, 100, 64, 64, 64, 100] + - [189, 576.0] + - - [2, 100, 1, 100, 2, 2, 2, 100] + - [123, 4.0] + - - [10, 512, 1, 2000, 10, 10, 10, 2000] + - [199, 424.0] + - - [8, 500, 1, 500, 8, 8, 8, 500] + - [130, 270.0] + - - [4, 500, 1, 512, 4, 4, 4, 512] + - [195, 143.0] + - - [10, 500, 1, 10, 10, 10, 10, 10] + - [127, 29.0] + - - [64, 512, 1, 2000, 64, 64, 64, 2000] + - [139, 1960.0] + - - [32, 512, 1, 2000, 32, 32, 32, 2000] + - [195, 1406.0] + - - [128, 500, 1, 2048, 128, 128, 128, 2048] + - [140, 3010.0] + - - [4, 512, 1, 512, 4, 4, 4, 512] + - [195, 147.0] + - - [16, 500, 1, 1024, 16, 16, 16, 1024] + - [195, 664.0] + - - [10, 1024, 1, 10, 10, 10, 10, 10] + - [129, 62.0] + - - [16, 500, 1, 500, 16, 16, 16, 500] + - [195, 410.0] + - - [500, 100, 1, 1024, 500, 500, 500, 1024] + - [166, 2445.0] + - - [16, 100, 1, 512, 16, 16, 16, 512] + - [143, 86.0] + - - [64, 512, 1, 2048, 64, 64, 64, 2048] + - [184, 1821.0] + - - [32, 1024, 1, 10, 32, 32, 32, 10] + - [129, 191.0] + - - [8, 1024, 1, 512, 8, 8, 8, 512] + - [142, 421.0] + - - [4, 1024, 1, 2048, 4, 4, 4, 2048] + - [169, 295.0] + - - [128, 500, 1, 500, 128, 128, 128, 500] + - [140, 2689.0] + - - [100, 512, 1, 1024, 100, 100, 100, 1024] + - [140, 2624.0] + - - [16, 1024, 1, 500, 16, 16, 16, 500] + - [129, 1058.0] + - - [128, 100, 1, 2048, 128, 128, 128, 2048] + - [157, 1149.0] + - - [100, 512, 1, 500, 100, 100, 100, 500] + - [192, 1798.0] + - - [8, 1024, 1, 1024, 8, 8, 8, 1024] + - [142, 519.0] + - - [4, 500, 1, 10, 4, 4, 4, 10] + - [161, 11.0] + - - [128, 500, 1, 10, 128, 128, 128, 10] + - [132, 352.0] + - - [32, 1024, 1, 100, 32, 32, 32, 100] + - [139, 596.0] + - - [8, 500, 1, 2048, 8, 8, 8, 2048] + - [143, 321.0] + - - [16, 1024, 1, 1024, 16, 16, 16, 1024] + - [169, 1029.0] + - - [200, 100, 1, 10, 200, 200, 200, 10] + - [139, 105.0] + - - [512, 100, 1, 500, 512, 512, 512, 500] + - [153, 2000.0] + - - [4, 500, 1, 100, 4, 4, 4, 100] + - [157, 63.0] + - - [8, 100, 1, 2048, 8, 8, 8, 2048] + - [134, 74.0] + - - [512, 100, 1, 10, 512, 512, 512, 10] + - [127, 122.0] + - - [4, 512, 1, 1024, 4, 4, 4, 1024] + - [143, 140.0] + - - [32, 1024, 1, 2048, 32, 32, 32, 2048] + - [195, 1923.0] + - - [128, 100, 1, 512, 128, 128, 128, 512] + - [191, 840.0] + - - [32, 1024, 1, 500, 32, 32, 32, 500] + - [183, 1679.0] + - - [4, 1024, 1, 10, 4, 4, 4, 10] + - [187, 23.0] + - - [100, 512, 1, 10, 100, 100, 100, 10] + - [152, 267.0] + - - [8, 100, 1, 100, 8, 8, 8, 100] + - [195, 24.0] + - - [128, 512, 1, 500, 128, 128, 128, 500] + - [177, 2601.0] + - - [16, 100, 1, 2048, 16, 16, 16, 2048] + - [134, 149.0] + - - [2, 1024, 1, 10, 2, 2, 2, 10] + - [123, 5.0] + - - [4, 100, 1, 2048, 4, 4, 4, 2048] + - [134, 34.0] + - - [4, 512, 1, 2000, 4, 4, 4, 2000] + - [130, 156.0] + - - [1024, 29, 1, 1024, 1024, 1024, 1024, 1024] + - [195, 1475.0] + - - [1024, 1, 1, 21, 1024, 1024, 1024, 21] + - [147, 10.0] + - - [1024, 49, 1, 1024, 1024, 1024, 1024, 1024] + - [184, 2002.0] + - - [1024, 35, 1, 1024, 1024, 1024, 1024, 1024] + - [152, 1748.0] + - - [1024, 24, 1, 1024, 1024, 1024, 1024, 1024] + - [130, 1224.0] + - - [1024, 21, 1, 1024, 1024, 1024, 1024, 1024] + - [195, 1080.0] + - - [1024, 1, 1, 14, 1024, 1024, 1024, 14] + - [194, 8.0] + - - [1024, 91, 1, 1024, 1024, 1024, 1024, 1024] + - [126, 3435.0] + - - [1024, 14, 1, 1024, 1024, 1024, 1024, 1024] + - [135, 928.0] + - - [1024, 25, 1, 1024, 1024, 1024, 1024, 1024] + - [125, 1285.0] + - - [1024, 27, 1, 1024, 1024, 1024, 1024, 1024] + - [130, 1392.0] + - - [1024, 50, 1, 1024, 1024, 1024, 1024, 1024] + - [133, 2165.0] + - - [1024, 64, 1, 1024, 1024, 1024, 1024, 1024] + - [198, 2817.0] + - - [1024, 13, 1, 1024, 1024, 1024, 1024, 1024] + - [130, 996.0] + - - [1024, 63, 1, 1024, 1024, 1024, 1024, 1024] + - [186, 2721.0] + - - [1024, 86, 1, 1024, 1024, 1024, 1024, 1024] + - [140, 3110.0] + - - [1024, 1, 1, 13, 1024, 1024, 1024, 13] + - [157, 7.0] + - - [289, 192, 1, 1344, 289, 289, 289, 1344] + - [192, 2460.0] + - - [196, 128, 1, 800, 196, 196, 196, 800] + - [191, 1611.0] + - - [64, 512, 1, 1344, 64, 64, 64, 1344] + - [165, 1989.0] + - - [289, 224, 1, 1568, 289, 289, 289, 1568] + - [126, 2989.0] + - - [64, 256, 1, 1536, 64, 64, 64, 1536] + - [183, 1245.0] + - - [289, 160, 1, 1120, 289, 289, 289, 1120] + - [125, 2098.0] + - - [64, 256, 1, 1152, 64, 64, 64, 1152] + - [183, 1180.0] + - - [289, 224, 1, 1344, 289, 289, 289, 1344] + - [192, 2929.0] + - - [289, 192, 1, 896, 289, 289, 289, 896] + - [140, 2343.0] + - - [784, 16, 32, 192, 784, 784, 784, 192] + - [191, 2581.0] + - - [49, 128, 1, 1200, 49, 49, 49, 1200] + - [129, 484.0] + - - [289, 128, 1, 896, 289, 289, 289, 896] + - [150, 1901.0] + - - [1001, 32, 1, 1024, 1001, 1001, 1001, 1024] + - [169, 1622.0] + - - [64, 448, 1, 1152, 64, 64, 64, 1152] + - [157, 1849.0] + - - [1001, 32, 1, 2048, 1001, 1001, 1001, 2048] + - [158, 1815.0] + - - [289, 192, 1, 1120, 289, 289, 289, 1120] + - [192, 2464.0] + - - [64, 320, 1, 1728, 64, 64, 64, 1728] + - [129, 1593.0] + - - [289, 96, 1, 864, 289, 289, 289, 864] + - [163, 1662.0] + - - [196, 64, 1, 800, 196, 196, 196, 800] + - [195, 974.0] + - - [784, 32, 1, 400, 784, 784, 784, 400] + - [137, 1236.0] + - - [64, 320, 1, 2880, 64, 64, 64, 2880] + - [157, 1673.0] + - - [1001, 32, 1, 1536, 1001, 1001, 1001, 1536] + - [130, 1693.0] + - - [64, 384, 1, 1152, 64, 64, 64, 1152] + - [171, 1580.0] + - - [64, 192, 1, 1728, 64, 64, 64, 1728] + - [171, 972.0] + - - [1001, 64, 1, 1536, 1001, 1001, 1001, 1536] + - [144, 2699.0] + - - [1001, 64, 1, 2048, 1001, 1001, 1001, 2048] + - [166, 2809.0] + - - [1024, 64, 1, 4096, 1024, 1024, 1024, 4096] + - [146, 3132.0] + - - [64, 10, 448, 10, 64, 64, 64, 10] + - [144, 525.0] + - - [64, 18, 648, 18, 64, 64, 64, 18] + - [140, 1256.0] + - - [64, 18, 1720, 18, 64, 64, 64, 18] + - [153, 1799.0] + - - [64, 19, 1632, 19, 64, 64, 64, 19] + - [153, 1972.0] + - - [64, 21, 1472, 21, 64, 64, 64, 21] + - [126, 2205.0] + - - [64, 23, 64, 23, 64, 64, 64, 23] + - [129, 732.0] + - - [64, 26, 56, 26, 64, 64, 64, 26] + - [189, 503.0] + - - [1024, 1, 1, 2, 1024, 1024, 1024, 2] + - [123, 1.0] + - - [1024, 1, 1, 1024, 1024, 1024, 1024, 1024] + - [142, 74.0] + - - [64, 27, 56, 26, 64, 64, 64, 26] + - [127, 925.0] + - - [64, 17, 1, 17, 64, 64, 64, 17] + - [174, 9.0] + - - [64, 30, 1, 30, 64, 64, 64, 30] + - [123, 26.0] + - - [64, 31, 1, 30, 64, 64, 64, 30] + - [146, 18.0] + - - [64, 31, 1, 31, 64, 64, 64, 31] + - [127, 14.0] + - - [64, 14, 1, 14, 64, 64, 64, 14] + - [123, 3.0] + - - [64, 14, 1, 15, 64, 64, 64, 15] + - [123, 3.0] + - - [64, 15, 1, 15, 64, 64, 64, 15] + - [156, 4.0] + - - [64, 15, 1, 17, 64, 64, 64, 17] + - [150, 9.0] + - - [100, 512, 1, 2048, 100, 100, 100, 2048] + - [144, 2219.0] + - - [1024, 1, 1, 1600, 1024, 1024, 1024, 1600] + - [188, 84.0] + - - [1024, 1, 1, 200, 1024, 1024, 1024, 200] + - [195, 49.0] + - - [1, 200, 1, 1, 1, 1, 1, 1] + - [152, 0.13] + - - [1, 512, 1, 1, 1, 1, 1, 1] + - [154, 0.33] + - - [67, 512, 1, 2048, 67, 67, 67, 2048] + - [189, 1813.0] + - - [74, 512, 1, 2048, 74, 74, 74, 2048] + - [189, 1862.0] + - - [64, 3, 512, 3, 64, 64, 64, 3] + - [123, 54.0] + - - [64, 5, 512, 5, 64, 64, 64, 5] + - [192, 146.0] + - - [64, 9, 512, 9, 64, 64, 64, 9] + - [179, 421.0] + - - [64, 512, 1, 512, 64, 64, 64, 512] + - [191, 1446.0] + - - [25, 128, 120, 256, 25, 25, 25, 256] + - [164, 3061.0] + - - [25, 128, 139, 256, 25, 25, 25, 256] + - [164, 2817.0] + - - [25, 128, 160, 256, 25, 25, 25, 256] + - [190, 3143.0] + - - [25, 128, 18, 256, 25, 25, 25, 256] + - [138, 1404.0] + - - [25, 128, 19, 256, 25, 25, 25, 256] + - [164, 1494.0] + - - [9, 128, 120, 256, 9, 9, 9, 256] + - [190, 1115.0] + - - [9, 128, 139, 256, 9, 9, 9, 256] + - [190, 1110.0] + - - [9, 128, 160, 256, 9, 9, 9, 256] + - [142, 1154.0] + - - [9, 128, 18, 256, 9, 9, 9, 256] + - [189, 635.0] + - - [9, 128, 19, 256, 9, 9, 9, 256] + - [137, 655.0] + - - [1, 256, 1, 1152, 1, 1, 1, 1152] + - [161, 22.0] + - - [100, 512, 1, 2304, 100, 100, 100, 2304] + - [126, 2305.0] + - - [25, 256, 1, 1152, 25, 25, 25, 1152] + - [194, 513.0] + - - [9, 256, 1, 1152, 9, 9, 9, 1152] + - [148, 193.0] + - - [1024, 77, 1, 1024, 1024, 1024, 1024, 1024] + - [153, 2682.0] + - - [1024, 10, 1, 2, 1024, 1024, 1024, 2] + - [123, 5.0] + - - [1024, 10, 1, 1024, 1024, 1024, 1024, 1024] + - [161, 667.0] + - - [1024, 39, 1, 2, 1024, 1024, 1024, 2] + - [157, 21.0] + - - [1024, 39, 1, 1024, 1024, 1024, 1024, 1024] + - [152, 1749.0] + - - [1024, 40, 1, 2, 1024, 1024, 1024, 2] + - [182, 28.0] + - - [1024, 40, 1, 1024, 1024, 1024, 1024, 1024] + - [152, 1799.0] + - - [1024, 41, 1, 2, 1024, 1024, 1024, 2] + - [152, 23.0] + - - [1024, 41, 1, 1024, 1024, 1024, 1024, 1024] + - [165, 1850.0] + - - [1024, 5, 1, 2, 1024, 1024, 1024, 2] + - [123, 3.0] + - - [1024, 5, 1, 1024, 1024, 1024, 1024, 1024] + - [135, 339.0] + - - [1024, 8, 1, 2, 1024, 1024, 1024, 2] + - [178, 5.0] + - - [1024, 9, 1, 2, 1024, 1024, 1024, 2] + - [123, 5.0] + - - [1024, 9, 1, 1024, 1024, 1024, 1024, 1024] + - [188, 607.0] + - - [64, 4, 32768, 4, 64, 64, 64, 4] + - [179, 347.0] + - - [64, 4, 38400, 4, 64, 64, 64, 4] + - [153, 330.0] + - - [64, 14, 10880, 14, 64, 64, 64, 14] + - [140, 2147.0] + - - [64, 14, 10880, 15, 64, 64, 64, 15] + - [166, 2097.0] + - - [64, 15, 7680, 15, 64, 64, 64, 15] + - [152, 2447.0] + - - [64, 15, 10880, 15, 64, 64, 64, 15] + - [166, 2208.0] + - - [64, 15, 7680, 17, 64, 64, 64, 17] + - [152, 2358.0] + - - [64, 17, 6144, 17, 64, 64, 64, 17] + - [140, 2506.0] + - - [64, 17, 7680, 17, 64, 64, 64, 17] + - [126, 2330.0] + - - [64, 17, 6144, 21, 64, 64, 64, 21] + - [153, 2693.0] + - - [64, 21, 6144, 21, 64, 64, 64, 21] + - [153, 3230.0] + - - [64, 24, 4736, 24, 64, 64, 64, 24] + - [166, 3840.0] + - - [64, 24, 4736, 34, 64, 64, 64, 34] + - [192, 3396.0] + - - [64, 30, 2048, 30, 64, 64, 64, 30] + - [140, 4458.0] + - - [64, 31, 2048, 30, 64, 64, 64, 30] + - [158, 4104.0] + - - [64, 31, 2048, 31, 64, 64, 64, 31] + - [153, 4764.0] + - - [128, 128, 1, 64, 128, 128, 128, 64] + - [137, 441.0] + - - [64, 5, 1, 5, 64, 64, 64, 5] + - [123, 1.0] + - - [32, 33, 1, 33, 32, 32, 32, 33] + - [123, 16.0] + - - [64, 5, 960, 5, 64, 64, 64, 5] + - [138, 197.0] + - - [74, 960, 1, 2048, 74, 74, 74, 2048] + - [140, 2583.0] + - - [128, 27, 32768, 27, 128, 128, 128, 27] + - [136, 1922.0] + - - [1024, 16, 1, 1024, 1024, 1024, 1024, 1024] + - [171, 1254.0] + - - [1024, 16, 1, 2, 1024, 1024, 1024, 2] + - [127, 22.0] + - - [1024, 64, 1, 2, 1024, 1024, 1024, 2] + - [152, 85.0] + - - [1024, 80, 1, 2, 1024, 1024, 1024, 2] + - [152, 104.0] + - - [1024, 82, 1, 1024, 1024, 1024, 1024, 1024] + - [126, 3136.0] + - - [1024, 82, 1, 2, 1024, 1024, 1024, 2] + - [152, 108.0] + - - [1024, 12, 1, 1024, 1024, 1024, 1024, 1024] + - [175, 946.0] + - - [1024, 12, 1, 2, 1024, 1024, 1024, 2] + - [129, 16.0] + - - [64, 24, 6816, 24, 64, 64, 64, 24] + - [179, 3612.0] + - - [64, 26, 6272, 26, 64, 64, 64, 26] + - [126, 3670.0] + - - [196, 256, 1, 2304, 196, 196, 196, 2304] + - [126, 2474.0] + - - [850, 3, 2, 256, 850, 850, 850, 256] + - [157, 265.0] + - - [850, 12, 2, 256, 850, 850, 850, 256] + - [157, 1049.0] + - - [805, 12, 2, 256, 805, 805, 805, 256] + - [157, 997.0] + - - [805, 3, 2, 256, 805, 805, 805, 256] + - [183, 249.0] + - - [768, 3, 2, 256, 768, 768, 768, 256] + - [130, 233.0] + - - [768, 12, 2, 256, 768, 768, 768, 256] + - [195, 948.0] + - - [864, 12, 2, 256, 864, 864, 864, 256] + - [181, 976.0] + - - [864, 3, 2, 256, 864, 864, 864, 256] + - [157, 271.0] + - - [247, 3, 2, 256, 247, 247, 247, 256] + - [142, 81.0] + - - [216, 3, 2, 256, 216, 216, 216, 256] + - [157, 71.0] + - - [950, 3, 2, 256, 950, 950, 950, 256] + - [143, 266.0] + - - [187, 12, 2, 256, 187, 187, 187, 256] + - [188, 218.0] + - - [176, 12, 2, 256, 176, 176, 176, 256] + - [132, 210.0] + - - [247, 12, 2, 256, 247, 247, 247, 256] + - [197, 294.0] + - - [187, 3, 2, 256, 187, 187, 187, 256] + - [142, 61.0] + - - [228, 12, 2, 256, 228, 228, 228, 256] + - [160, 262.0] + - - [221, 12, 2, 256, 221, 221, 221, 256] + - [160, 255.0] + - - [176, 3, 2, 256, 176, 176, 176, 256] + - [183, 57.0] + - - [950, 12, 2, 256, 950, 950, 950, 256] + - [157, 1110.0] + - - [192, 12, 2, 256, 192, 192, 192, 256] + - [142, 252.0] + - - [228, 3, 2, 256, 228, 228, 228, 256] + - [183, 74.0] + - - [221, 3, 2, 256, 221, 221, 221, 256] + - [129, 71.0] + - - [192, 3, 2, 256, 192, 192, 192, 256] + - [183, 63.0] + - - [216, 12, 2, 256, 216, 216, 216, 256] + - [195, 276.0] + - - [2, 6, 1, 1024, 2, 2, 2, 1024] + - [123, 1.0] + - - [1024, 20, 1, 2, 1024, 1024, 1024, 2] + - [123, 10.0] +- null +- null +- DeviceEfficiency +... diff --git a/library/src/blas3/Tensile/Logic/asm_full/navi22_Cijk_Ailk_Bljk_SB_GB.yaml b/library/src/blas3/Tensile/Logic/asm_full/navi22_Cijk_Ailk_Bljk_SB_GB.yaml new file mode 100644 index 000000000..3a8e545d9 --- /dev/null +++ b/library/src/blas3/Tensile/Logic/asm_full/navi22_Cijk_Ailk_Bljk_SB_GB.yaml @@ -0,0 +1,73797 @@ +--- +- {MinimumRequiredVersion: 4.28.0} +- navi22 +- gfx1031 +- [Device 73df] +- AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] +- - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 4 + LSPB: 64 + LVCA: 32 + LVCB: 2 + LVPA: 1 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 0 + SolutionNameMin: Cijk_Ailk_Bljk_SB_GB_MT128x64x8_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 4 + LSPB: 64 + LVCA: 32 + LVCB: 2 + LVPA: 1 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 1 + SolutionNameMin: Cijk_Ailk_Bljk_SB_GB_MT128x128x8_SN_SU0_SUM0_TT8_16_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 128 + LVCA: 32 + LVCB: 2 + LVPA: 2 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 2 + SolutionNameMin: Cijk_Ailk_Bljk_SB_GB_MT128x128x8_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 4 + LSPB: 32 + LVCA: 32 + LVCB: 4 + LVPA: 1 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 3 + SolutionNameMin: Cijk_Ailk_Bljk_SB_GB_MT128x128x16_SN_SU0_SUM0_TT8_16_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 4 + SolutionNameMin: Cijk_Ailk_Bljk_SB_GB_MT128x128x16_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 256 + MacroTileA: 128 + MacroTileB: 256 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 5 + SolutionNameMin: Cijk_Ailk_Bljk_SB_GB_MT128x256x16_SN_SU0_SUM0_TT8_16_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 8 + LSPB: 32 + LVCA: 32 + LVCB: 8 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 6 + SolutionNameMin: Cijk_Ailk_Bljk_SB_GB_MT128x128x32_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 4 + LSPB: 64 + LVCA: 32 + LVCB: 2 + LVPA: 1 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 7 + SolutionNameMin: Cijk_Ailk_Bljk_SB_GB_MT128x64x8_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 4 + LSPB: 64 + LVCA: 32 + LVCB: 2 + LVPA: 1 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 8 + SolutionNameMin: Cijk_Ailk_Bljk_SB_GB_MT128x128x8_SN_SU32_SUM3_TT8_16_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 128 + LVCA: 32 + LVCB: 2 + LVPA: 2 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 9 + SolutionNameMin: Cijk_Ailk_Bljk_SB_GB_MT128x128x8_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 4 + LSPB: 32 + LVCA: 32 + LVCB: 4 + LVPA: 1 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 10 + SolutionNameMin: Cijk_Ailk_Bljk_SB_GB_MT128x64x16_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 4 + LSPB: 32 + LVCA: 32 + LVCB: 4 + LVPA: 1 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 11 + SolutionNameMin: Cijk_Ailk_Bljk_SB_GB_MT128x128x16_SN_SU32_SUM3_TT8_16_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 12 + SolutionNameMin: Cijk_Ailk_Bljk_SB_GB_MT128x128x16_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 256 + MacroTileA: 128 + MacroTileB: 256 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 13 + SolutionNameMin: Cijk_Ailk_Bljk_SB_GB_MT128x256x16_SN_SU32_SUM3_TT8_16_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 4 + LSPB: 16 + LVCA: 32 + LVCB: 8 + LVPA: 1 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 14 + SolutionNameMin: Cijk_Ailk_Bljk_SB_GB_MT128x64x32_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 8 + LSPB: 32 + LVCA: 32 + LVCB: 8 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 15 + SolutionNameMin: Cijk_Ailk_Bljk_SB_GB_MT128x128x32_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 4 + LSPB: 64 + LVCA: 32 + LVCB: 2 + LVPA: 1 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 16 + SolutionNameMin: Cijk_Ailk_Bljk_SB_GB_MT128x64x8_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 4 + LSPB: 64 + LVCA: 32 + LVCB: 2 + LVPA: 1 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 17 + SolutionNameMin: Cijk_Ailk_Bljk_SB_GB_MT128x128x8_SN_SU0_SUM0_TT8_16_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 128 + LVCA: 32 + LVCB: 2 + LVPA: 2 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 18 + SolutionNameMin: Cijk_Ailk_Bljk_SB_GB_MT128x128x8_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 4 + LSPB: 32 + LVCA: 32 + LVCB: 4 + LVPA: 1 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 19 + SolutionNameMin: Cijk_Ailk_Bljk_SB_GB_MT128x64x16_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 4 + LSPB: 32 + LVCA: 32 + LVCB: 4 + LVPA: 1 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 20 + SolutionNameMin: Cijk_Ailk_Bljk_SB_GB_MT128x128x16_SN_SU0_SUM0_TT8_16_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 256 + MacroTileA: 128 + MacroTileB: 256 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 21 + SolutionNameMin: Cijk_Ailk_Bljk_SB_GB_MT128x256x16_SN_SU0_SUM0_TT8_16_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 4 + LSPB: 64 + LVCA: 32 + LVCB: 2 + LVPA: 1 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 22 + SolutionNameMin: Cijk_Ailk_Bljk_SB_GB_MT128x64x8_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 4 + LSPB: 64 + LVCA: 32 + LVCB: 2 + LVPA: 1 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 23 + SolutionNameMin: Cijk_Ailk_Bljk_SB_GB_MT128x128x8_SN_SU32_SUM3_TT8_16_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 128 + LVCA: 32 + LVCB: 2 + LVPA: 2 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 24 + SolutionNameMin: Cijk_Ailk_Bljk_SB_GB_MT128x128x8_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 4 + LSPB: 32 + LVCA: 32 + LVCB: 4 + LVPA: 1 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 25 + SolutionNameMin: Cijk_Ailk_Bljk_SB_GB_MT128x64x16_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 4 + LSPB: 32 + LVCA: 32 + LVCB: 4 + LVPA: 1 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 26 + SolutionNameMin: Cijk_Ailk_Bljk_SB_GB_MT128x128x16_SN_SU32_SUM3_TT8_16_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 27 + SolutionNameMin: Cijk_Ailk_Bljk_SB_GB_MT128x128x16_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 256 + MacroTileA: 128 + MacroTileB: 256 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 28 + SolutionNameMin: Cijk_Ailk_Bljk_SB_GB_MT128x256x16_SN_SU32_SUM3_TT8_16_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 4 + LSPB: 16 + LVCA: 32 + LVCB: 8 + LVPA: 1 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 29 + SolutionNameMin: Cijk_Ailk_Bljk_SB_GB_MT128x64x32_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 4 + LSPB: 64 + LVCA: 32 + LVCB: 2 + LVPA: 1 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 30 + SolutionNameMin: Cijk_Ailk_Bljk_SB_GB_MT128x64x8_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 4 + LSPB: 64 + LVCA: 32 + LVCB: 2 + LVPA: 1 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 31 + SolutionNameMin: Cijk_Ailk_Bljk_SB_GB_MT128x128x8_SN_SU0_SUM0_TT8_16_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 128 + LVCA: 32 + LVCB: 2 + LVPA: 2 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 32 + SolutionNameMin: Cijk_Ailk_Bljk_SB_GB_MT128x128x8_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 4 + LSPB: 32 + LVCA: 32 + LVCB: 4 + LVPA: 1 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 33 + SolutionNameMin: Cijk_Ailk_Bljk_SB_GB_MT128x64x16_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 4 + LSPB: 32 + LVCA: 32 + LVCB: 4 + LVPA: 1 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 34 + SolutionNameMin: Cijk_Ailk_Bljk_SB_GB_MT128x128x16_SN_SU0_SUM0_TT8_16_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 256 + MacroTileA: 128 + MacroTileB: 256 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 35 + SolutionNameMin: Cijk_Ailk_Bljk_SB_GB_MT128x256x16_SN_SU0_SUM0_TT8_16_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 4 + LSPB: 64 + LVCA: 32 + LVCB: 2 + LVPA: 1 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 36 + SolutionNameMin: Cijk_Ailk_Bljk_SB_GB_MT128x64x8_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 4 + LSPB: 64 + LVCA: 32 + LVCB: 2 + LVPA: 1 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 37 + SolutionNameMin: Cijk_Ailk_Bljk_SB_GB_MT128x128x8_SN_SU32_SUM3_TT8_16_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 128 + LVCA: 32 + LVCB: 2 + LVPA: 2 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 38 + SolutionNameMin: Cijk_Ailk_Bljk_SB_GB_MT128x128x8_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 4 + LSPB: 32 + LVCA: 32 + LVCB: 4 + LVPA: 1 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 39 + SolutionNameMin: Cijk_Ailk_Bljk_SB_GB_MT128x64x16_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 4 + LSPB: 32 + LVCA: 32 + LVCB: 4 + LVPA: 1 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 40 + SolutionNameMin: Cijk_Ailk_Bljk_SB_GB_MT128x128x16_SN_SU32_SUM3_TT8_16_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 256 + MacroTileA: 128 + MacroTileB: 256 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 41 + SolutionNameMin: Cijk_Ailk_Bljk_SB_GB_MT128x256x16_SN_SU32_SUM3_TT8_16_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 4 + LSPB: 64 + LVCA: 32 + LVCB: 2 + LVPA: 1 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 42 + SolutionNameMin: Cijk_Ailk_Bljk_SB_GB_MT128x64x8_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 4 + LSPB: 64 + LVCA: 32 + LVCB: 2 + LVPA: 1 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 43 + SolutionNameMin: Cijk_Ailk_Bljk_SB_GB_MT128x128x8_SN_SU0_SUM0_TT8_16_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 4 + LSPB: 64 + LVCA: 32 + LVCB: 2 + LVPA: 1 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 44 + SolutionNameMin: Cijk_Ailk_Bljk_SB_GB_MT128x64x8_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 4 + LSPB: 64 + LVCA: 32 + LVCB: 2 + LVPA: 1 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 45 + SolutionNameMin: Cijk_Ailk_Bljk_SB_GB_MT128x128x8_SN_SU32_SUM3_TT8_16_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 4 + LSPB: 64 + LVCA: 32 + LVCB: 2 + LVPA: 1 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 46 + SolutionNameMin: Cijk_Ailk_Bljk_SB_GB_MT128x64x8_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 4 + LSPB: 64 + LVCA: 32 + LVCB: 2 + LVPA: 1 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 47 + SolutionNameMin: Cijk_Ailk_Bljk_SB_GB_MT128x64x8_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 4 + LSPB: 64 + LVCA: 32 + LVCB: 2 + LVPA: 1 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 48 + SolutionNameMin: Cijk_Ailk_Bljk_SB_GB_MT128x128x8_SN_SU0_SUM0_TT8_16_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 8 + LSPA: 8 + LSPB: 32 + LVCA: 8 + LVCB: 2 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 49 + SolutionNameMin: Cijk_Ailk_Bljk_SB_GB_MT32x32x8_SN_SU0_SUM0_TT4_4_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 8 + LSPA: 8 + LSPB: 32 + LVCA: 8 + LVCB: 2 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 256 + LdsOffsetB_Blk: 1280 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 50 + SolutionNameMin: Cijk_Ailk_Bljk_SB_GB_MT32x64x8_SN_SU0_SUM0_TT4_8_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 4 + LSPB: 32 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 51 + SolutionNameMin: Cijk_Ailk_Bljk_SB_GB_MT64x64x8_SN_SU0_SUM0_TT8_8_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 16 + LVCB: 2 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 52 + SolutionNameMin: Cijk_Ailk_Bljk_SB_GB_MT64x64x8_SN_SU0_SUM0_TT4_8_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 4 + LSPB: 64 + LVCA: 32 + LVCB: 2 + LVPA: 1 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 53 + SolutionNameMin: Cijk_Ailk_Bljk_SB_GB_MT128x64x8_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 128 + LVCA: 32 + LVCB: 2 + LVPA: 2 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 54 + SolutionNameMin: Cijk_Ailk_Bljk_SB_GB_MT128x128x8_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 4 + LSPB: 16 + LVCA: 16 + LVCB: 4 + LVPA: 1 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 55 + SolutionNameMin: Cijk_Ailk_Bljk_SB_GB_MT64x64x16_SN_SU0_SUM0_TT8_8_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 8 + LSPB: 32 + LVCA: 16 + LVCB: 4 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 56 + SolutionNameMin: Cijk_Ailk_Bljk_SB_GB_MT64x64x16_SN_SU0_SUM0_TT4_8_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 4 + LSPB: 32 + LVCA: 32 + LVCB: 4 + LVPA: 1 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 57 + SolutionNameMin: Cijk_Ailk_Bljk_SB_GB_MT128x64x16_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 16 + LSPB: 64 + LVCA: 16 + LVCB: 4 + LVPA: 4 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 58 + SolutionNameMin: Cijk_Ailk_Bljk_SB_GB_MT64x128x16_SN_SU0_SUM0_TT4_8_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 59 + SolutionNameMin: Cijk_Ailk_Bljk_SB_GB_MT128x128x16_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 4 + LSPB: 8 + LVCA: 16 + LVCB: 8 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 60 + SolutionNameMin: Cijk_Ailk_Bljk_SB_GB_MT64x64x32_SN_SU0_SUM0_TT8_8_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 8 + LSPB: 16 + LVCA: 16 + LVCB: 8 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 61 + SolutionNameMin: Cijk_Ailk_Bljk_SB_GB_MT64x64x32_SN_SU0_SUM0_TT4_8_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 4 + LSPB: 32 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 62 + SolutionNameMin: Cijk_Ailk_Bljk_SB_GB_MT64x64x8_SN_SU32_SUM3_TT8_8_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 16 + LVCB: 2 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 63 + SolutionNameMin: Cijk_Ailk_Bljk_SB_GB_MT64x64x8_SN_SU32_SUM3_TT4_8_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 4 + LSPB: 64 + LVCA: 32 + LVCB: 2 + LVPA: 1 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 64 + SolutionNameMin: Cijk_Ailk_Bljk_SB_GB_MT128x64x8_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 128 + LVCA: 32 + LVCB: 2 + LVPA: 2 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 65 + SolutionNameMin: Cijk_Ailk_Bljk_SB_GB_MT128x128x8_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 8 + LSPB: 16 + LVCA: 8 + LVCB: 4 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 66 + SolutionNameMin: Cijk_Ailk_Bljk_SB_GB_MT32x32x16_SN_SU32_SUM3_TT4_4_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 4 + LSPB: 16 + LVCA: 16 + LVCB: 4 + LVPA: 1 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 67 + SolutionNameMin: Cijk_Ailk_Bljk_SB_GB_MT64x64x16_SN_SU32_SUM3_TT8_8_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 8 + LSPB: 32 + LVCA: 16 + LVCB: 4 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 68 + SolutionNameMin: Cijk_Ailk_Bljk_SB_GB_MT64x64x16_SN_SU32_SUM3_TT4_8_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 4 + LSPB: 32 + LVCA: 32 + LVCB: 4 + LVPA: 1 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 69 + SolutionNameMin: Cijk_Ailk_Bljk_SB_GB_MT128x64x16_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 16 + LSPB: 64 + LVCA: 16 + LVCB: 4 + LVPA: 4 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 70 + SolutionNameMin: Cijk_Ailk_Bljk_SB_GB_MT64x64x16_SN_SU32_SUM3_TT4_4_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 16 + LSPB: 64 + LVCA: 16 + LVCB: 4 + LVPA: 4 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 71 + SolutionNameMin: Cijk_Ailk_Bljk_SB_GB_MT64x128x16_SN_SU32_SUM3_TT4_8_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 72 + SolutionNameMin: Cijk_Ailk_Bljk_SB_GB_MT128x128x16_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 4 + LSPB: 8 + LVCA: 16 + LVCB: 8 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 73 + SolutionNameMin: Cijk_Ailk_Bljk_SB_GB_MT64x64x32_SN_SU32_SUM3_TT8_8_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 74 + SolutionNameMin: Cijk_Ailk_Bljk_SB_GB_MT64x128x32_SN_SU32_SUM3_TT4_8_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 8 + LSPA: 8 + LSPB: 32 + LVCA: 8 + LVCB: 2 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 75 + SolutionNameMin: Cijk_Ailk_Bljk_SB_GB_MT32x32x8_SN_SU0_SUM0_TT4_4_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 4 + LSPB: 32 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 76 + SolutionNameMin: Cijk_Ailk_Bljk_SB_GB_MT64x64x8_SN_SU0_SUM0_TT8_8_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 16 + LVCB: 2 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 77 + SolutionNameMin: Cijk_Ailk_Bljk_SB_GB_MT64x64x8_SN_SU0_SUM0_TT4_8_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 4 + LSPB: 64 + LVCA: 32 + LVCB: 2 + LVPA: 1 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 78 + SolutionNameMin: Cijk_Ailk_Bljk_SB_GB_MT128x64x8_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 128 + LVCA: 32 + LVCB: 2 + LVPA: 2 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 79 + SolutionNameMin: Cijk_Ailk_Bljk_SB_GB_MT128x128x8_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 8 + LSPB: 16 + LVCA: 8 + LVCB: 4 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 80 + SolutionNameMin: Cijk_Ailk_Bljk_SB_GB_MT32x64x16_SN_SU0_SUM0_TT4_8_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 4 + LSPB: 16 + LVCA: 16 + LVCB: 4 + LVPA: 1 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 81 + SolutionNameMin: Cijk_Ailk_Bljk_SB_GB_MT64x64x16_SN_SU0_SUM0_TT8_8_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 4 + LSPB: 32 + LVCA: 32 + LVCB: 4 + LVPA: 1 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 82 + SolutionNameMin: Cijk_Ailk_Bljk_SB_GB_MT128x64x16_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 83 + SolutionNameMin: Cijk_Ailk_Bljk_SB_GB_MT128x128x16_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 8 + LSPA: 8 + LSPB: 32 + LVCA: 8 + LVCB: 2 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 256 + LdsOffsetB_Blk: 1280 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 84 + SolutionNameMin: Cijk_Ailk_Bljk_SB_GB_MT32x64x8_SN_SU32_SUM3_TT4_8_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 4 + LSPB: 32 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 85 + SolutionNameMin: Cijk_Ailk_Bljk_SB_GB_MT64x64x8_SN_SU32_SUM3_TT8_8_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 16 + LVCB: 2 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 86 + SolutionNameMin: Cijk_Ailk_Bljk_SB_GB_MT64x64x8_SN_SU32_SUM3_TT4_8_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 4 + LSPB: 64 + LVCA: 32 + LVCB: 2 + LVPA: 1 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 87 + SolutionNameMin: Cijk_Ailk_Bljk_SB_GB_MT128x64x8_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 128 + LVCA: 32 + LVCB: 2 + LVPA: 2 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 88 + SolutionNameMin: Cijk_Ailk_Bljk_SB_GB_MT128x128x8_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 4 + LSPB: 16 + LVCA: 16 + LVCB: 4 + LVPA: 1 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 89 + SolutionNameMin: Cijk_Ailk_Bljk_SB_GB_MT64x64x16_SN_SU32_SUM3_TT8_8_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 8 + LSPB: 32 + LVCA: 16 + LVCB: 4 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 90 + SolutionNameMin: Cijk_Ailk_Bljk_SB_GB_MT64x64x16_SN_SU32_SUM3_TT4_8_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 4 + LSPB: 32 + LVCA: 32 + LVCB: 4 + LVPA: 1 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 91 + SolutionNameMin: Cijk_Ailk_Bljk_SB_GB_MT128x64x16_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 16 + LSPB: 64 + LVCA: 16 + LVCB: 4 + LVPA: 4 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 92 + SolutionNameMin: Cijk_Ailk_Bljk_SB_GB_MT64x128x16_SN_SU32_SUM3_TT4_8_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 93 + SolutionNameMin: Cijk_Ailk_Bljk_SB_GB_MT128x128x16_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 4 + LSPB: 8 + LVCA: 16 + LVCB: 8 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 94 + SolutionNameMin: Cijk_Ailk_Bljk_SB_GB_MT64x64x32_SN_SU32_SUM3_TT8_8_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 8 + LSPB: 16 + LVCA: 16 + LVCB: 8 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 95 + SolutionNameMin: Cijk_Ailk_Bljk_SB_GB_MT64x64x32_SN_SU32_SUM3_TT4_8_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 96 + SolutionNameMin: Cijk_Ailk_Bljk_SB_GB_MT64x64x32_SN_SU32_SUM3_TT4_4_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 4 + LSPB: 32 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 97 + SolutionNameMin: Cijk_Ailk_Bljk_SB_GB_MT64x64x8_SN_SU0_SUM0_TT8_8_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 16 + LVCB: 2 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 98 + SolutionNameMin: Cijk_Ailk_Bljk_SB_GB_MT64x64x8_SN_SU0_SUM0_TT4_8_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 4 + LSPB: 64 + LVCA: 32 + LVCB: 2 + LVPA: 1 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 99 + SolutionNameMin: Cijk_Ailk_Bljk_SB_GB_MT128x64x8_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 128 + LVCA: 32 + LVCB: 2 + LVPA: 2 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 100 + SolutionNameMin: Cijk_Ailk_Bljk_SB_GB_MT128x128x8_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 4 + LSPB: 16 + LVCA: 16 + LVCB: 4 + LVPA: 1 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 101 + SolutionNameMin: Cijk_Ailk_Bljk_SB_GB_MT64x64x16_SN_SU0_SUM0_TT8_8_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 8 + LSPB: 32 + LVCA: 16 + LVCB: 4 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 102 + SolutionNameMin: Cijk_Ailk_Bljk_SB_GB_MT64x32x16_SN_SU0_SUM0_TT4_4_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 4 + LSPB: 32 + LVCA: 32 + LVCB: 4 + LVPA: 1 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 103 + SolutionNameMin: Cijk_Ailk_Bljk_SB_GB_MT128x64x16_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 16 + LSPB: 64 + LVCA: 16 + LVCB: 4 + LVPA: 4 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 104 + SolutionNameMin: Cijk_Ailk_Bljk_SB_GB_MT64x64x16_SN_SU0_SUM0_TT4_4_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 8 + LSPA: 8 + LSPB: 32 + LVCA: 8 + LVCB: 2 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 105 + SolutionNameMin: Cijk_Ailk_Bljk_SB_GB_MT32x32x8_SN_SU32_SUM3_TT4_4_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 4 + LSPB: 32 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 106 + SolutionNameMin: Cijk_Ailk_Bljk_SB_GB_MT64x64x8_SN_SU32_SUM3_TT8_8_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 8 + LSPB: 32 + LVCA: 16 + LVCB: 4 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 107 + SolutionNameMin: Cijk_Ailk_Bljk_SB_GB_MT64x32x8_SN_SU32_SUM3_TT4_4_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 16 + LVCB: 2 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 108 + SolutionNameMin: Cijk_Ailk_Bljk_SB_GB_MT64x64x8_SN_SU32_SUM3_TT4_8_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 4 + LSPB: 64 + LVCA: 32 + LVCB: 2 + LVPA: 1 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 109 + SolutionNameMin: Cijk_Ailk_Bljk_SB_GB_MT128x64x8_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 128 + LVCA: 32 + LVCB: 2 + LVPA: 2 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 110 + SolutionNameMin: Cijk_Ailk_Bljk_SB_GB_MT128x128x8_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 4 + LSPB: 16 + LVCA: 16 + LVCB: 4 + LVPA: 1 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 111 + SolutionNameMin: Cijk_Ailk_Bljk_SB_GB_MT64x64x16_SN_SU32_SUM3_TT8_8_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 8 + LSPB: 32 + LVCA: 16 + LVCB: 4 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 112 + SolutionNameMin: Cijk_Ailk_Bljk_SB_GB_MT64x64x16_SN_SU32_SUM3_TT4_8_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 4 + LSPB: 32 + LVCA: 32 + LVCB: 4 + LVPA: 1 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 113 + SolutionNameMin: Cijk_Ailk_Bljk_SB_GB_MT128x64x16_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 16 + LSPB: 64 + LVCA: 16 + LVCB: 4 + LVPA: 4 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 114 + SolutionNameMin: Cijk_Ailk_Bljk_SB_GB_MT64x64x16_SN_SU32_SUM3_TT4_4_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 16 + LSPB: 64 + LVCA: 16 + LVCB: 4 + LVPA: 4 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 115 + SolutionNameMin: Cijk_Ailk_Bljk_SB_GB_MT64x128x16_SN_SU32_SUM3_TT4_8_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 116 + SolutionNameMin: Cijk_Ailk_Bljk_SB_GB_MT128x128x16_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 4 + LSPB: 8 + LVCA: 16 + LVCB: 8 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 117 + SolutionNameMin: Cijk_Ailk_Bljk_SB_GB_MT64x64x32_SN_SU32_SUM3_TT8_8_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 8 + LSPB: 16 + LVCA: 16 + LVCB: 8 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 118 + SolutionNameMin: Cijk_Ailk_Bljk_SB_GB_MT64x64x32_SN_SU32_SUM3_TT4_8_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 119 + SolutionNameMin: Cijk_Ailk_Bljk_SB_GB_MT64x64x32_SN_SU32_SUM3_TT4_4_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 4 + LSPB: 64 + LVCA: 32 + LVCB: 2 + LVPA: 1 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 120 + SolutionNameMin: Cijk_Ailk_Bljk_SB_GB_MT128x64x8_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 128 + LVCA: 32 + LVCB: 2 + LVPA: 2 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 121 + SolutionNameMin: Cijk_Ailk_Bljk_SB_GB_MT128x128x8_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 4 + LSPB: 64 + LVCA: 32 + LVCB: 2 + LVPA: 1 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 122 + SolutionNameMin: Cijk_Ailk_Bljk_SB_GB_MT128x64x8_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 128 + LVCA: 32 + LVCB: 2 + LVPA: 2 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 123 + SolutionNameMin: Cijk_Ailk_Bljk_SB_GB_MT128x128x8_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 128 + LVCA: 32 + LVCB: 2 + LVPA: 2 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 124 + SolutionNameMin: Cijk_Ailk_Bljk_SB_GB_MT128x128x8_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 128 + LVCA: 32 + LVCB: 2 + LVPA: 2 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 125 + SolutionNameMin: Cijk_Ailk_Bljk_SB_GB_MT128x128x8_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 128 + LVCA: 32 + LVCB: 2 + LVPA: 2 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 126 + SolutionNameMin: Cijk_Ailk_Bljk_SB_GB_MT128x128x8_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 8 + LSPA: 4 + LSPB: 8 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 512 + LdsNumElementsAlignedA: 128 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 256 + LdsOffsetB: 128 + LdsOffsetB_Blk: 384 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 127 + SolutionNameMin: Cijk_Ailk_Bljk_SB_GB_MT16x16x8_SN_SU0_SUM0_TT2_2_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 8 + LSPA: 2 + LSPB: 8 + LVCA: 32 + LVCB: 8 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 128 + SolutionNameMin: Cijk_Ailk_Bljk_SB_GB_MT32x32x8_SN_SU0_SUM0_TT4_4_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 8 + LSPA: 4 + LSPB: 16 + LVCA: 32 + LVCB: 8 + LVPA: 4 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 896 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 129 + SolutionNameMin: Cijk_Ailk_Bljk_SB_GB_MT32x16x8_SN_SU0_SUM0_TT2_2_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 2 + LSPB: 16 + LVCA: 64 + LVCB: 8 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 130 + SolutionNameMin: Cijk_Ailk_Bljk_SB_GB_MT64x32x8_SN_SU0_SUM0_TT4_4_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 8 + LSPA: 8 + LSPB: 32 + LVCA: 32 + LVCB: 8 + LVPA: 8 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 131 + SolutionNameMin: Cijk_Ailk_Bljk_SB_GB_MT32x32x8_SN_SU0_SUM0_TT2_2_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 4 + LSPB: 32 + LVCA: 64 + LVCB: 8 + LVPA: 4 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 132 + SolutionNameMin: Cijk_Ailk_Bljk_SB_GB_MT64x64x8_SN_SU0_SUM0_TT4_4_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 133 + SolutionNameMin: Cijk_Ailk_Bljk_SB_GB_MT16x16x16_SN_SU0_SUM0_TT2_2_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 134 + SolutionNameMin: Cijk_Ailk_Bljk_SB_GB_MT32x16x16_SN_SU0_SUM0_TT2_2_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 2 + LSPB: 8 + LVCA: 64 + LVCB: 16 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 135 + SolutionNameMin: Cijk_Ailk_Bljk_SB_GB_MT64x32x16_SN_SU0_SUM0_TT4_4_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 8 + LSPB: 16 + LVCA: 32 + LVCB: 16 + LVPA: 8 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 136 + SolutionNameMin: Cijk_Ailk_Bljk_SB_GB_MT32x32x16_SN_SU0_SUM0_TT2_2_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 4 + LSPB: 16 + LVCA: 64 + LVCB: 16 + LVPA: 4 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 137 + SolutionNameMin: Cijk_Ailk_Bljk_SB_GB_MT64x64x16_SN_SU0_SUM0_TT4_4_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 32 + LSPA: 4 + LSPB: 2 + LVCA: 16 + LVCB: 32 + LVPA: 4 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 138 + SolutionNameMin: Cijk_Ailk_Bljk_SB_GB_MT16x16x32_SN_SU0_SUM0_TT2_2_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 4 + LSPB: 4 + LVCA: 32 + LVCB: 32 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 139 + SolutionNameMin: Cijk_Ailk_Bljk_SB_GB_MT32x16x32_SN_SU0_SUM0_TT2_2_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 140 + SolutionNameMin: Cijk_Ailk_Bljk_SB_GB_MT32x32x32_SN_SU0_SUM0_TT2_2_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 8 + LSPA: 4 + LSPB: 8 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 512 + LdsNumElementsAlignedA: 128 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 256 + LdsOffsetB: 128 + LdsOffsetB_Blk: 384 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 141 + SolutionNameMin: Cijk_Ailk_Bljk_SB_GB_MT16x16x8_SN_SU32_SUM3_TT2_2_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 8 + LSPA: 2 + LSPB: 8 + LVCA: 32 + LVCB: 8 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 142 + SolutionNameMin: Cijk_Ailk_Bljk_SB_GB_MT32x32x8_SN_SU32_SUM3_TT4_4_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 8 + LSPA: 4 + LSPB: 16 + LVCA: 32 + LVCB: 8 + LVPA: 4 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 896 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 143 + SolutionNameMin: Cijk_Ailk_Bljk_SB_GB_MT32x16x8_SN_SU32_SUM3_TT2_2_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 2 + LSPB: 16 + LVCA: 64 + LVCB: 8 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 144 + SolutionNameMin: Cijk_Ailk_Bljk_SB_GB_MT64x32x8_SN_SU32_SUM3_TT4_4_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 8 + LSPA: 8 + LSPB: 32 + LVCA: 32 + LVCB: 8 + LVPA: 8 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 145 + SolutionNameMin: Cijk_Ailk_Bljk_SB_GB_MT32x32x8_SN_SU32_SUM3_TT2_2_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 4 + LSPB: 32 + LVCA: 64 + LVCB: 8 + LVPA: 4 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 146 + SolutionNameMin: Cijk_Ailk_Bljk_SB_GB_MT64x64x8_SN_SU32_SUM3_TT4_4_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 147 + SolutionNameMin: Cijk_Ailk_Bljk_SB_GB_MT16x16x16_SN_SU32_SUM3_TT2_2_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 148 + SolutionNameMin: Cijk_Ailk_Bljk_SB_GB_MT32x16x16_SN_SU32_SUM3_TT2_2_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 2 + LSPB: 8 + LVCA: 64 + LVCB: 16 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 149 + SolutionNameMin: Cijk_Ailk_Bljk_SB_GB_MT64x32x16_SN_SU32_SUM3_TT4_4_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 8 + LSPB: 16 + LVCA: 32 + LVCB: 16 + LVPA: 8 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 150 + SolutionNameMin: Cijk_Ailk_Bljk_SB_GB_MT32x32x16_SN_SU32_SUM3_TT2_2_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 32 + LSPA: 4 + LSPB: 2 + LVCA: 16 + LVCB: 32 + LVPA: 4 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 151 + SolutionNameMin: Cijk_Ailk_Bljk_SB_GB_MT16x16x32_SN_SU32_SUM3_TT2_2_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 4 + LSPB: 4 + LVCA: 32 + LVCB: 32 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 152 + SolutionNameMin: Cijk_Ailk_Bljk_SB_GB_MT32x16x32_SN_SU32_SUM3_TT2_2_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 8 + LSPA: 4 + LSPB: 8 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 512 + LdsNumElementsAlignedA: 128 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 256 + LdsOffsetB: 128 + LdsOffsetB_Blk: 384 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 153 + SolutionNameMin: Cijk_Ailk_Bljk_SB_GB_MT16x16x8_SN_SU0_SUM0_TT2_2_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 8 + LSPA: 2 + LSPB: 8 + LVCA: 32 + LVCB: 8 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 154 + SolutionNameMin: Cijk_Ailk_Bljk_SB_GB_MT32x32x8_SN_SU0_SUM0_TT4_4_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 8 + LSPA: 4 + LSPB: 16 + LVCA: 32 + LVCB: 8 + LVPA: 4 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 896 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 155 + SolutionNameMin: Cijk_Ailk_Bljk_SB_GB_MT32x16x8_SN_SU0_SUM0_TT2_2_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 2 + LSPB: 16 + LVCA: 64 + LVCB: 8 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 156 + SolutionNameMin: Cijk_Ailk_Bljk_SB_GB_MT64x32x8_SN_SU0_SUM0_TT4_4_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 8 + LSPA: 8 + LSPB: 32 + LVCA: 32 + LVCB: 8 + LVPA: 8 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 157 + SolutionNameMin: Cijk_Ailk_Bljk_SB_GB_MT32x32x8_SN_SU0_SUM0_TT2_2_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 4 + LSPB: 32 + LVCA: 64 + LVCB: 8 + LVPA: 4 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 158 + SolutionNameMin: Cijk_Ailk_Bljk_SB_GB_MT64x64x8_SN_SU0_SUM0_TT4_4_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 159 + SolutionNameMin: Cijk_Ailk_Bljk_SB_GB_MT16x16x16_SN_SU0_SUM0_TT2_2_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 160 + SolutionNameMin: Cijk_Ailk_Bljk_SB_GB_MT32x16x16_SN_SU0_SUM0_TT2_2_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 2 + LSPB: 8 + LVCA: 64 + LVCB: 16 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 161 + SolutionNameMin: Cijk_Ailk_Bljk_SB_GB_MT64x32x16_SN_SU0_SUM0_TT4_4_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 8 + LSPB: 16 + LVCA: 32 + LVCB: 16 + LVPA: 8 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 162 + SolutionNameMin: Cijk_Ailk_Bljk_SB_GB_MT32x32x16_SN_SU0_SUM0_TT2_2_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 32 + LSPA: 4 + LSPB: 2 + LVCA: 16 + LVCB: 32 + LVPA: 4 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 163 + SolutionNameMin: Cijk_Ailk_Bljk_SB_GB_MT16x16x32_SN_SU0_SUM0_TT2_2_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 4 + LSPB: 4 + LVCA: 32 + LVCB: 32 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 164 + SolutionNameMin: Cijk_Ailk_Bljk_SB_GB_MT32x16x32_SN_SU0_SUM0_TT2_2_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 2 + LSPB: 4 + LVCA: 64 + LVCB: 32 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 16 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 8 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 165 + SolutionNameMin: Cijk_Ailk_Bljk_SB_GB_MT64x32x32_SN_SU0_SUM0_TT4_4_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 166 + SolutionNameMin: Cijk_Ailk_Bljk_SB_GB_MT32x32x32_SN_SU0_SUM0_TT2_2_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 8 + LSPA: 4 + LSPB: 8 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 512 + LdsNumElementsAlignedA: 128 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 256 + LdsOffsetB: 128 + LdsOffsetB_Blk: 384 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 167 + SolutionNameMin: Cijk_Ailk_Bljk_SB_GB_MT16x16x8_SN_SU32_SUM3_TT2_2_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 8 + LSPA: 2 + LSPB: 8 + LVCA: 32 + LVCB: 8 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 168 + SolutionNameMin: Cijk_Ailk_Bljk_SB_GB_MT32x32x8_SN_SU32_SUM3_TT4_4_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 8 + LSPA: 4 + LSPB: 16 + LVCA: 32 + LVCB: 8 + LVPA: 4 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 896 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 169 + SolutionNameMin: Cijk_Ailk_Bljk_SB_GB_MT32x16x8_SN_SU32_SUM3_TT2_2_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 2 + LSPB: 16 + LVCA: 64 + LVCB: 8 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 170 + SolutionNameMin: Cijk_Ailk_Bljk_SB_GB_MT64x32x8_SN_SU32_SUM3_TT4_4_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 8 + LSPA: 8 + LSPB: 32 + LVCA: 32 + LVCB: 8 + LVPA: 8 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 171 + SolutionNameMin: Cijk_Ailk_Bljk_SB_GB_MT32x32x8_SN_SU32_SUM3_TT2_2_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 4 + LSPB: 32 + LVCA: 64 + LVCB: 8 + LVPA: 4 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 172 + SolutionNameMin: Cijk_Ailk_Bljk_SB_GB_MT64x64x8_SN_SU32_SUM3_TT4_4_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 173 + SolutionNameMin: Cijk_Ailk_Bljk_SB_GB_MT16x16x16_SN_SU32_SUM3_TT2_2_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 174 + SolutionNameMin: Cijk_Ailk_Bljk_SB_GB_MT32x16x16_SN_SU32_SUM3_TT2_2_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 2 + LSPB: 8 + LVCA: 64 + LVCB: 16 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 175 + SolutionNameMin: Cijk_Ailk_Bljk_SB_GB_MT64x32x16_SN_SU32_SUM3_TT4_4_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 32 + LSPA: 4 + LSPB: 2 + LVCA: 16 + LVCB: 32 + LVPA: 4 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 176 + SolutionNameMin: Cijk_Ailk_Bljk_SB_GB_MT16x16x32_SN_SU32_SUM3_TT2_2_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 4 + LSPB: 4 + LVCA: 32 + LVCB: 32 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 177 + SolutionNameMin: Cijk_Ailk_Bljk_SB_GB_MT32x16x32_SN_SU32_SUM3_TT2_2_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 8 + LSPA: 4 + LSPB: 8 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 512 + LdsNumElementsAlignedA: 128 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 256 + LdsOffsetB: 128 + LdsOffsetB_Blk: 384 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 178 + SolutionNameMin: Cijk_Ailk_Bljk_SB_GB_MT16x16x8_SN_SU0_SUM0_TT2_2_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 8 + LSPA: 2 + LSPB: 8 + LVCA: 32 + LVCB: 8 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 179 + SolutionNameMin: Cijk_Ailk_Bljk_SB_GB_MT32x32x8_SN_SU0_SUM0_TT4_4_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 8 + LSPA: 4 + LSPB: 16 + LVCA: 32 + LVCB: 8 + LVPA: 4 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 896 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 180 + SolutionNameMin: Cijk_Ailk_Bljk_SB_GB_MT32x16x8_SN_SU0_SUM0_TT2_2_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 2 + LSPB: 16 + LVCA: 64 + LVCB: 8 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 181 + SolutionNameMin: Cijk_Ailk_Bljk_SB_GB_MT64x32x8_SN_SU0_SUM0_TT4_4_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 4 + LSPB: 32 + LVCA: 64 + LVCB: 8 + LVPA: 4 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 182 + SolutionNameMin: Cijk_Ailk_Bljk_SB_GB_MT64x64x8_SN_SU0_SUM0_TT4_4_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 183 + SolutionNameMin: Cijk_Ailk_Bljk_SB_GB_MT16x16x16_SN_SU0_SUM0_TT2_2_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 184 + SolutionNameMin: Cijk_Ailk_Bljk_SB_GB_MT32x16x16_SN_SU0_SUM0_TT2_2_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 2 + LSPB: 8 + LVCA: 64 + LVCB: 16 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 185 + SolutionNameMin: Cijk_Ailk_Bljk_SB_GB_MT64x32x16_SN_SU0_SUM0_TT4_4_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 8 + LSPB: 16 + LVCA: 32 + LVCB: 16 + LVPA: 8 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 186 + SolutionNameMin: Cijk_Ailk_Bljk_SB_GB_MT32x32x16_SN_SU0_SUM0_TT2_2_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 32 + LSPA: 4 + LSPB: 2 + LVCA: 16 + LVCB: 32 + LVPA: 4 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 187 + SolutionNameMin: Cijk_Ailk_Bljk_SB_GB_MT16x16x32_SN_SU0_SUM0_TT2_2_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 4 + LSPB: 4 + LVCA: 32 + LVCB: 32 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 188 + SolutionNameMin: Cijk_Ailk_Bljk_SB_GB_MT32x16x32_SN_SU0_SUM0_TT2_2_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 189 + SolutionNameMin: Cijk_Ailk_Bljk_SB_GB_MT32x32x32_SN_SU0_SUM0_TT2_2_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 8 + LSPA: 4 + LSPB: 8 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 512 + LdsNumElementsAlignedA: 128 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 256 + LdsOffsetB: 128 + LdsOffsetB_Blk: 384 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 190 + SolutionNameMin: Cijk_Ailk_Bljk_SB_GB_MT16x16x8_SN_SU32_SUM3_TT2_2_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 8 + LSPA: 2 + LSPB: 8 + LVCA: 32 + LVCB: 8 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 191 + SolutionNameMin: Cijk_Ailk_Bljk_SB_GB_MT32x32x8_SN_SU32_SUM3_TT4_4_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 8 + LSPA: 4 + LSPB: 16 + LVCA: 32 + LVCB: 8 + LVPA: 4 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 896 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 192 + SolutionNameMin: Cijk_Ailk_Bljk_SB_GB_MT32x16x8_SN_SU32_SUM3_TT2_2_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 2 + LSPB: 16 + LVCA: 64 + LVCB: 8 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 193 + SolutionNameMin: Cijk_Ailk_Bljk_SB_GB_MT64x32x8_SN_SU32_SUM3_TT4_4_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 8 + LSPA: 8 + LSPB: 32 + LVCA: 32 + LVCB: 8 + LVPA: 8 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 194 + SolutionNameMin: Cijk_Ailk_Bljk_SB_GB_MT32x32x8_SN_SU32_SUM3_TT2_2_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 4 + LSPB: 32 + LVCA: 64 + LVCB: 8 + LVPA: 4 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 195 + SolutionNameMin: Cijk_Ailk_Bljk_SB_GB_MT64x64x8_SN_SU32_SUM3_TT4_4_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 196 + SolutionNameMin: Cijk_Ailk_Bljk_SB_GB_MT16x16x16_SN_SU32_SUM3_TT2_2_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 197 + SolutionNameMin: Cijk_Ailk_Bljk_SB_GB_MT32x16x16_SN_SU32_SUM3_TT2_2_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 2 + LSPB: 8 + LVCA: 64 + LVCB: 16 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 198 + SolutionNameMin: Cijk_Ailk_Bljk_SB_GB_MT64x32x16_SN_SU32_SUM3_TT4_4_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 8 + LSPB: 16 + LVCA: 32 + LVCB: 16 + LVPA: 8 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 199 + SolutionNameMin: Cijk_Ailk_Bljk_SB_GB_MT32x32x16_SN_SU32_SUM3_TT2_2_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 32 + LSPA: 4 + LSPB: 2 + LVCA: 16 + LVCB: 32 + LVPA: 4 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 200 + SolutionNameMin: Cijk_Ailk_Bljk_SB_GB_MT16x16x32_SN_SU32_SUM3_TT2_2_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 4 + LSPB: 4 + LVCA: 32 + LVCB: 32 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 201 + SolutionNameMin: Cijk_Ailk_Bljk_SB_GB_MT32x16x32_SN_SU32_SUM3_TT2_2_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 202 + SolutionNameMin: Cijk_Ailk_Bljk_SB_GB_MT32x32x32_SN_SU32_SUM3_TT2_2_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 8 + LSPA: 2 + LSPB: 8 + LVCA: 32 + LVCB: 8 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 832 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 64 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 203 + SolutionNameMin: Cijk_Ailk_Bljk_SB_GB_MT32x8x8_SN_SU0_SUM0_TT2_2_WG16_4_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 1 + LSPB: 8 + LVCA: 64 + LVCB: 8 + LVPA: 1 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1600 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 64 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 8 + MacroTileA: 64 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 8 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 204 + SolutionNameMin: Cijk_Ailk_Bljk_SB_GB_MT64x8x8_SN_SU0_SUM0_TT4_2_WG16_4_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 2 + LSPB: 16 + LVCA: 64 + LVCB: 8 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1664 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 205 + SolutionNameMin: Cijk_Ailk_Bljk_SB_GB_MT64x16x8_SN_SU0_SUM0_TT4_2_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 2 + LSPB: 4 + LVCA: 32 + LVCB: 16 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1664 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 206 + SolutionNameMin: Cijk_Ailk_Bljk_SB_GB_MT32x8x16_SN_SU0_SUM0_TT2_2_WG16_4_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 1 + LSPB: 4 + LVCA: 64 + LVCB: 16 + LVPA: 1 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3136 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 64 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 4 + MacroTileA: 64 + MacroTileB: 4 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 16 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 207 + SolutionNameMin: Cijk_Ailk_Bljk_SB_GB_MT64x4x16_SN_SU0_SUM0_TT4_1_WG16_4_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 1] + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 1 + LSPB: 4 + LVCA: 64 + LVCB: 16 + LVPA: 1 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3200 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 8 + MacroTileA: 64 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 16 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 208 + SolutionNameMin: Cijk_Ailk_Bljk_SB_GB_MT64x8x16_SN_SU0_SUM0_TT4_2_WG16_4_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 2 + LSPB: 8 + LVCA: 64 + LVCB: 16 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3200 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 8 + MacroTileA: 64 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 209 + SolutionNameMin: Cijk_Ailk_Bljk_SB_GB_MT64x8x16_SN_SU0_SUM0_TT4_1_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 1] + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 2 + LSPB: 8 + LVCA: 64 + LVCB: 16 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3328 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 8 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 210 + SolutionNameMin: Cijk_Ailk_Bljk_SB_GB_MT64x16x16_SN_SU0_SUM0_TT4_2_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 2 + LSPB: 8 + LVCA: 64 + LVCB: 16 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3200 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 8 + MacroTileA: 64 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 211 + SolutionNameMin: Cijk_Ailk_Bljk_SB_GB_MT64x8x16_SN_SU0_SUM0_TT2_2_WG32_4_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 32 + SubGroup1: 4 + SubGroupA: 32 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 1 + LSPB: 8 + LVCA: 128 + LVCB: 16 + LVPA: 1 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 6272 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 8 + MacroTileA: 128 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 16 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 212 + SolutionNameMin: Cijk_Ailk_Bljk_SB_GB_MT128x8x16_SN_SU0_SUM0_TT4_2_WG32_4_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 32 + SubGroup1: 4 + SubGroupA: 32 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 2 + LSPB: 2 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3328 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 16 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 213 + SolutionNameMin: Cijk_Ailk_Bljk_SB_GB_MT32x8x32_SN_SU0_SUM0_TT2_2_WG16_4_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 1 + LSPB: 2 + LVCA: 64 + LVCB: 32 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 6272 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 4 + MacroTileA: 64 + MacroTileB: 4 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 32 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 32 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 214 + SolutionNameMin: Cijk_Ailk_Bljk_SB_GB_MT64x4x32_SN_SU0_SUM0_TT4_1_WG16_4_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 1] + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 2 + LSPB: 4 + LVCA: 64 + LVCB: 32 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 6400 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 8 + MacroTileA: 64 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 16 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 215 + SolutionNameMin: Cijk_Ailk_Bljk_SB_GB_MT64x8x32_SN_SU0_SUM0_TT4_1_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 1] + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 1 + LSPB: 8 + LVCA: 64 + LVCB: 8 + LVPA: 1 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1600 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 64 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 8 + MacroTileA: 64 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 8 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 216 + SolutionNameMin: Cijk_Ailk_Bljk_SB_GB_MT64x8x8_SN_SU32_SUM3_TT4_2_WG16_4_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 2 + LSPB: 4 + LVCA: 32 + LVCB: 16 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1664 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 217 + SolutionNameMin: Cijk_Ailk_Bljk_SB_GB_MT32x8x16_SN_SU32_SUM3_TT2_2_WG16_4_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 1 + LSPB: 4 + LVCA: 64 + LVCB: 16 + LVPA: 1 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3136 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 64 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 4 + MacroTileA: 64 + MacroTileB: 4 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 16 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 218 + SolutionNameMin: Cijk_Ailk_Bljk_SB_GB_MT64x4x16_SN_SU32_SUM3_TT4_1_WG16_4_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 1] + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 2 + LSPB: 8 + LVCA: 64 + LVCB: 16 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3200 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 8 + MacroTileA: 64 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 219 + SolutionNameMin: Cijk_Ailk_Bljk_SB_GB_MT64x8x16_SN_SU32_SUM3_TT4_1_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 1] + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 2 + LSPB: 8 + LVCA: 64 + LVCB: 16 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3328 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 8 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 220 + SolutionNameMin: Cijk_Ailk_Bljk_SB_GB_MT64x16x16_SN_SU32_SUM3_TT4_2_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 2 + LSPB: 8 + LVCA: 64 + LVCB: 16 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3200 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 8 + MacroTileA: 64 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 221 + SolutionNameMin: Cijk_Ailk_Bljk_SB_GB_MT64x8x16_SN_SU32_SUM3_TT2_2_WG32_4_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 32 + SubGroup1: 4 + SubGroupA: 32 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 1 + LSPB: 8 + LVCA: 128 + LVCB: 16 + LVPA: 1 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 6272 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 8 + MacroTileA: 128 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 16 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 222 + SolutionNameMin: Cijk_Ailk_Bljk_SB_GB_MT128x8x16_SN_SU32_SUM3_TT4_2_WG32_4_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 32 + SubGroup1: 4 + SubGroupA: 32 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 2 + LSPB: 4 + LVCA: 64 + LVCB: 32 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 6400 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 8 + MacroTileA: 64 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 16 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 223 + SolutionNameMin: Cijk_Ailk_Bljk_SB_GB_MT64x8x32_SN_SU32_SUM3_TT4_1_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 1] + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 8 + LSPA: 2 + LSPB: 8 + LVCA: 32 + LVCB: 8 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 832 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 64 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 224 + SolutionNameMin: Cijk_Ailk_Bljk_SB_GB_MT32x8x8_SN_SU0_SUM0_TT2_2_WG16_4_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 2 + LSPB: 16 + LVCA: 64 + LVCB: 8 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1664 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 225 + SolutionNameMin: Cijk_Ailk_Bljk_SB_GB_MT64x16x8_SN_SU0_SUM0_TT4_2_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 2 + LSPB: 4 + LVCA: 32 + LVCB: 16 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1664 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 226 + SolutionNameMin: Cijk_Ailk_Bljk_SB_GB_MT32x8x16_SN_SU0_SUM0_TT2_2_WG16_4_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 1 + LSPB: 4 + LVCA: 64 + LVCB: 16 + LVPA: 1 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3136 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 64 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 4 + MacroTileA: 64 + MacroTileB: 4 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 16 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 227 + SolutionNameMin: Cijk_Ailk_Bljk_SB_GB_MT64x4x16_SN_SU0_SUM0_TT4_1_WG16_4_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 1] + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 2 + LSPB: 8 + LVCA: 64 + LVCB: 16 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3328 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 8 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 228 + SolutionNameMin: Cijk_Ailk_Bljk_SB_GB_MT64x16x16_SN_SU0_SUM0_TT4_2_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 2 + LSPB: 8 + LVCA: 64 + LVCB: 16 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3200 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 8 + MacroTileA: 64 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 229 + SolutionNameMin: Cijk_Ailk_Bljk_SB_GB_MT64x8x16_SN_SU0_SUM0_TT2_2_WG32_4_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 32 + SubGroup1: 4 + SubGroupA: 32 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 1 + LSPB: 8 + LVCA: 128 + LVCB: 16 + LVPA: 1 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 6272 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 8 + MacroTileA: 128 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 16 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 230 + SolutionNameMin: Cijk_Ailk_Bljk_SB_GB_MT128x8x16_SN_SU0_SUM0_TT4_2_WG32_4_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 32 + SubGroup1: 4 + SubGroupA: 32 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 2 + LSPB: 4 + LVCA: 64 + LVCB: 32 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 6400 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 8 + MacroTileA: 64 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 16 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 231 + SolutionNameMin: Cijk_Ailk_Bljk_SB_GB_MT64x8x32_SN_SU0_SUM0_TT4_1_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 1] + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 2 + LSPB: 4 + LVCA: 64 + LVCB: 32 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 6400 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 8 + MacroTileA: 64 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 16 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 232 + SolutionNameMin: Cijk_Ailk_Bljk_SB_GB_MT64x8x32_SN_SU0_SUM0_TT2_2_WG32_4_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 32 + SubGroup1: 4 + SubGroupA: 32 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 1 + LSPB: 4 + LVCA: 128 + LVCB: 32 + LVPA: 1 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 12416 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 4 + MacroTileA: 128 + MacroTileB: 4 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 32 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 32 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 233 + SolutionNameMin: Cijk_Ailk_Bljk_SB_GB_MT128x4x32_SN_SU0_SUM0_TT4_1_WG32_4_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 32 + SubGroup1: 4 + SubGroupA: 32 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 1] + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 8 + LSPA: 2 + LSPB: 8 + LVCA: 32 + LVCB: 8 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 832 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 64 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 234 + SolutionNameMin: Cijk_Ailk_Bljk_SB_GB_MT32x8x8_SN_SU32_SUM3_TT2_2_WG16_4_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 1 + LSPB: 8 + LVCA: 64 + LVCB: 8 + LVPA: 1 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1600 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 64 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 8 + MacroTileA: 64 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 8 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 235 + SolutionNameMin: Cijk_Ailk_Bljk_SB_GB_MT64x8x8_SN_SU32_SUM3_TT4_2_WG16_4_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 2 + LSPB: 16 + LVCA: 64 + LVCB: 8 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1664 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 236 + SolutionNameMin: Cijk_Ailk_Bljk_SB_GB_MT64x16x8_SN_SU32_SUM3_TT4_2_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 1 + LSPB: 4 + LVCA: 64 + LVCB: 16 + LVPA: 1 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3136 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 64 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 4 + MacroTileA: 64 + MacroTileB: 4 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 16 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 237 + SolutionNameMin: Cijk_Ailk_Bljk_SB_GB_MT64x4x16_SN_SU32_SUM3_TT4_1_WG16_4_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 1] + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 2 + LSPB: 8 + LVCA: 64 + LVCB: 16 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3328 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 8 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 238 + SolutionNameMin: Cijk_Ailk_Bljk_SB_GB_MT64x16x16_SN_SU32_SUM3_TT4_2_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 2 + LSPB: 8 + LVCA: 64 + LVCB: 16 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3200 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 8 + MacroTileA: 64 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 239 + SolutionNameMin: Cijk_Ailk_Bljk_SB_GB_MT64x8x16_SN_SU32_SUM3_TT2_2_WG32_4_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 32 + SubGroup1: 4 + SubGroupA: 32 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 1 + LSPB: 8 + LVCA: 128 + LVCB: 16 + LVPA: 1 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 6272 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 8 + MacroTileA: 128 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 16 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 240 + SolutionNameMin: Cijk_Ailk_Bljk_SB_GB_MT128x8x16_SN_SU32_SUM3_TT4_2_WG32_4_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 32 + SubGroup1: 4 + SubGroupA: 32 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 2 + LSPB: 4 + LVCA: 64 + LVCB: 32 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 6656 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 16 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 241 + SolutionNameMin: Cijk_Ailk_Bljk_SB_GB_MT64x16x32_SN_SU32_SUM3_TT4_2_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 2 + LSPB: 4 + LVCA: 64 + LVCB: 32 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 6400 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 8 + MacroTileA: 64 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 16 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 242 + SolutionNameMin: Cijk_Ailk_Bljk_SB_GB_MT64x8x32_SN_SU32_SUM3_TT2_2_WG32_4_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 32 + SubGroup1: 4 + SubGroupA: 32 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 8 + LSPA: 2 + LSPB: 8 + LVCA: 32 + LVCB: 8 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 832 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 64 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 243 + SolutionNameMin: Cijk_Ailk_Bljk_SB_GB_MT32x8x8_SN_SU0_SUM0_TT2_2_WG16_4_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 2 + LSPB: 16 + LVCA: 64 + LVCB: 8 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1664 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 244 + SolutionNameMin: Cijk_Ailk_Bljk_SB_GB_MT64x16x8_SN_SU0_SUM0_TT4_2_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 1 + LSPB: 4 + LVCA: 64 + LVCB: 16 + LVPA: 1 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3136 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 64 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 4 + MacroTileA: 64 + MacroTileB: 4 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 16 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 245 + SolutionNameMin: Cijk_Ailk_Bljk_SB_GB_MT64x4x16_SN_SU0_SUM0_TT4_1_WG16_4_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 1] + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 1 + LSPB: 4 + LVCA: 64 + LVCB: 16 + LVPA: 1 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3200 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 8 + MacroTileA: 64 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 16 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 246 + SolutionNameMin: Cijk_Ailk_Bljk_SB_GB_MT64x8x16_SN_SU0_SUM0_TT4_2_WG16_4_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 2 + LSPB: 8 + LVCA: 64 + LVCB: 16 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3200 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 8 + MacroTileA: 64 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 247 + SolutionNameMin: Cijk_Ailk_Bljk_SB_GB_MT64x8x16_SN_SU0_SUM0_TT4_1_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 1] + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 2 + LSPB: 8 + LVCA: 64 + LVCB: 16 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3200 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 8 + MacroTileA: 64 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 248 + SolutionNameMin: Cijk_Ailk_Bljk_SB_GB_MT64x8x16_SN_SU0_SUM0_TT2_2_WG32_4_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 32 + SubGroup1: 4 + SubGroupA: 32 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 1 + LSPB: 8 + LVCA: 128 + LVCB: 16 + LVPA: 1 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 6272 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 8 + MacroTileA: 128 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 16 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 249 + SolutionNameMin: Cijk_Ailk_Bljk_SB_GB_MT128x8x16_SN_SU0_SUM0_TT4_2_WG32_4_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 32 + SubGroup1: 4 + SubGroupA: 32 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 1 + LSPB: 2 + LVCA: 64 + LVCB: 32 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 6272 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 4 + MacroTileA: 64 + MacroTileB: 4 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 32 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 32 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 250 + SolutionNameMin: Cijk_Ailk_Bljk_SB_GB_MT64x4x32_SN_SU0_SUM0_TT4_1_WG16_4_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 1] + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 2 + LSPB: 4 + LVCA: 64 + LVCB: 32 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 6400 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 8 + MacroTileA: 64 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 16 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 251 + SolutionNameMin: Cijk_Ailk_Bljk_SB_GB_MT64x8x32_SN_SU0_SUM0_TT4_1_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 1] + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 2 + LSPB: 4 + LVCA: 64 + LVCB: 32 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 6400 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 8 + MacroTileA: 64 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 16 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 252 + SolutionNameMin: Cijk_Ailk_Bljk_SB_GB_MT64x8x32_SN_SU0_SUM0_TT2_2_WG32_4_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 32 + SubGroup1: 4 + SubGroupA: 32 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 1 + LSPB: 4 + LVCA: 128 + LVCB: 32 + LVPA: 1 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 12416 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 4 + MacroTileA: 128 + MacroTileB: 4 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 32 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 32 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 253 + SolutionNameMin: Cijk_Ailk_Bljk_SB_GB_MT128x4x32_SN_SU0_SUM0_TT4_1_WG32_4_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 32 + SubGroup1: 4 + SubGroupA: 32 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 1] + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 8 + LSPA: 2 + LSPB: 8 + LVCA: 32 + LVCB: 8 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 832 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 64 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 254 + SolutionNameMin: Cijk_Ailk_Bljk_SB_GB_MT32x8x8_SN_SU32_SUM3_TT2_2_WG16_4_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 1 + LSPB: 8 + LVCA: 64 + LVCB: 8 + LVPA: 1 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1600 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 64 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 8 + MacroTileA: 64 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 8 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 255 + SolutionNameMin: Cijk_Ailk_Bljk_SB_GB_MT64x8x8_SN_SU32_SUM3_TT4_2_WG16_4_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 2 + LSPB: 16 + LVCA: 64 + LVCB: 8 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1664 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 256 + SolutionNameMin: Cijk_Ailk_Bljk_SB_GB_MT64x16x8_SN_SU32_SUM3_TT4_2_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 1 + LSPB: 4 + LVCA: 64 + LVCB: 16 + LVPA: 1 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3136 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 64 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 4 + MacroTileA: 64 + MacroTileB: 4 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 16 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 257 + SolutionNameMin: Cijk_Ailk_Bljk_SB_GB_MT64x4x16_SN_SU32_SUM3_TT4_1_WG16_4_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 1] + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 2 + LSPB: 8 + LVCA: 64 + LVCB: 16 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3200 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 8 + MacroTileA: 64 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 258 + SolutionNameMin: Cijk_Ailk_Bljk_SB_GB_MT64x8x16_SN_SU32_SUM3_TT2_2_WG32_4_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 32 + SubGroup1: 4 + SubGroupA: 32 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 1 + LSPB: 8 + LVCA: 128 + LVCB: 16 + LVPA: 1 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 6272 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 8 + MacroTileA: 128 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 16 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 259 + SolutionNameMin: Cijk_Ailk_Bljk_SB_GB_MT128x8x16_SN_SU32_SUM3_TT4_2_WG32_4_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 32 + SubGroup1: 4 + SubGroupA: 32 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 32 + LSPA: 2 + LSPB: 4 + LVCA: 64 + LVCB: 32 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 6400 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 8 + MacroTileA: 64 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 16 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 260 + SolutionNameMin: Cijk_Ailk_Bljk_SB_GB_MT64x8x32_SN_SU32_SUM3_TT2_2_WG32_4_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 32 + SubGroup1: 4 + SubGroupA: 32 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 832 + LdsNumElementsAlignedA: 64 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 64 + LdsOffsetB_Blk: 576 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 8 + MacroTile1: 32 + MacroTileA: 8 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 261 + SolutionNameMin: Cijk_Ailk_Bljk_SB_GB_MT8x32x8_SN_SU0_SUM0_TT1_4_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [1, 4] + ThreadTile0: 1 + ThreadTile1: 4 + ThreadTileA: 1 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 8 + LSPA: 8 + LSPB: 32 + LVCA: 32 + LVCB: 8 + LVPA: 8 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 256 + LdsOffsetB_Blk: 1280 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 262 + SolutionNameMin: Cijk_Ailk_Bljk_SB_GB_MT32x64x8_SN_SU0_SUM0_TT2_4_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 16 + LSPA: 8 + LSPB: 4 + LVCA: 8 + LVCB: 16 + LVPA: 8 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1664 + LdsNumElementsAlignedA: 128 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 128 + LdsOffsetB_Blk: 1152 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 8 + MacroTile1: 32 + MacroTileA: 8 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 8 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 263 + SolutionNameMin: Cijk_Ailk_Bljk_SB_GB_MT8x32x16_SN_SU0_SUM0_TT1_4_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [1, 4] + ThreadTile0: 1 + ThreadTile1: 4 + ThreadTileA: 1 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 832 + LdsNumElementsAlignedA: 64 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 64 + LdsOffsetB_Blk: 576 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 8 + MacroTile1: 32 + MacroTileA: 8 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 264 + SolutionNameMin: Cijk_Ailk_Bljk_SB_GB_MT8x32x8_SN_SU32_SUM3_TT1_4_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [1, 4] + ThreadTile0: 1 + ThreadTile1: 4 + ThreadTileA: 1 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 8 + LSPA: 4 + LSPB: 8 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 896 + LdsNumElementsAlignedA: 128 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 128 + LdsOffsetB_Blk: 640 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 32 + MacroTileA: 16 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 265 + SolutionNameMin: Cijk_Ailk_Bljk_SB_GB_MT16x32x8_SN_SU32_SUM3_TT2_4_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 8 + LSPA: 4 + LSPB: 16 + LVCA: 32 + LVCB: 8 + LVPA: 4 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 266 + SolutionNameMin: Cijk_Ailk_Bljk_SB_GB_MT32x32x8_SN_SU32_SUM3_TT2_4_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 8 + LSPA: 8 + LSPB: 32 + LVCA: 32 + LVCB: 8 + LVPA: 8 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 256 + LdsOffsetB_Blk: 1280 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 267 + SolutionNameMin: Cijk_Ailk_Bljk_SB_GB_MT32x64x8_SN_SU32_SUM3_TT2_4_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 16 + LSPA: 8 + LSPB: 4 + LVCA: 8 + LVCB: 16 + LVPA: 8 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1664 + LdsNumElementsAlignedA: 128 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 128 + LdsOffsetB_Blk: 1152 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 8 + MacroTile1: 32 + MacroTileA: 8 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 8 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 268 + SolutionNameMin: Cijk_Ailk_Bljk_SB_GB_MT8x32x16_SN_SU32_SUM3_TT1_4_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [1, 4] + ThreadTile0: 1 + ThreadTile1: 4 + ThreadTileA: 1 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 832 + LdsNumElementsAlignedA: 64 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 64 + LdsOffsetB_Blk: 576 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 8 + MacroTile1: 32 + MacroTileA: 8 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 269 + SolutionNameMin: Cijk_Ailk_Bljk_SB_GB_MT8x32x8_SN_SU0_SUM0_TT1_4_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [1, 4] + ThreadTile0: 1 + ThreadTile1: 4 + ThreadTileA: 1 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 832 + LdsNumElementsAlignedA: 64 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 64 + LdsOffsetB_Blk: 576 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 8 + MacroTile1: 32 + MacroTileA: 8 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 270 + SolutionNameMin: Cijk_Ailk_Bljk_SB_GB_MT8x32x8_SN_SU32_SUM3_TT1_4_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [1, 4] + ThreadTile0: 1 + ThreadTile1: 4 + ThreadTileA: 1 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 8 + LSPA: 8 + LSPB: 16 + LVCA: 16 + LVCB: 8 + LVPA: 8 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 896 + LdsNumElementsAlignedA: 128 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 128 + LdsOffsetB_Blk: 640 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 32 + MacroTileA: 16 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 271 + SolutionNameMin: Cijk_Ailk_Bljk_SB_GB_MT16x32x8_SN_SU32_SUM3_TT1_4_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [1, 4] + ThreadTile0: 1 + ThreadTile1: 4 + ThreadTileA: 1 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 16 + LSPA: 8 + LSPB: 4 + LVCA: 8 + LVCB: 16 + LVPA: 8 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1664 + LdsNumElementsAlignedA: 128 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 128 + LdsOffsetB_Blk: 1152 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 8 + MacroTile1: 32 + MacroTileA: 8 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 8 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 272 + SolutionNameMin: Cijk_Ailk_Bljk_SB_GB_MT8x32x16_SN_SU32_SUM3_TT1_4_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [1, 4] + ThreadTile0: 1 + ThreadTile1: 4 + ThreadTileA: 1 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 832 + LdsNumElementsAlignedA: 64 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 64 + LdsOffsetB_Blk: 576 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 8 + MacroTile1: 32 + MacroTileA: 8 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 273 + SolutionNameMin: Cijk_Ailk_Bljk_SB_GB_MT8x32x8_SN_SU0_SUM0_TT1_4_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [1, 4] + ThreadTile0: 1 + ThreadTile1: 4 + ThreadTileA: 1 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 8 + LSPA: 8 + LSPB: 32 + LVCA: 32 + LVCB: 8 + LVPA: 8 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 256 + LdsOffsetB_Blk: 1280 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 274 + SolutionNameMin: Cijk_Ailk_Bljk_SB_GB_MT32x64x8_SN_SU0_SUM0_TT2_4_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 832 + LdsNumElementsAlignedA: 64 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 64 + LdsOffsetB_Blk: 576 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 8 + MacroTile1: 32 + MacroTileA: 8 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 275 + SolutionNameMin: Cijk_Ailk_Bljk_SB_GB_MT8x32x8_SN_SU32_SUM3_TT1_4_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [1, 4] + ThreadTile0: 1 + ThreadTile1: 4 + ThreadTileA: 1 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 8 + LSPA: 4 + LSPB: 8 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 896 + LdsNumElementsAlignedA: 128 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 128 + LdsOffsetB_Blk: 640 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 32 + MacroTileA: 16 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 276 + SolutionNameMin: Cijk_Ailk_Bljk_SB_GB_MT16x32x8_SN_SU32_SUM3_TT2_4_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 8 + LSPA: 4 + LSPB: 16 + LVCA: 32 + LVCB: 8 + LVPA: 4 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 277 + SolutionNameMin: Cijk_Ailk_Bljk_SB_GB_MT32x32x8_SN_SU32_SUM3_TT2_4_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 16 + LSPA: 8 + LSPB: 4 + LVCA: 8 + LVCB: 16 + LVPA: 8 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1664 + LdsNumElementsAlignedA: 128 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 128 + LdsOffsetB_Blk: 1152 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 8 + MacroTile1: 32 + MacroTileA: 8 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 8 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 278 + SolutionNameMin: Cijk_Ailk_Bljk_SB_GB_MT8x32x16_SN_SU32_SUM3_TT1_4_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [1, 4] + ThreadTile0: 1 + ThreadTile1: 4 + ThreadTileA: 1 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 8 + LSPA: 4 + LSPB: 16 + LVCA: 32 + LVCB: 8 + LVPA: 4 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 896 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 279 + SolutionNameMin: Cijk_Ailk_Bljk_SB_GB_MT32x16x8_SN_SU0_SUM0_TT2_2_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 8 + LSPA: 2 + LSPB: 8 + LVCA: 32 + LVCB: 8 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 280 + SolutionNameMin: Cijk_Ailk_Bljk_SB_GB_MT32x32x8_SN_SU32_SUM3_TT4_4_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 8 + LSPA: 4 + LSPB: 16 + LVCA: 32 + LVCB: 8 + LVPA: 4 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 896 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 281 + SolutionNameMin: Cijk_Ailk_Bljk_SB_GB_MT32x16x8_SN_SU32_SUM3_TT2_2_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 2 + LSPB: 16 + LVCA: 64 + LVCB: 8 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 282 + SolutionNameMin: Cijk_Ailk_Bljk_SB_GB_MT64x32x8_SN_SU32_SUM3_TT4_4_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 283 + SolutionNameMin: Cijk_Ailk_Bljk_SB_GB_MT32x16x16_SN_SU32_SUM3_TT2_2_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 8 + LSPA: 4 + LSPB: 16 + LVCA: 32 + LVCB: 8 + LVPA: 4 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 896 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 284 + SolutionNameMin: Cijk_Ailk_Bljk_SB_GB_MT32x16x8_SN_SU32_SUM3_TT2_2_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 285 + SolutionNameMin: Cijk_Ailk_Bljk_SB_GB_MT32x16x16_SN_SU32_SUM3_TT2_2_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 286 + SolutionNameMin: Cijk_Ailk_Bljk_SB_GB_MT32x16x16_SN_SU0_SUM0_TT2_2_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 +- [2, 3, 0, 1] +- - - [1024, 4096, 1, 1024, 1024, 1024, 1024, 1024] + - [8, 11804.0] + - - [4096, 4096, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12333.0] + - - [1024, 2048, 1, 1024, 1024, 1024, 1024, 1024] + - [7, 11215.0] + - - [4096, 2048, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12230.0] + - - [768, 4096, 1, 2, 768, 768, 768, 2] + - [3, 331.0] + - - [768, 4096, 1, 768, 768, 768, 768, 768] + - [3, 11423.0] + - - [3072, 4096, 1, 768, 3072, 3072, 3072, 768] + - [3, 12221.0] + - - [768, 2048, 1, 2, 768, 768, 768, 2] + - [1, 349.0] + - - [768, 2048, 1, 768, 768, 768, 768, 768] + - [1, 10952.0] + - - [3072, 2048, 1, 768, 3072, 3072, 3072, 768] + - [1, 11817.0] + - - [3072, 1024, 1, 768, 3072, 3072, 3072, 768] + - [1, 11534.0] + - - [3072, 512, 1, 768, 3072, 3072, 3072, 768] + - [1, 11024.0] + - - [1024, 3072, 1, 1024, 1024, 1024, 1024, 1024] + - [3, 11190.0] + - - [3072, 2048, 1, 1024, 3072, 3072, 3072, 1024] + - [1, 11891.0] + - - [3072, 3072, 1, 1024, 3072, 3072, 3072, 1024] + - [1, 12365.0] + - - [3072, 512, 1, 1024, 3072, 3072, 3072, 1024] + - [1, 11036.0] + - - [3072, 4096, 1, 1024, 3072, 3072, 3072, 1024] + - [1, 12272.0] + - - [1024, 2048, 1, 2, 1024, 1024, 1024, 2] + - [7, 377.0] + - - [1024, 3072, 1, 2, 1024, 1024, 1024, 2] + - [8, 381.0] + - - [1024, 4096, 1, 2, 1024, 1024, 1024, 2] + - [1, 373.0] + - - [128, 128, 512, 64, 128, 128, 128, 64] + - [30, 10682.0] + - - [512, 512, 64, 64, 512, 512, 512, 64] + - [0, 11343.0] + - - [2944, 4288, 1, 1280, 2944, 2944, 2944, 1280] + - [1, 12103.0] + - - [2368, 5888, 1, 256, 2368, 2368, 2368, 256] + - [8, 11737.0] + - - [5888, 1856, 1, 256, 5888, 5888, 5888, 256] + - [30, 11650.0] + - - [512, 24000, 1, 1536, 512, 512, 512, 1536] + - [23, 12338.0] + - - [5888, 1408, 1, 256, 5888, 5888, 5888, 256] + - [8, 11692.0] + - - [5888, 1856, 1, 3328, 5888, 5888, 5888, 3328] + - [1, 12023.0] + - - [1856, 4288, 1, 256, 1856, 1856, 1856, 256] + - [17, 11175.0] + - - [1024, 5056, 1, 128, 1024, 1024, 1024, 128] + - [22, 11153.0] + - - [5056, 5056, 1, 3328, 5056, 5056, 5056, 3328] + - [1, 12391.0] + - - [1408, 5888, 1, 1280, 1408, 1408, 1408, 1280] + - [1, 12161.0] + - - [6144, 6000, 1, 2560, 6144, 6144, 6144, 2560] + - [8, 12584.0] + - - [2368, 6784, 1, 128, 2368, 2368, 2368, 128] + - [1, 11344.0] + - - [1024, 3584, 1, 3328, 1024, 1024, 1024, 3328] + - [1, 11588.0] + - - [512, 48000, 1, 2048, 512, 512, 512, 2048] + - [40, 12531.0] + - - [5888, 1408, 1, 1280, 5888, 5888, 5888, 1280] + - [1, 12214.0] + - - [1408, 4288, 1, 256, 1408, 1408, 1408, 256] + - [16, 11517.0] + - - [1024, 2368, 1, 256, 1024, 1024, 1024, 256] + - [16, 10983.0] + - - [1408, 1856, 1, 1280, 1408, 1408, 1408, 1280] + - [7, 11545.0] + - - [5056, 5056, 1, 1280, 5056, 5056, 5056, 1280] + - [1, 12261.0] + - - [448, 5056, 1, 256, 448, 448, 448, 256] + - [16, 9257.0] + - - [1856, 1408, 1, 128, 1856, 1856, 1856, 128] + - [0, 10015.0] + - - [6784, 256, 1, 3328, 6784, 6784, 6784, 3328] + - [30, 11072.0] + - - [1408, 3584, 1, 256, 1408, 1408, 1408, 256] + - [16, 11376.0] + - - [4288, 448, 1, 256, 4288, 4288, 4288, 256] + - [0, 10649.0] + - - [1024, 1856, 1, 128, 1024, 1024, 1024, 128] + - [0, 10119.0] + - - [4288, 2944, 1, 1280, 4288, 4288, 4288, 1280] + - [1, 12117.0] + - - [704, 5056, 1, 1280, 704, 704, 704, 1280] + - [37, 11001.0] + - - [2368, 704, 1, 3328, 2368, 2368, 2368, 3328] + - [7, 10696.0] + - - [256, 5888, 1, 256, 256, 256, 256, 256] + - [22, 9223.0] + - - [1856, 4288, 1, 3328, 1856, 1856, 1856, 3328] + - [1, 11688.0] + - - [5888, 1024, 1, 256, 5888, 5888, 5888, 256] + - [36, 11430.0] + - - [1408, 2944, 1, 256, 1408, 1408, 1408, 256] + - [34, 11225.0] + - - [6784, 5056, 1, 3328, 6784, 6784, 6784, 3328] + - [1, 12535.0] + - - [5056, 5056, 1, 256, 5056, 5056, 5056, 256] + - [8, 11942.0] + - - [704, 5056, 1, 128, 704, 704, 704, 128] + - [30, 9896.0] + - - [2368, 2944, 1, 1280, 2368, 2368, 2368, 1280] + - [1, 12064.0] + - - [6784, 6784, 1, 1280, 6784, 6784, 6784, 1280] + - [1, 12626.0] + - - [1408, 4288, 1, 1280, 1408, 1408, 1408, 1280] + - [17, 11985.0] + - - [3584, 4288, 1, 1280, 3584, 3584, 3584, 1280] + - [1, 12296.0] + - - [512, 6000, 1, 2560, 512, 512, 512, 2560] + - [36, 11520.0] + - - [2368, 704, 1, 1280, 2368, 2368, 2368, 1280] + - [7, 10517.0] + - - [5056, 4288, 1, 3328, 5056, 5056, 5056, 3328] + - [1, 12314.0] + - - [3584, 2368, 1, 3328, 3584, 3584, 3584, 3328] + - [1, 12058.0] + - - [5888, 6784, 1, 1280, 5888, 5888, 5888, 1280] + - [1, 12621.0] + - - [6784, 448, 1, 1280, 6784, 6784, 6784, 1280] + - [16, 11386.0] + - - [2944, 5888, 1, 256, 2944, 2944, 2944, 256] + - [17, 12252.0] + - - [4288, 2944, 1, 256, 4288, 4288, 4288, 256] + - [1, 11799.0] + - - [5888, 704, 1, 1280, 5888, 5888, 5888, 1280] + - [0, 11457.0] + - - [448, 5888, 1, 128, 448, 448, 448, 128] + - [22, 9150.0] + - - [5056, 2368, 1, 1280, 5056, 5056, 5056, 1280] + - [17, 12070.0] + - - [448, 3584, 1, 1280, 448, 448, 448, 1280] + - [23, 9612.0] + - - [6784, 5888, 1, 256, 6784, 6784, 6784, 256] + - [17, 12432.0] + - - [1024, 1408, 1, 256, 1024, 1024, 1024, 256] + - [22, 10264.0] + - - [2368, 2368, 1, 3328, 2368, 2368, 2368, 3328] + - [1, 11318.0] + - - [1856, 6784, 1, 128, 1856, 1856, 1856, 128] + - [23, 11183.0] + - - [5056, 704, 1, 3328, 5056, 5056, 5056, 3328] + - [22, 11586.0] + - - [1408, 1856, 1, 256, 1408, 1408, 1408, 256] + - [0, 11029.0] + - - [2368, 5056, 1, 256, 2368, 2368, 2368, 256] + - [17, 11614.0] + - - [3584, 2368, 1, 1280, 3584, 3584, 3584, 1280] + - [1, 11981.0] + - - [704, 5888, 1, 256, 704, 704, 704, 256] + - [1, 10557.0] + - - [6784, 2944, 1, 128, 6784, 6784, 6784, 128] + - [37, 11939.0] + - - [2944, 6784, 1, 3328, 2944, 2944, 2944, 3328] + - [1, 12666.0] + - - [3584, 704, 1, 3328, 3584, 3584, 3584, 3328] + - [16, 11253.0] + - - [448, 4288, 1, 256, 448, 448, 448, 256] + - [9, 9124.0] + - - [704, 2368, 1, 1280, 704, 704, 704, 1280] + - [41, 9703.0] + - - [1856, 2368, 1, 1280, 1856, 1856, 1856, 1280] + - [16, 11276.0] + - - [1856, 4288, 1, 1280, 1856, 1856, 1856, 1280] + - [17, 11677.0] + - - [256, 193600, 1, 64, 256, 256, 256, 64] + - [14, 7913.0] + - - [704, 2944, 1, 128, 704, 704, 704, 128] + - [7, 9315.0] + - - [1408, 1024, 1, 1280, 1408, 1408, 1408, 1280] + - [16, 11068.0] + - - [704, 6784, 1, 256, 704, 704, 704, 256] + - [2, 10434.0] + - - [6784, 704, 1, 256, 6784, 6784, 6784, 256] + - [16, 11231.0] + - - [5056, 1408, 1, 128, 5056, 5056, 5056, 128] + - [7, 11085.0] + - - [2048, 7000, 1, 2048, 2048, 2048, 2048, 2048] + - [1, 12388.0] + - - [5056, 704, 1, 256, 5056, 5056, 5056, 256] + - [36, 11037.0] + - - [3584, 4288, 1, 3328, 3584, 3584, 3584, 3328] + - [1, 12342.0] + - - [5888, 1856, 1, 1280, 5888, 5888, 5888, 1280] + - [1, 11941.0] + - - [2368, 3584, 1, 1280, 2368, 2368, 2368, 1280] + - [1, 11986.0] + - - [2944, 3584, 1, 3328, 2944, 2944, 2944, 3328] + - [1, 12299.0] + - - [6784, 2944, 1, 256, 6784, 6784, 6784, 256] + - [17, 12311.0] + - - [1024, 1500, 1, 2560, 1024, 1024, 1024, 2560] + - [8, 11000.0] + - - [1856, 2368, 1, 256, 1856, 1856, 1856, 256] + - [0, 11003.0] + - - [3584, 6784, 1, 3328, 3584, 3584, 3584, 3328] + - [8, 12539.0] + - - [1024, 5888, 1, 3328, 1024, 1024, 1024, 3328] + - [37, 12033.0] + - - [6144, 24000, 1, 2560, 6144, 6144, 6144, 2560] + - [1, 12435.0] + - - [5056, 4288, 1, 1280, 5056, 5056, 5056, 1280] + - [17, 12258.0] + - - [2368, 2368, 1, 1280, 2368, 2368, 2368, 1280] + - [0, 11247.0] + - - [2944, 5888, 1, 128, 2944, 2944, 2944, 128] + - [37, 11915.0] + - - [704, 5888, 1, 1280, 704, 704, 704, 1280] + - [8, 11118.0] + - - [2368, 3584, 1, 128, 2368, 2368, 2368, 128] + - [23, 11087.0] + - - [1856, 5056, 1, 128, 1856, 1856, 1856, 128] + - [7, 11054.0] + - - [2944, 6784, 1, 1280, 2944, 2944, 2944, 1280] + - [17, 12628.0] + - - [1024, 5056, 1, 1280, 1024, 1024, 1024, 1280] + - [8, 12107.0] + - - [4288, 1024, 1, 256, 4288, 4288, 4288, 256] + - [2, 11129.0] + - - [2944, 2368, 1, 128, 2944, 2944, 2944, 128] + - [30, 11313.0] + - - [5888, 448, 1, 1280, 5888, 5888, 5888, 1280] + - [0, 11040.0] + - - [704, 5888, 1, 3328, 704, 704, 704, 3328] + - [41, 11103.0] + - - [3584, 2944, 1, 256, 3584, 3584, 3584, 256] + - [40, 11766.0] + - - [1856, 2368, 1, 3328, 1856, 1856, 1856, 3328] + - [16, 11336.0] + - - [512, 6000, 1, 2816, 512, 512, 512, 2816] + - [7, 11511.0] + - - [512, 24000, 1, 2048, 512, 512, 512, 2048] + - [23, 12309.0] + - - [1408, 5056, 1, 3328, 1408, 1408, 1408, 3328] + - [17, 12361.0] + - - [1856, 1856, 1, 3328, 1856, 1856, 1856, 3328] + - [7, 11240.0] + - - [2368, 2368, 1, 256, 2368, 2368, 2368, 256] + - [30, 10640.0] + - - [4288, 4288, 1, 1280, 4288, 4288, 4288, 1280] + - [1, 12195.0] + - - [5888, 1024, 1, 1280, 5888, 5888, 5888, 1280] + - [17, 12045.0] + - - [1024, 12544, 1, 256, 1024, 1024, 1024, 256] + - [20, 11823.0] + - - [5888, 448, 1, 128, 5888, 5888, 5888, 128] + - [16, 10313.0] + - - [512, 48000, 1, 2560, 512, 512, 512, 2560] + - [23, 12547.0] + - - [704, 6784, 1, 3328, 704, 704, 704, 3328] + - [1, 11268.0] + - - [5888, 5888, 1, 1280, 5888, 5888, 5888, 1280] + - [1, 12662.0] + - - [5056, 1024, 1, 1280, 5056, 5056, 5056, 1280] + - [1, 12218.0] + - - [448, 5888, 1, 3328, 448, 448, 448, 3328] + - [16, 9939.0] + - - [1024, 2944, 1, 1280, 1024, 1024, 1024, 1280] + - [16, 11298.0] + - - [5056, 5888, 1, 1280, 5056, 5056, 5056, 1280] + - [1, 12413.0] + - - [4288, 5888, 1, 128, 4288, 4288, 4288, 128] + - [8, 11718.0] + - - [1408, 3584, 1, 128, 1408, 1408, 1408, 128] + - [16, 11245.0] + - - [448, 3584, 1, 128, 448, 448, 448, 128] + - [30, 8465.0] + - - [5888, 2944, 1, 1280, 5888, 5888, 5888, 1280] + - [17, 12599.0] + - - [2368, 5888, 1, 128, 2368, 2368, 2368, 128] + - [1, 11392.0] + - - [3584, 5888, 1, 256, 3584, 3584, 3584, 256] + - [8, 12194.0] + - - [2368, 1024, 1, 128, 2368, 2368, 2368, 128] + - [22, 10012.0] + - - [2368, 704, 1, 128, 2368, 2368, 2368, 128] + - [0, 9762.0] + - - [3584, 2368, 1, 128, 3584, 3584, 3584, 128] + - [7, 11486.0] + - - [5056, 704, 1, 128, 5056, 5056, 5056, 128] + - [22, 10517.0] + - - [5056, 1408, 1, 3328, 5056, 5056, 5056, 3328] + - [1, 12414.0] + - - [6784, 1024, 1, 3328, 6784, 6784, 6784, 3328] + - [1, 12120.0] + - - [6784, 2944, 1, 3328, 6784, 6784, 6784, 3328] + - [1, 12659.0] + - - [1856, 1856, 1, 256, 1856, 1856, 1856, 256] + - [0, 10815.0] + - - [6784, 2368, 1, 1280, 6784, 6784, 6784, 1280] + - [17, 12120.0] + - - [4288, 3584, 1, 256, 4288, 4288, 4288, 256] + - [37, 11943.0] + - - [4288, 5888, 1, 1280, 4288, 4288, 4288, 1280] + - [1, 12374.0] + - - [1024, 6000, 1, 1536, 1024, 1024, 1024, 1536] + - [1, 12171.0] + - - [4288, 1856, 1, 1280, 4288, 4288, 4288, 1280] + - [17, 11690.0] + - - [1856, 2944, 1, 3328, 1856, 1856, 1856, 3328] + - [1, 11607.0] + - - [256, 6784, 1, 3328, 256, 256, 256, 3328] + - [7, 11063.0] + - - [512, 3000, 1, 1536, 512, 512, 512, 1536] + - [37, 10808.0] + - - [256, 5056, 1, 128, 256, 256, 256, 128] + - [0, 9435.0] + - - [5056, 1024, 1, 256, 5056, 5056, 5056, 256] + - [17, 10783.0] + - - [5056, 1856, 1, 3328, 5056, 5056, 5056, 3328] + - [1, 12008.0] + - - [4288, 1408, 1, 128, 4288, 4288, 4288, 128] + - [16, 10891.0] + - - [1856, 5888, 1, 3328, 1856, 1856, 1856, 3328] + - [1, 11999.0] + - - [4288, 5056, 1, 256, 4288, 4288, 4288, 256] + - [8, 11907.0] + - - [4096, 7000, 1, 4096, 4096, 4096, 4096, 4096] + - [1, 12571.0] + - - [5056, 256, 1, 3328, 5056, 5056, 5056, 3328] + - [9, 11486.0] + - - [1024, 3000, 1, 2560, 1024, 1024, 1024, 2560] + - [36, 11520.0] + - - [1024, 5888, 1, 1280, 1024, 1024, 1024, 1280] + - [1, 11920.0] + - - [6784, 2368, 1, 128, 6784, 6784, 6784, 128] + - [36, 11582.0] + - - [1856, 1024, 1, 1280, 1856, 1856, 1856, 1280] + - [18, 11254.0] + - - [6784, 4288, 1, 1280, 6784, 6784, 6784, 1280] + - [37, 12369.0] + - - [1856, 1856, 1, 1280, 1856, 1856, 1856, 1280] + - [0, 11198.0] + - - [3072, 24000, 1, 1024, 3072, 3072, 3072, 1024] + - [23, 12538.0] + - - [1408, 5056, 1, 1280, 1408, 1408, 1408, 1280] + - [17, 12231.0] + - - [5888, 1856, 1, 128, 5888, 5888, 5888, 128] + - [30, 11498.0] + - - [448, 6784, 1, 128, 448, 448, 448, 128] + - [30, 9214.0] + - - [5056, 3584, 1, 128, 5056, 5056, 5056, 128] + - [31, 11698.0] + - - [5888, 5888, 1, 3328, 5888, 5888, 5888, 3328] + - [1, 12683.0] + - - [6784, 1024, 1, 256, 6784, 6784, 6784, 256] + - [37, 11419.0] + - - [2944, 2368, 1, 256, 2944, 2944, 2944, 256] + - [30, 11514.0] + - - [5056, 5888, 1, 3328, 5056, 5056, 5056, 3328] + - [1, 12485.0] + - - [1856, 1024, 1, 256, 1856, 1856, 1856, 256] + - [16, 10239.0] + - - [512, 48000, 1, 1536, 512, 512, 512, 1536] + - [8, 12556.0] + - - [3584, 448, 1, 1280, 3584, 3584, 3584, 1280] + - [16, 11181.0] + - - [448, 5888, 1, 256, 448, 448, 448, 256] + - [16, 9439.0] + - - [1408, 6784, 1, 3328, 1408, 1408, 1408, 3328] + - [8, 12218.0] + - - [4288, 704, 1, 128, 4288, 4288, 4288, 128] + - [30, 9852.0] + - - [5056, 2944, 1, 256, 5056, 5056, 5056, 256] + - [23, 12008.0] + - - [6784, 5888, 1, 128, 6784, 6784, 6784, 128] + - [31, 12103.0] + - - [2944, 704, 1, 128, 2944, 2944, 2944, 128] + - [16, 10331.0] + - - [1408, 3584, 1, 3328, 1408, 1408, 1408, 3328] + - [1, 12050.0] + - - [2368, 6784, 1, 256, 2368, 2368, 2368, 256] + - [17, 11799.0] + - - [5056, 1408, 1, 1280, 5056, 5056, 5056, 1280] + - [1, 12303.0] + - - [5056, 4288, 1, 128, 5056, 5056, 5056, 128] + - [16, 11524.0] + - - [4288, 2368, 1, 3328, 4288, 4288, 4288, 3328] + - [1, 11874.0] + - - [1408, 1856, 1, 128, 1408, 1408, 1408, 128] + - [7, 10210.0] + - - [1408, 5888, 1, 3328, 1408, 1408, 1408, 3328] + - [8, 12193.0] + - - [6784, 6784, 1, 256, 6784, 6784, 6784, 256] + - [17, 12424.0] + - - [5888, 5056, 1, 128, 5888, 5888, 5888, 128] + - [16, 11786.0] + - - [4288, 2368, 1, 128, 4288, 4288, 4288, 128] + - [7, 11337.0] + - - [2368, 2944, 1, 256, 2368, 2368, 2368, 256] + - [17, 11599.0] + - - [3584, 1856, 1, 1280, 3584, 3584, 3584, 1280] + - [1, 12054.0] + - - [6784, 6784, 1, 128, 6784, 6784, 6784, 128] + - [31, 12161.0] + - - [5888, 5056, 1, 256, 5888, 5888, 5888, 256] + - [8, 12155.0] + - - [8448, 48000, 1, 2816, 8448, 8448, 8448, 2816] + - [37, 12523.0] + - - [512, 6000, 1, 2048, 512, 512, 512, 2048] + - [36, 11498.0] + - - [3584, 448, 1, 256, 3584, 3584, 3584, 256] + - [16, 10594.0] + - - [448, 4288, 1, 128, 448, 448, 448, 128] + - [16, 8813.0] + - - [256, 6784, 1, 256, 256, 256, 256, 256] + - [22, 10306.0] + - - [1408, 4288, 1, 128, 1408, 1408, 1408, 128] + - [22, 11295.0] + - - [2944, 704, 1, 3328, 2944, 2944, 2944, 3328] + - [16, 11366.0] + - - [3584, 3584, 1, 256, 3584, 3584, 3584, 256] + - [23, 11958.0] + - - [3584, 5056, 1, 256, 3584, 3584, 3584, 256] + - [1, 12064.0] + - - [2944, 2368, 1, 1280, 2944, 2944, 2944, 1280] + - [1, 12072.0] + - - [704, 6784, 1, 128, 704, 704, 704, 128] + - [18, 10074.0] + - - [6784, 3584, 1, 256, 6784, 6784, 6784, 256] + - [17, 12268.0] + - - [1856, 1408, 1, 256, 1856, 1856, 1856, 256] + - [16, 10463.0] + - - [5056, 2368, 1, 128, 5056, 5056, 5056, 128] + - [0, 11440.0] + - - [2944, 2944, 1, 3328, 2944, 2944, 2944, 3328] + - [1, 12364.0] + - - [5056, 6784, 1, 256, 5056, 5056, 5056, 256] + - [17, 12209.0] + - - [1856, 3584, 1, 128, 1856, 1856, 1856, 128] + - [36, 10899.0] + - - [3584, 6784, 1, 128, 3584, 3584, 3584, 128] + - [8, 11850.0] + - - [2368, 6784, 1, 1280, 2368, 2368, 2368, 1280] + - [17, 12149.0] + - - [5056, 1856, 1, 256, 5056, 5056, 5056, 256] + - [0, 11545.0] + - - [1024, 3000, 1, 2816, 1024, 1024, 1024, 2816] + - [7, 11574.0] + - - [1024, 1856, 1, 256, 1024, 1024, 1024, 256] + - [0, 10693.0] + - - [1408, 6784, 1, 1280, 1408, 1408, 1408, 1280] + - [37, 12213.0] + - - [3584, 3584, 1, 1280, 3584, 3584, 3584, 1280] + - [1, 12329.0] + - - [7680, 24000, 1, 2560, 7680, 7680, 7680, 2560] + - [8, 12381.0] + - - [4608, 48000, 1, 1536, 4608, 4608, 4608, 1536] + - [8, 12461.0] + - - [5888, 5888, 1, 128, 5888, 5888, 5888, 128] + - [8, 12036.0] + - - [5056, 2368, 1, 3328, 5056, 5056, 5056, 3328] + - [1, 12157.0] + - - [2944, 4288, 1, 256, 2944, 2944, 2944, 256] + - [31, 11649.0] + - - [1408, 3584, 1, 1280, 1408, 1408, 1408, 1280] + - [1, 11962.0] + - - [1024, 1500, 1, 2816, 1024, 1024, 1024, 2816] + - [1, 11037.0] + - - [1024, 6000, 1, 2048, 1024, 1024, 1024, 2048] + - [23, 12247.0] + - - [512, 24000, 1, 2560, 512, 512, 512, 2560] + - [37, 12364.0] + - - [6144, 3000, 1, 2560, 6144, 6144, 6144, 2560] + - [1, 12287.0] + - - [2368, 6784, 1, 3328, 2368, 2368, 2368, 3328] + - [1, 12152.0] + - - [1856, 1408, 1, 1280, 1856, 1856, 1856, 1280] + - [22, 10871.0] + - - [6784, 704, 1, 128, 6784, 6784, 6784, 128] + - [16, 11103.0] + - - [5056, 2944, 1, 128, 5056, 5056, 5056, 128] + - [22, 11447.0] + - - [1408, 5888, 1, 256, 1408, 1408, 1408, 256] + - [1, 11761.0] + - - [704, 2944, 1, 1280, 704, 704, 704, 1280] + - [37, 10695.0] + - - [3584, 704, 1, 1280, 3584, 3584, 3584, 1280] + - [22, 11194.0] + - - [5888, 2368, 1, 256, 5888, 5888, 5888, 256] + - [3, 11723.0] + - - [2944, 6784, 1, 128, 2944, 2944, 2944, 128] + - [37, 11968.0] + - - [3584, 448, 1, 3328, 3584, 3584, 3584, 3328] + - [0, 11337.0] + - - [704, 2368, 1, 3328, 704, 704, 704, 3328] + - [5, 10218.0] + - - [4608, 6000, 1, 1536, 4608, 4608, 4608, 1536] + - [1, 12494.0] + - - [256, 5888, 1, 128, 256, 256, 256, 128] + - [30, 8389.0] + - - [2944, 2944, 1, 1280, 2944, 2944, 2944, 1280] + - [17, 12277.0] + - - [5056, 448, 1, 3328, 5056, 5056, 5056, 3328] + - [16, 11494.0] + - - [6784, 704, 1, 3328, 6784, 6784, 6784, 3328] + - [16, 11517.0] + - - [5888, 4288, 1, 128, 5888, 5888, 5888, 128] + - [16, 11731.0] + - - [1408, 2944, 1, 3328, 1408, 1408, 1408, 3328] + - [1, 12125.0] + - - [3584, 704, 1, 128, 3584, 3584, 3584, 128] + - [22, 10272.0] + - - [448, 5056, 1, 128, 448, 448, 448, 128] + - [16, 8722.0] + - - [5056, 3584, 1, 256, 5056, 5056, 5056, 256] + - [31, 12110.0] + - - [4288, 4288, 1, 256, 4288, 4288, 4288, 256] + - [17, 11915.0] + - - [1408, 5056, 1, 128, 1408, 1408, 1408, 128] + - [30, 11379.0] + - - [2944, 3584, 1, 128, 2944, 2944, 2944, 128] + - [37, 11643.0] + - - [3584, 2368, 1, 256, 3584, 3584, 3584, 256] + - [16, 11663.0] + - - [5888, 5056, 1, 1280, 5888, 5888, 5888, 1280] + - [1, 12512.0] + - - [8448, 24000, 1, 2816, 8448, 8448, 8448, 2816] + - [8, 12445.0] + - - [3584, 3584, 1, 3328, 3584, 3584, 3584, 3328] + - [1, 12372.0] + - - [3072, 1500, 1, 128, 3072, 3072, 3072, 128] + - [16, 10045.0] + - - [2048, 3136, 1, 512, 2048, 2048, 2048, 512] + - [3, 11639.0] + - - [3025, 256, 64, 64, 3025, 3025, 3025, 64] + - [6, 6572.0] + - - [5888, 6784, 1, 256, 5888, 5888, 5888, 256] + - [17, 12401.0] + - - [4288, 2944, 1, 3328, 4288, 4288, 4288, 3328] + - [1, 12201.0] + - - [256, 5056, 1, 1280, 256, 256, 256, 1280] + - [7, 11069.0] + - - [2944, 5888, 1, 3328, 2944, 2944, 2944, 3328] + - [31, 12609.0] + - - [6784, 5888, 1, 1280, 6784, 6784, 6784, 1280] + - [1, 12653.0] + - - [5888, 4288, 1, 1280, 5888, 5888, 5888, 1280] + - [1, 12338.0] + - - [1024, 24000, 1, 2048, 1024, 1024, 1024, 2048] + - [11, 12375.0] + - - [5888, 3584, 1, 128, 5888, 5888, 5888, 128] + - [37, 11826.0] + - - [6784, 6784, 1, 3328, 6784, 6784, 6784, 3328] + - [1, 12590.0] + - - [704, 3584, 1, 128, 704, 704, 704, 128] + - [30, 5904.0] + - - [5888, 448, 1, 3328, 5888, 5888, 5888, 3328] + - [16, 11131.0] + - - [2368, 4288, 1, 1280, 2368, 2368, 2368, 1280] + - [17, 11784.0] + - - [4288, 2944, 1, 128, 4288, 4288, 4288, 128] + - [8, 11319.0] + - - [5056, 2944, 1, 3328, 5056, 5056, 5056, 3328] + - [1, 12492.0] + - - [2944, 3584, 1, 256, 2944, 2944, 2944, 256] + - [17, 11769.0] + - - [1408, 1408, 1, 3328, 1408, 1408, 1408, 3328] + - [16, 10932.0] + - - [3584, 3584, 1, 128, 3584, 3584, 3584, 128] + - [16, 11491.0] + - - [3584, 704, 1, 256, 3584, 3584, 3584, 256] + - [16, 10841.0] + - - [3584, 1408, 1, 3328, 3584, 3584, 3584, 3328] + - [1, 12008.0] + - - [704, 3584, 1, 1280, 704, 704, 704, 1280] + - [0, 10519.0] + - - [1024, 1408, 1, 128, 1024, 1024, 1024, 128] + - [7, 9744.0] + - - [1856, 6784, 1, 256, 1856, 1856, 1856, 256] + - [17, 11659.0] + - - [4288, 448, 1, 3328, 4288, 4288, 4288, 3328] + - [22, 11339.0] + - - [6784, 4288, 1, 128, 6784, 6784, 6784, 128] + - [16, 11774.0] + - - [6784, 704, 1, 1280, 6784, 6784, 6784, 1280] + - [0, 11495.0] + - - [3584, 6784, 1, 256, 3584, 3584, 3584, 256] + - [23, 12251.0] + - - [5888, 1024, 1, 3328, 5888, 5888, 5888, 3328] + - [1, 12151.0] + - - [704, 6784, 1, 1280, 704, 704, 704, 1280] + - [1, 11252.0] + - - [1856, 5056, 1, 3328, 1856, 1856, 1856, 3328] + - [1, 11963.0] + - - [1024, 3584, 1, 128, 1024, 1024, 1024, 128] + - [7, 10500.0] + - - [2368, 2944, 1, 128, 2368, 2368, 2368, 128] + - [0, 11019.0] + - - [5888, 2944, 1, 3328, 5888, 5888, 5888, 3328] + - [1, 12646.0] + - - [1408, 2368, 1, 128, 1408, 1408, 1408, 128] + - [30, 10728.0] + - - [5888, 2368, 1, 128, 5888, 5888, 5888, 128] + - [16, 11587.0] + - - [3584, 6784, 1, 1280, 3584, 3584, 3584, 1280] + - [23, 12535.0] + - - [4288, 1856, 1, 256, 4288, 4288, 4288, 256] + - [16, 11340.0] + - - [1856, 5888, 1, 256, 1856, 1856, 1856, 256] + - [23, 11563.0] + - - [4288, 4288, 1, 3328, 4288, 4288, 4288, 3328] + - [1, 12268.0] + - - [4288, 1408, 1, 1280, 4288, 4288, 4288, 1280] + - [23, 11986.0] + - - [3584, 5056, 1, 128, 3584, 3584, 3584, 128] + - [16, 11686.0] + - - [4288, 2368, 1, 256, 4288, 4288, 4288, 256] + - [7, 11508.0] + - - [2944, 5056, 1, 1280, 2944, 2944, 2944, 1280] + - [1, 12446.0] + - - [448, 6784, 1, 256, 448, 448, 448, 256] + - [16, 9554.0] + - - [1856, 2368, 1, 128, 1856, 1856, 1856, 128] + - [16, 10594.0] + - - [6784, 2368, 1, 3328, 6784, 6784, 6784, 3328] + - [1, 12176.0] + - - [1408, 6784, 1, 128, 1408, 1408, 1408, 128] + - [0, 11450.0] + - - [4288, 1856, 1, 3328, 4288, 4288, 4288, 3328] + - [1, 11784.0] + - - [3584, 448, 1, 128, 3584, 3584, 3584, 128] + - [36, 9266.0] + - - [3584, 1024, 1, 1280, 3584, 3584, 3584, 1280] + - [1, 11470.0] + - - [1856, 5056, 1, 256, 1856, 1856, 1856, 256] + - [17, 11371.0] + - - [6784, 4288, 1, 3328, 6784, 6784, 6784, 3328] + - [1, 12397.0] + - - [1024, 4288, 1, 256, 1024, 1024, 1024, 256] + - [0, 11357.0] + - - [5888, 3584, 1, 3328, 5888, 5888, 5888, 3328] + - [23, 12558.0] + - - [5056, 3584, 1, 3328, 5056, 5056, 5056, 3328] + - [1, 12515.0] + - - [2368, 1408, 1, 1280, 2368, 2368, 2368, 1280] + - [30, 11242.0] + - - [5056, 2944, 1, 1280, 5056, 5056, 5056, 1280] + - [17, 12397.0] + - - [8448, 6000, 1, 2816, 8448, 8448, 8448, 2816] + - [20, 12495.0] + - - [3584, 2944, 1, 1280, 3584, 3584, 3584, 1280] + - [23, 12234.0] + - - [1024, 6784, 1, 256, 1024, 1024, 1024, 256] + - [1, 11537.0] + - - [6784, 448, 1, 256, 6784, 6784, 6784, 256] + - [16, 11052.0] + - - [5124, 9124, 1, 2048, 5124, 5124, 5124, 2048] + - [8, 12135.0] + - - [2944, 5056, 1, 3328, 2944, 2944, 2944, 3328] + - [1, 12417.0] + - - [2944, 1408, 1, 128, 2944, 2944, 2944, 128] + - [30, 10582.0] + - - [5056, 6784, 1, 3328, 5056, 5056, 5056, 3328] + - [1, 12486.0] + - - [704, 2368, 1, 128, 704, 704, 704, 128] + - [22, 5284.0] + - - [3072, 1500, 1, 1024, 3072, 3072, 3072, 1024] + - [1, 11408.0] + - - [3584, 4288, 1, 256, 3584, 3584, 3584, 256] + - [23, 11904.0] + - - [1856, 6784, 1, 3328, 1856, 1856, 1856, 3328] + - [1, 12125.0] + - - [5888, 4288, 1, 256, 5888, 5888, 5888, 256] + - [8, 12051.0] + - - [5056, 1408, 1, 256, 5056, 5056, 5056, 256] + - [37, 11565.0] + - - [3584, 1024, 1, 256, 3584, 3584, 3584, 256] + - [16, 11156.0] + - - [512, 6000, 1, 1536, 512, 512, 512, 1536] + - [7, 11460.0] + - - [5888, 5888, 1, 256, 5888, 5888, 5888, 256] + - [17, 12367.0] + - - [4288, 1024, 1, 1280, 4288, 4288, 4288, 1280] + - [1, 11821.0] + - - [448, 6784, 1, 3328, 448, 448, 448, 3328] + - [37, 10252.0] + - - [2944, 1408, 1, 1280, 2944, 2944, 2944, 1280] + - [1, 11956.0] + - - [3072, 6000, 1, 1024, 3072, 3072, 3072, 1024] + - [1, 12315.0] + - - [2944, 1856, 1, 3328, 2944, 2944, 2944, 3328] + - [1, 11610.0] + - - [3584, 5888, 1, 1280, 3584, 3584, 3584, 1280] + - [23, 12457.0] + - - [6784, 1856, 1, 1280, 6784, 6784, 6784, 1280] + - [1, 12056.0] + - - [2944, 5056, 1, 256, 2944, 2944, 2944, 256] + - [8, 11999.0] + - - [5888, 256, 1, 3328, 5888, 5888, 5888, 3328] + - [1, 11000.0] + - - [2944, 4288, 1, 128, 2944, 2944, 2944, 128] + - [22, 11499.0] + - - [3584, 1408, 1, 256, 3584, 3584, 3584, 256] + - [1, 11386.0] + - - [704, 3584, 1, 3328, 704, 704, 704, 3328] + - [16, 10568.0] + - - [5056, 448, 1, 1280, 5056, 5056, 5056, 1280] + - [16, 11335.0] + - - [3584, 1856, 1, 3328, 3584, 3584, 3584, 3328] + - [1, 12091.0] + - - [4288, 6784, 1, 1280, 4288, 4288, 4288, 1280] + - [23, 12351.0] + - - [1024, 3000, 1, 2048, 1024, 1024, 1024, 2048] + - [36, 11447.0] + - - [2944, 1024, 1, 256, 2944, 2944, 2944, 256] + - [30, 10959.0] + - - [2368, 4288, 1, 3328, 2368, 2368, 2368, 3328] + - [1, 11824.0] + - - [1024, 1408, 1, 1280, 1024, 1024, 1024, 1280] + - [16, 10866.0] + - - [6784, 5056, 1, 256, 6784, 6784, 6784, 256] + - [1, 12203.0] + - - [1856, 1856, 1, 128, 1856, 1856, 1856, 128] + - [30, 10365.0] + - - [4288, 5888, 1, 256, 4288, 4288, 4288, 256] + - [17, 12082.0] + - - [2944, 6784, 1, 256, 2944, 2944, 2944, 256] + - [1, 12283.0] + - - [2944, 2944, 1, 128, 2944, 2944, 2944, 128] + - [37, 11499.0] + - - [1856, 3584, 1, 1280, 1856, 1856, 1856, 1280] + - [17, 12036.0] + - - [3584, 1408, 1, 1280, 3584, 3584, 3584, 1280] + - [1, 11942.0] + - - [4288, 448, 1, 128, 4288, 4288, 4288, 128] + - [0, 9385.0] + - - [5056, 256, 1, 1280, 5056, 5056, 5056, 1280] + - [2, 11376.0] + - - [1856, 1408, 1, 3328, 1856, 1856, 1856, 3328] + - [16, 10974.0] + - - [1024, 4288, 1, 3328, 1024, 1024, 1024, 3328] + - [17, 11856.0] + - - [5056, 448, 1, 256, 5056, 5056, 5056, 256] + - [16, 10307.0] + - - [2944, 2368, 1, 3328, 2944, 2944, 2944, 3328] + - [17, 12120.0] + - - [704, 4288, 1, 3328, 704, 704, 704, 3328] + - [7, 10363.0] + - - [1024, 1856, 1, 1280, 1024, 1024, 1024, 1280] + - [8, 11179.0] + - - [6784, 1856, 1, 256, 6784, 6784, 6784, 256] + - [16, 11687.0] + - - [512, 48000, 1, 2816, 512, 512, 512, 2816] + - [17, 12609.0] + - - [512, 3000, 1, 2816, 512, 512, 512, 2816] + - [1, 10997.0] + - - [1024, 5888, 1, 256, 1024, 1024, 1024, 256] + - [16, 11491.0] + - - [6784, 1408, 1, 256, 6784, 6784, 6784, 256] + - [8, 11830.0] + - - [1408, 2368, 1, 256, 1408, 1408, 1408, 256] + - [16, 11102.0] + - - [1408, 1408, 1, 256, 1408, 1408, 1408, 256] + - [16, 10443.0] + - - [2368, 2368, 1, 128, 2368, 2368, 2368, 128] + - [0, 10816.0] + - - [6784, 1408, 1, 128, 6784, 6784, 6784, 128] + - [23, 11541.0] + - - [1408, 5056, 1, 256, 1408, 1408, 1408, 256] + - [17, 11793.0] + - - [512, 50176, 1, 128, 512, 512, 512, 128] + - [0, 11709.0] + - - [4288, 3584, 1, 128, 4288, 4288, 4288, 128] + - [8, 11563.0] + - - [3584, 5056, 1, 1280, 3584, 3584, 3584, 1280] + - [17, 12461.0] + - - [1856, 1024, 1, 128, 1856, 1856, 1856, 128] + - [4, 9548.0] + - - [1024, 24000, 1, 1536, 1024, 1024, 1024, 1536] + - [8, 12410.0] + - - [704, 4288, 1, 256, 704, 704, 704, 256] + - [30, 9865.0] + - - [5888, 2368, 1, 1280, 5888, 5888, 5888, 1280] + - [1, 12172.0] + - - [6784, 1856, 1, 3328, 6784, 6784, 6784, 3328] + - [1, 12144.0] + - - [2368, 5888, 1, 1280, 2368, 2368, 2368, 1280] + - [1, 12184.0] + - - [5888, 256, 1, 1280, 5888, 5888, 5888, 1280] + - [1, 10817.0] + - - [2368, 1856, 1, 3328, 2368, 2368, 2368, 3328] + - [0, 11326.0] + - - [2944, 704, 1, 256, 2944, 2944, 2944, 256] + - [16, 10259.0] + - - [2368, 1024, 1, 3328, 2368, 2368, 2368, 3328] + - [5, 11189.0] + - - [704, 3584, 1, 256, 704, 704, 704, 256] + - [16, 9646.0] + - - [704, 2944, 1, 3328, 704, 704, 704, 3328] + - [8, 10931.0] + - - [6784, 1024, 1, 128, 6784, 6784, 6784, 128] + - [30, 10844.0] + - - [2944, 1024, 1, 3328, 2944, 2944, 2944, 3328] + - [30, 11411.0] + - - [2944, 5056, 1, 128, 2944, 2944, 2944, 128] + - [0, 11595.0] + - - [1408, 6784, 1, 256, 1408, 1408, 1408, 256] + - [17, 11751.0] + - - [6784, 1408, 1, 3328, 6784, 6784, 6784, 3328] + - [1, 12264.0] + - - [4288, 6784, 1, 128, 4288, 4288, 4288, 128] + - [8, 11809.0] + - - [1408, 2944, 1, 128, 1408, 1408, 1408, 128] + - [16, 10944.0] + - - [6784, 2944, 1, 1280, 6784, 6784, 6784, 1280] + - [1, 12643.0] + - - [4288, 1856, 1, 128, 4288, 4288, 4288, 128] + - [0, 11143.0] + - - [1856, 2944, 1, 128, 1856, 1856, 1856, 128] + - [16, 10813.0] + - - [6784, 448, 1, 128, 6784, 6784, 6784, 128] + - [16, 10867.0] + - - [448, 5056, 1, 1280, 448, 448, 448, 1280] + - [17, 10340.0] + - - [4288, 5056, 1, 1280, 4288, 4288, 4288, 1280] + - [1, 12233.0] + - - [2368, 1856, 1, 128, 2368, 2368, 2368, 128] + - [30, 10387.0] + - - [4288, 704, 1, 256, 4288, 4288, 4288, 256] + - [0, 10842.0] + - - [5888, 704, 1, 256, 5888, 5888, 5888, 256] + - [16, 11180.0] + - - [3584, 1024, 1, 128, 3584, 3584, 3584, 128] + - [7, 10720.0] + - - [256, 5888, 1, 3328, 256, 256, 256, 3328] + - [11, 10773.0] + - - [1408, 4288, 1, 3328, 1408, 1408, 1408, 3328] + - [17, 12104.0] + - - [6784, 4288, 1, 256, 6784, 6784, 6784, 256] + - [17, 12100.0] + - - [5888, 256, 1, 256, 5888, 5888, 5888, 256] + - [32, 10086.0] + - - [6784, 1024, 1, 1280, 6784, 6784, 6784, 1280] + - [8, 12082.0] + - - [5888, 1024, 1, 128, 5888, 5888, 5888, 128] + - [7, 11283.0] + - - [6784, 3584, 1, 1280, 6784, 6784, 6784, 1280] + - [17, 12508.0] + - - [1024, 6784, 1, 1280, 1024, 1024, 1024, 1280] + - [1, 11996.0] + - - [1408, 2944, 1, 1280, 1408, 1408, 1408, 1280] + - [1, 12006.0] + - - [1408, 2368, 1, 3328, 1408, 1408, 1408, 3328] + - [1, 11448.0] + - - [2944, 1856, 1, 128, 2944, 2944, 2944, 128] + - [0, 10613.0] + - - [256, 6784, 1, 128, 256, 256, 256, 128] + - [16, 8227.0] + - - [5056, 6784, 1, 128, 5056, 5056, 5056, 128] + - [17, 11807.0] + - - [4288, 5056, 1, 128, 4288, 4288, 4288, 128] + - [16, 11512.0] + - - [1856, 5888, 1, 128, 1856, 1856, 1856, 128] + - [8, 11130.0] + - - [3584, 1856, 1, 256, 3584, 3584, 3584, 256] + - [16, 11542.0] + - - [4288, 3584, 1, 1280, 4288, 4288, 4288, 1280] + - [1, 12350.0] + - - [704, 5888, 1, 128, 704, 704, 704, 128] + - [16, 10091.0] + - - [6784, 3584, 1, 128, 6784, 6784, 6784, 128] + - [31, 11956.0] + - - [5124, 1500, 1, 2048, 5124, 5124, 5124, 2048] + - [1, 11762.0] + - - [4288, 5056, 1, 3328, 4288, 4288, 4288, 3328] + - [1, 12302.0] + - - [1408, 1408, 1, 128, 1408, 1408, 1408, 128] + - [16, 9634.0] + - - [5056, 2368, 1, 256, 5056, 5056, 5056, 256] + - [11, 11553.0] + - - [4288, 704, 1, 3328, 4288, 4288, 4288, 3328] + - [16, 11385.0] + - - [448, 3584, 1, 256, 448, 448, 448, 256] + - [22, 8901.0] + - - [2368, 1024, 1, 1280, 2368, 2368, 2368, 1280] + - [1, 11171.0] + - - [2944, 1408, 1, 3328, 2944, 2944, 2944, 3328] + - [17, 12121.0] + - - [6144, 1500, 1, 2560, 6144, 6144, 6144, 2560] + - [1, 12154.0] + - - [1024, 1408, 1, 3328, 1024, 1024, 1024, 3328] + - [7, 11164.0] + - - [2944, 5888, 1, 1280, 2944, 2944, 2944, 1280] + - [17, 12502.0] + - - [5888, 3584, 1, 256, 5888, 5888, 5888, 256] + - [17, 12189.0] + - - [2368, 5056, 1, 128, 2368, 2368, 2368, 128] + - [23, 11202.0] + - - [1408, 1856, 1, 3328, 1408, 1408, 1408, 3328] + - [16, 11682.0] + - - [5888, 5056, 1, 3328, 5888, 5888, 5888, 3328] + - [1, 12546.0] + - - [7680, 6000, 1, 2560, 7680, 7680, 7680, 2560] + - [3, 12523.0] + - - [6784, 1408, 1, 1280, 6784, 6784, 6784, 1280] + - [1, 12227.0] + - - [512, 3000, 1, 2560, 512, 512, 512, 2560] + - [8, 10949.0] + - - [704, 2944, 1, 256, 704, 704, 704, 256] + - [16, 9485.0] + - - [6784, 5888, 1, 3328, 6784, 6784, 6784, 3328] + - [1, 12625.0] + - - [2368, 4288, 1, 128, 2368, 2368, 2368, 128] + - [30, 11028.0] + - - [1024, 6784, 1, 128, 1024, 1024, 1024, 128] + - [7, 11134.0] + - - [1024, 1500, 1, 1536, 1024, 1024, 1024, 1536] + - [8, 10805.0] + - - [1408, 1408, 1, 1280, 1408, 1408, 1408, 1280] + - [30, 10577.0] + - - [3072, 3000, 1, 1024, 3072, 3072, 3072, 1024] + - [1, 12054.0] + - - [448, 4288, 1, 3328, 448, 448, 448, 3328] + - [17, 10078.0] + - - [2368, 1408, 1, 256, 2368, 2368, 2368, 256] + - [30, 10301.0] + - - [704, 2368, 1, 256, 704, 704, 704, 256] + - [33, 7822.0] + - - [1024, 24000, 1, 2560, 1024, 1024, 1024, 2560] + - [11, 12386.0] + - - [5888, 2368, 1, 3328, 5888, 5888, 5888, 3328] + - [1, 12181.0] + - - [5124, 9124, 1, 1760, 5124, 5124, 5124, 1760] + - [31, 12241.0] + - - [4288, 448, 1, 1280, 4288, 4288, 4288, 1280] + - [0, 11171.0] + - - [5888, 704, 1, 3328, 5888, 5888, 5888, 3328] + - [16, 11510.0] + - - [5056, 256, 1, 128, 5056, 5056, 5056, 128] + - [7, 8747.0] + - - [1024, 6784, 1, 3328, 1024, 1024, 1024, 3328] + - [8, 12060.0] + - - [1408, 5888, 1, 128, 1408, 1408, 1408, 128] + - [30, 11301.0] + - - [512, 3136, 1, 2048, 512, 512, 512, 2048] + - [9, 11355.0] + - - [1408, 1024, 1, 256, 1408, 1408, 1408, 256] + - [16, 9708.0] + - - [8448, 1500, 1, 2816, 8448, 8448, 8448, 2816] + - [1, 12203.0] + - - [2560, 7000, 1, 2560, 2560, 2560, 2560, 2560] + - [1, 12554.0] + - - [5056, 6784, 1, 1280, 5056, 5056, 5056, 1280] + - [1, 12480.0] + - - [704, 5056, 1, 3328, 704, 704, 704, 3328] + - [17, 11115.0] + - - [3584, 5056, 1, 3328, 3584, 3584, 3584, 3328] + - [1, 12442.0] + - - [2368, 2944, 1, 3328, 2368, 2368, 2368, 3328] + - [17, 12112.0] + - - [2368, 3584, 1, 256, 2368, 2368, 2368, 256] + - [8, 11304.0] + - - [4608, 3000, 1, 1536, 4608, 4608, 4608, 1536] + - [1, 12064.0] + - - [5056, 3584, 1, 1280, 5056, 5056, 5056, 1280] + - [1, 12373.0] + - - [5124, 9124, 1, 4096, 5124, 5124, 5124, 4096] + - [26, 12116.0] + - - [7680, 48000, 1, 2560, 7680, 7680, 7680, 2560] + - [1, 12698.0] + - - [1856, 2944, 1, 1280, 1856, 1856, 1856, 1280] + - [1, 11452.0] + - - [4608, 1500, 1, 1536, 4608, 4608, 4608, 1536] + - [1, 11968.0] + - - [1024, 48000, 1, 2816, 1024, 1024, 1024, 2816] + - [3, 12579.0] + - - [5124, 9124, 1, 2560, 5124, 5124, 5124, 2560] + - [1, 12189.0] + - - [2944, 1408, 1, 256, 2944, 2944, 2944, 256] + - [30, 8414.0] + - - [4288, 1408, 1, 3328, 4288, 4288, 4288, 3328] + - [1, 12110.0] + - - [5888, 2944, 1, 128, 5888, 5888, 5888, 128] + - [8, 11850.0] + - - [2944, 1024, 1, 128, 2944, 2944, 2944, 128] + - [16, 10761.0] + - - [5124, 700, 1, 2048, 5124, 5124, 5124, 2048] + - [0, 11188.0] + - - [6784, 5056, 1, 128, 6784, 6784, 6784, 128] + - [8, 11851.0] + - - [256, 12544, 1, 1024, 256, 256, 256, 1024] + - [8, 11735.0] + - - [5888, 1408, 1, 3328, 5888, 5888, 5888, 3328] + - [1, 12250.0] + - - [2368, 1856, 1, 256, 2368, 2368, 2368, 256] + - [16, 10981.0] + - - [256, 5056, 1, 256, 256, 256, 256, 256] + - [39, 9909.0] + - - [5056, 5056, 1, 128, 5056, 5056, 5056, 128] + - [30, 11570.0] + - - [448, 3584, 1, 3328, 448, 448, 448, 3328] + - [5, 9884.0] + - - [5888, 256, 1, 128, 5888, 5888, 5888, 128] + - [27, 9135.0] + - - [3584, 1856, 1, 128, 3584, 3584, 3584, 128] + - [36, 10753.0] + - - [4288, 4288, 1, 128, 4288, 4288, 4288, 128] + - [31, 11517.0] + - - [1856, 1024, 1, 3328, 1856, 1856, 1856, 3328] + - [5, 11668.0] + - - [1856, 4288, 1, 128, 1856, 1856, 1856, 128] + - [7, 10656.0] + - - [1024, 6000, 1, 2560, 1024, 1024, 1024, 2560] + - [8, 12194.0] + - - [1024, 5056, 1, 256, 1024, 1024, 1024, 256] + - [0, 11365.0] + - - [5056, 5888, 1, 128, 5056, 5056, 5056, 128] + - [31, 11778.0] + - - [2368, 1408, 1, 3328, 2368, 2368, 2368, 3328] + - [1, 11473.0] + - - [1024, 48000, 1, 1536, 1024, 1024, 1024, 1536] + - [8, 12535.0] + - - [5888, 448, 1, 256, 5888, 5888, 5888, 256] + - [16, 10641.0] + - - [5888, 6784, 1, 128, 5888, 5888, 5888, 128] + - [8, 12099.0] + - - [6784, 5056, 1, 1280, 6784, 6784, 6784, 1280] + - [1, 12526.0] + - - [5056, 704, 1, 1280, 5056, 5056, 5056, 1280] + - [0, 11566.0] + - - [1024, 48000, 1, 2560, 1024, 1024, 1024, 2560] + - [8, 12376.0] + - - [1024, 2368, 1, 128, 1024, 1024, 1024, 128] + - [7, 9810.0] + - - [3072, 48000, 1, 1024, 3072, 3072, 3072, 1024] + - [26, 12545.0] + - - [1024, 5888, 1, 128, 1024, 1024, 1024, 128] + - [22, 10773.0] + - - [3584, 5888, 1, 128, 3584, 3584, 3584, 128] + - [20, 11837.0] + - - [5056, 5888, 1, 256, 5056, 5056, 5056, 256] + - [31, 12154.0] + - - [2368, 1024, 1, 256, 2368, 2368, 2368, 256] + - [2, 10391.0] + - - [2944, 1856, 1, 256, 2944, 2944, 2944, 256] + - [16, 11410.0] + - - [1856, 6784, 1, 1280, 1856, 1856, 1856, 1280] + - [37, 12073.0] + - - [8448, 3000, 1, 2816, 8448, 8448, 8448, 2816] + - [17, 12263.0] + - - [6784, 448, 1, 3328, 6784, 6784, 6784, 3328] + - [0, 11497.0] + - - [5056, 1856, 1, 1280, 5056, 5056, 5056, 1280] + - [1, 11989.0] + - - [1408, 1024, 1, 3328, 1408, 1408, 1408, 3328] + - [7, 11171.0] + - - [7680, 1500, 1, 2560, 7680, 7680, 7680, 2560] + - [1, 12227.0] + - - [5888, 3584, 1, 1280, 5888, 5888, 5888, 1280] + - [1, 12514.0] + - - [1856, 3584, 1, 3328, 1856, 1856, 1856, 3328] + - [1, 12092.0] + - - [1024, 2944, 1, 256, 1024, 1024, 1024, 256] + - [16, 10809.0] + - - [448, 6784, 1, 1280, 448, 448, 448, 1280] + - [23, 10280.0] + - - [704, 5056, 1, 256, 704, 704, 704, 256] + - [16, 10313.0] + - - [3584, 1024, 1, 3328, 3584, 3584, 3584, 3328] + - [1, 11594.0] + - - [2944, 1856, 1, 1280, 2944, 2944, 2944, 1280] + - [16, 11628.0] + - - [5056, 256, 1, 256, 5056, 5056, 5056, 256] + - [9, 10479.0] + - - [2944, 4288, 1, 3328, 2944, 2944, 2944, 3328] + - [8, 12174.0] + - - [2368, 3584, 1, 3328, 2368, 2368, 2368, 3328] + - [1, 12029.0] + - - [2944, 704, 1, 1280, 2944, 2944, 2944, 1280] + - [16, 11192.0] + - - [2944, 3584, 1, 1280, 2944, 2944, 2944, 1280] + - [1, 12273.0] + - - [1856, 5888, 1, 1280, 1856, 1856, 1856, 1280] + - [1, 11942.0] + - - [4608, 24000, 1, 1536, 4608, 4608, 4608, 1536] + - [8, 12250.0] + - - [4288, 1408, 1, 256, 4288, 4288, 4288, 256] + - [8, 11190.0] + - - [5888, 1408, 1, 128, 5888, 5888, 5888, 128] + - [30, 11366.0] + - - [4288, 2368, 1, 1280, 4288, 4288, 4288, 1280] + - [1, 11818.0] + - - [6784, 2368, 1, 256, 6784, 6784, 6784, 256] + - [17, 11822.0] + - - [1024, 24000, 1, 2816, 1024, 1024, 1024, 2816] + - [8, 12524.0] + - - [1856, 2944, 1, 256, 1856, 1856, 1856, 256] + - [0, 11039.0] + - - [5056, 1024, 1, 128, 5056, 5056, 5056, 128] + - [8, 11086.0] + - - [7680, 3000, 1, 2560, 7680, 7680, 7680, 2560] + - [1, 12382.0] + - - [4224, 1500, 1, 176, 4224, 4224, 4224, 176] + - [16, 11100.0] + - - [5124, 700, 1, 2560, 5124, 5124, 5124, 2560] + - [16, 11219.0] + - - [6784, 256, 1, 128, 6784, 6784, 6784, 128] + - [16, 10041.0] + - - [5888, 704, 1, 128, 5888, 5888, 5888, 128] + - [16, 10940.0] + - - [1024, 4288, 1, 1280, 1024, 1024, 1024, 1280] + - [16, 11681.0] + - - [2368, 5056, 1, 3328, 2368, 2368, 2368, 3328] + - [23, 12029.0] + - - [4288, 1024, 1, 3328, 4288, 4288, 4288, 3328] + - [1, 11921.0] + - - [6144, 48000, 1, 2560, 6144, 6144, 6144, 2560] + - [8, 12576.0] + - - [1024, 5056, 1, 3328, 1024, 1024, 1024, 3328] + - [8, 12175.0] + - - [1024, 1856, 1, 3328, 1024, 1024, 1024, 3328] + - [17, 11504.0] + - - [5124, 1500, 1, 2560, 5124, 5124, 5124, 2560] + - [1, 11745.0] + - - [4288, 6784, 1, 256, 4288, 4288, 4288, 256] + - [17, 12094.0] + - - [3584, 2944, 1, 3328, 3584, 3584, 3584, 3328] + - [1, 12344.0] + - - [5888, 2944, 1, 256, 5888, 5888, 5888, 256] + - [17, 12231.0] + - - [448, 4288, 1, 1280, 448, 448, 448, 1280] + - [17, 9996.0] + - - [1024, 4288, 1, 128, 1024, 1024, 1024, 128] + - [7, 10969.0] + - - [5056, 4288, 1, 256, 5056, 5056, 5056, 256] + - [8, 11897.0] + - - [1024, 3584, 1, 256, 1024, 1024, 1024, 256] + - [0, 11124.0] + - - [448, 5888, 1, 1280, 448, 448, 448, 1280] + - [7, 9891.0] + - - [512, 3000, 1, 2048, 512, 512, 512, 2048] + - [9, 10952.0] + - - [5056, 448, 1, 128, 5056, 5056, 5056, 128] + - [30, 8965.0] + - - [4288, 704, 1, 1280, 4288, 4288, 4288, 1280] + - [16, 11298.0] + - - [3584, 2944, 1, 128, 3584, 3584, 3584, 128] + - [8, 11520.0] + - - [6784, 256, 1, 1280, 6784, 6784, 6784, 1280] + - [16, 11105.0] + - - [2368, 5888, 1, 3328, 2368, 2368, 2368, 3328] + - [1, 12200.0] + - - [2368, 1856, 1, 1280, 2368, 2368, 2368, 1280] + - [16, 11266.0] + - - [448, 5056, 1, 3328, 448, 448, 448, 3328] + - [41, 10486.0] + - - [3584, 4288, 1, 128, 3584, 3584, 3584, 128] + - [22, 11566.0] + - - [1024, 6000, 1, 2816, 1024, 1024, 1024, 2816] + - [17, 12245.0] + - - [5888, 4288, 1, 3328, 5888, 5888, 5888, 3328] + - [1, 12399.0] + - - [2368, 704, 1, 256, 2368, 2368, 2368, 256] + - [0, 10147.0] + - - [3584, 1408, 1, 128, 3584, 3584, 3584, 128] + - [7, 11091.0] + - - [1856, 5056, 1, 1280, 1856, 1856, 1856, 1280] + - [1, 11947.0] + - - [2944, 1024, 1, 1280, 2944, 2944, 2944, 1280] + - [30, 11311.0] + - - [3584, 5888, 1, 3328, 3584, 3584, 3584, 3328] + - [23, 12526.0] + - - [2368, 4288, 1, 256, 2368, 2368, 2368, 256] + - [0, 11306.0] + - - [1024, 2368, 1, 3328, 1024, 1024, 1024, 3328] + - [16, 11555.0] + - - [1024, 2944, 1, 128, 1024, 1024, 1024, 128] + - [16, 9676.0] + - - [1024, 3584, 1, 1280, 1024, 1024, 1024, 1280] + - [16, 11419.0] + - - [4288, 5888, 1, 3328, 4288, 4288, 4288, 3328] + - [1, 12356.0] + - - [1024, 2944, 1, 3328, 1024, 1024, 1024, 3328] + - [1, 11378.0] + - - [256, 6784, 1, 1280, 256, 256, 256, 1280] + - [7, 11009.0] + - - [1856, 3584, 1, 256, 1856, 1856, 1856, 256] + - [31, 11227.0] + - - [6784, 1856, 1, 128, 6784, 6784, 6784, 128] + - [30, 11573.0] + - - [1024, 1500, 1, 2048, 1024, 1024, 1024, 2048] + - [8, 10980.0] + - - [512, 24000, 1, 2816, 512, 512, 512, 2816] + - [23, 12408.0] + - - [256, 5888, 1, 1280, 256, 256, 256, 1280] + - [23, 10697.0] + - - [4288, 6784, 1, 3328, 4288, 4288, 4288, 3328] + - [1, 12396.0] + - - [2368, 1408, 1, 128, 2368, 2368, 2368, 128] + - [16, 9861.0] + - - [1408, 1024, 1, 128, 1408, 1408, 1408, 128] + - [22, 9775.0] + - - [6784, 3584, 1, 3328, 6784, 6784, 6784, 3328] + - [1, 12567.0] + - - [1760, 7000, 1, 1760, 1760, 1760, 1760, 1760] + - [1, 12210.0] + - - [2368, 5056, 1, 1280, 2368, 2368, 2368, 1280] + - [17, 12105.0] + - - [1408, 2368, 1, 1280, 1408, 1408, 1408, 1280] + - [22, 11353.0] + - - [704, 4288, 1, 128, 704, 704, 704, 128] + - [0, 9699.0] + - - [2944, 2944, 1, 256, 2944, 2944, 2944, 256] + - [17, 11844.0] + - - [6784, 256, 1, 256, 6784, 6784, 6784, 256] + - [30, 10616.0] + - - [256, 5056, 1, 3328, 256, 256, 256, 3328] + - [13, 11415.0] + - - [5056, 1856, 1, 128, 5056, 5056, 5056, 128] + - [36, 10523.0] + - - [1024, 3000, 1, 1536, 1024, 1024, 1024, 1536] + - [16, 11271.0] + - - [5056, 1024, 1, 3328, 5056, 5056, 5056, 3328] + - [1, 12270.0] + - - [4288, 3584, 1, 3328, 4288, 4288, 4288, 3328] + - [1, 12341.0] + - - [1024, 2368, 1, 1280, 1024, 1024, 1024, 1280] + - [7, 11387.0] + - - [5888, 6784, 1, 3328, 5888, 5888, 5888, 3328] + - [1, 12598.0] + - - [704, 4288, 1, 1280, 704, 704, 704, 1280] + - [16, 10279.0] + - - [128, 50176, 1, 512, 128, 128, 128, 512] + - [26, 11738.0] + - - [1024, 48000, 1, 2048, 1024, 1024, 1024, 2048] + - [3, 12584.0] + - - [4288, 1024, 1, 128, 4288, 4288, 4288, 128] + - [36, 10347.0] + - - [784, 128, 128, 512, 784, 784, 784, 512] + - [3, 10568.0] + - - [784, 512, 256, 128, 784, 784, 784, 128] + - [3, 10484.0] + - - [3136, 256, 256, 64, 3136, 3136, 3136, 64] + - [15, 7677.0] + - - [784, 512, 128, 128, 784, 784, 784, 128] + - [3, 10364.0] + - - [784, 128, 256, 512, 784, 784, 784, 512] + - [3, 10743.0] + - - [3136, 256, 128, 64, 3136, 3136, 3136, 64] + - [0, 8629.0] + - - [4096, 512, 1, 1024, 4096, 4096, 4096, 1024] + - [16, 11353.0] + - - [2048, 768, 1, 512, 2048, 2048, 2048, 512] + - [3, 10833.0] + - - [4096, 512, 1, 2048, 4096, 4096, 4096, 2048] + - [7, 11433.0] + - - [4096, 1024, 1, 2048, 4096, 4096, 4096, 2048] + - [1, 12186.0] + - - [2048, 1024, 1, 2048, 2048, 2048, 2048, 2048] + - [16, 11393.0] + - - [2048, 1024, 1, 4096, 2048, 2048, 2048, 4096] + - [22, 11409.0] + - - [4096, 1024, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12005.0] + - - [2048, 1024, 1, 512, 2048, 2048, 2048, 512] + - [0, 11187.0] + - - [4096, 1024, 1, 4096, 4096, 4096, 4096, 4096] + - [8, 12083.0] + - - [2048, 1024, 1, 1024, 2048, 2048, 2048, 1024] + - [16, 11250.0] + - - [4096, 384, 1, 2048, 4096, 4096, 4096, 2048] + - [2, 11110.0] + - - [1225, 192, 64, 384, 1225, 1225, 1225, 384] + - [16, 11259.0] + - - [289, 128, 64, 1024, 289, 289, 289, 1024] + - [26, 8493.0] + - - [4096, 384, 1, 1536, 4096, 4096, 4096, 1536] + - [1, 11286.0] + - - [289, 192, 64, 1024, 289, 289, 289, 1024] + - [39, 8356.0] + - - [4096, 384, 1, 1280, 4096, 4096, 4096, 1280] + - [1, 11241.0] + - - [4096, 448, 1, 1280, 4096, 4096, 4096, 1280] + - [22, 10804.0] + - - [289, 256, 64, 1024, 289, 289, 289, 1024] + - [11, 8770.0] + - - [4096, 448, 1, 2048, 4096, 4096, 4096, 2048] + - [7, 10862.0] + - - [289, 384, 64, 1024, 289, 289, 289, 1024] + - [26, 9121.0] + - - [1024, 3594, 1, 4096, 1024, 1024, 1024, 4096] + - [8, 11561.0] + - - [4096, 3103, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12058.0] + - - [4096, 3136, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12241.0] + - - [1024, 3141, 1, 4096, 1024, 1024, 1024, 4096] + - [3, 11926.0] + - - [4096, 3559, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12323.0] + - - [4096, 3368, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12010.0] + - - [1024, 3335, 1, 4096, 1024, 1024, 1024, 4096] + - [23, 11659.0] + - - [1024, 3510, 1, 4096, 1024, 1024, 1024, 4096] + - [22, 11713.0] + - - [4096, 3209, 1, 1024, 4096, 4096, 4096, 1024] + - [8, 11892.0] + - - [4096, 3322, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12416.0] + - - [1024, 3400, 1, 4096, 1024, 1024, 1024, 4096] + - [23, 11856.0] + - - [1024, 3995, 1, 4096, 1024, 1024, 1024, 4096] + - [26, 11738.0] + - - [1024, 3503, 1, 4096, 1024, 1024, 1024, 4096] + - [22, 11673.0] + - - [4096, 3594, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 11929.0] + - - [4096, 3473, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12078.0] + - - [4096, 3522, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12222.0] + - - [1024, 3103, 1, 4096, 1024, 1024, 1024, 4096] + - [8, 11754.0] + - - [1024, 3214, 1, 4096, 1024, 1024, 1024, 4096] + - [26, 10655.0] + - - [4096, 3449, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12221.0] + - - [1024, 3136, 1, 4096, 1024, 1024, 1024, 4096] + - [8, 11906.0] + - - [1024, 3955, 1, 33708, 1024, 1024, 1024, 33708] + - [1, 11680.0] + - - [1024, 3780, 1, 4096, 1024, 1024, 1024, 4096] + - [28, 12037.0] + - - [1024, 3906, 1, 33708, 1024, 1024, 1024, 33708] + - [31, 11544.0] + - - [1024, 3386, 1, 4096, 1024, 1024, 1024, 4096] + - [23, 11824.0] + - - [4096, 3396, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12050.0] + - - [1024, 3183, 1, 4096, 1024, 1024, 1024, 4096] + - [3, 12029.0] + - - [1024, 3098, 1, 4096, 1024, 1024, 1024, 4096] + - [20, 11000.0] + - - [1024, 3548, 1, 4096, 1024, 1024, 1024, 4096] + - [26, 10505.0] + - - [1024, 3224, 1, 4096, 1024, 1024, 1024, 4096] + - [40, 10747.0] + - - [4096, 3469, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12046.0] + - - [1024, 3582, 1, 4096, 1024, 1024, 1024, 4096] + - [23, 11540.0] + - - [1024, 2977, 1, 4096, 1024, 1024, 1024, 4096] + - [5, 11399.0] + - - [1024, 3939, 1, 1024, 1024, 1024, 1024, 1024] + - [22, 11246.0] + - - [4096, 3176, 1, 1024, 4096, 4096, 4096, 1024] + - [8, 12349.0] + - - [1024, 3559, 1, 4096, 1024, 1024, 1024, 4096] + - [8, 11454.0] + - - [1024, 3478, 1, 4096, 1024, 1024, 1024, 4096] + - [22, 11617.0] + - - [4096, 3343, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 11895.0] + - - [4096, 3440, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12256.0] + - - [1024, 3996, 1, 33708, 1024, 1024, 1024, 33708] + - [17, 11794.0] + - - [1024, 4012, 1, 4096, 1024, 1024, 1024, 4096] + - [23, 11810.0] + - - [1024, 3322, 1, 4096, 1024, 1024, 1024, 4096] + - [22, 11631.0] + - - [1024, 3990, 1, 33708, 1024, 1024, 1024, 33708] + - [1, 11783.0] + - - [1024, 3314, 1, 4096, 1024, 1024, 1024, 4096] + - [23, 11619.0] + - - [4096, 3513, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12149.0] + - - [1024, 3562, 1, 4096, 1024, 1024, 1024, 4096] + - [8, 11479.0] + - - [1024, 3443, 1, 4096, 1024, 1024, 1024, 4096] + - [23, 12014.0] + - - [1024, 3554, 1, 4096, 1024, 1024, 1024, 4096] + - [8, 11457.0] + - - [1024, 3063, 1, 4096, 1024, 1024, 1024, 4096] + - [5, 11682.0] + - - [4096, 3460, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12020.0] + - - [1024, 3209, 1, 4096, 1024, 1024, 1024, 4096] + - [3, 11108.0] + - - [1024, 3147, 1, 4096, 1024, 1024, 1024, 4096] + - [40, 11168.0] + - - [4096, 3387, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12057.0] + - - [4096, 3436, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12228.0] + - - [1024, 3341, 1, 4096, 1024, 1024, 1024, 4096] + - [23, 11647.0] + - - [1024, 3516, 1, 4096, 1024, 1024, 1024, 4096] + - [26, 10524.0] + - - [4096, 3277, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12184.0] + - - [1024, 3454, 1, 4096, 1024, 1024, 1024, 4096] + - [23, 12045.0] + - - [1024, 3969, 1, 4096, 1024, 1024, 1024, 4096] + - [26, 11671.0] + - - [1024, 3999, 1, 4096, 1024, 1024, 1024, 4096] + - [26, 11773.0] + - - [1024, 4032, 1, 4096, 1024, 1024, 1024, 4096] + - [40, 11851.0] + - - [4096, 3541, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12254.0] + - - [4096, 3334, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 11845.0] + - - [1024, 3365, 1, 4096, 1024, 1024, 1024, 4096] + - [8, 11724.0] + - - [1024, 3527, 1, 4096, 1024, 1024, 1024, 4096] + - [26, 10511.0] + - - [1024, 3190, 1, 4096, 1024, 1024, 1024, 4096] + - [40, 11414.0] + - - [4096, 3906, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12189.0] + - - [1024, 3593, 1, 4096, 1024, 1024, 1024, 4096] + - [8, 11544.0] + - - [1024, 3336, 1, 4096, 1024, 1024, 1024, 4096] + - [8, 11641.0] + - - [4096, 3504, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12157.0] + - - [4096, 3977, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 11988.0] + - - [1024, 3906, 1, 4096, 1024, 1024, 1024, 4096] + - [23, 11542.0] + - - [4096, 3415, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12127.0] + - - [1024, 3295, 1, 4096, 1024, 1024, 1024, 4096] + - [7, 11517.0] + - - [4096, 3321, 1, 1024, 4096, 4096, 4096, 1024] + - [3, 12285.0] + - - [1024, 3072, 1, 4096, 1024, 1024, 1024, 4096] + - [5, 11772.0] + - - [1024, 3408, 1, 4096, 1024, 1024, 1024, 4096] + - [23, 11895.0] + - - [1024, 3522, 1, 4096, 1024, 1024, 1024, 4096] + - [20, 10552.0] + - - [4096, 3751, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12177.0] + - - [4096, 3378, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12028.0] + - - [1024, 3925, 1, 33708, 1024, 1024, 1024, 33708] + - [1, 11607.0] + - - [1024, 3990, 1, 1024, 1024, 1024, 1024, 1024] + - [23, 11521.0] + - - [1024, 3290, 1, 4096, 1024, 1024, 1024, 4096] + - [7, 11525.0] + - - [4096, 3500, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12112.0] + - - [4096, 3565, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12387.0] + - - [1024, 3484, 1, 4096, 1024, 1024, 1024, 4096] + - [22, 11622.0] + - - [4096, 3395, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12068.0] + - - [1024, 3681, 1, 1024, 1024, 1024, 1024, 1024] + - [37, 11321.0] + - - [1024, 3584, 1, 1024, 1024, 1024, 1024, 1024] + - [22, 11333.0] + - - [4096, 3093, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12083.0] + - - [1024, 4050, 1, 1024, 1024, 1024, 1024, 1024] + - [8, 11773.0] + - - [1024, 3301, 1, 4096, 1024, 1024, 1024, 4096] + - [8, 11557.0] + - - [1024, 3581, 1, 4096, 1024, 1024, 1024, 4096] + - [8, 11513.0] + - - [4096, 3374, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 11985.0] + - - [1024, 3449, 1, 4096, 1024, 1024, 1024, 4096] + - [23, 11968.0] + - - [4096, 3215, 1, 1024, 4096, 4096, 4096, 1024] + - [8, 11928.0] + - - [4096, 3312, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12321.0] + - - [4096, 3479, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12059.0] + - - [4096, 3544, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12294.0] + - - [1024, 3263, 1, 4096, 1024, 1024, 1024, 4096] + - [7, 11447.0] + - - [4096, 3455, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12231.0] + - - [1024, 3379, 1, 4096, 1024, 1024, 1024, 4096] + - [23, 11746.0] + - - [1024, 3490, 1, 4096, 1024, 1024, 1024, 4096] + - [26, 10460.0] + - - [1024, 3368, 1, 4096, 1024, 1024, 1024, 4096] + - [26, 10823.0] + - - [4096, 3186, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12354.0] + - - [1024, 3428, 1, 4096, 1024, 1024, 1024, 4096] + - [8, 11907.0] + - - [4096, 3561, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12316.0] + - - [4096, 3418, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12190.0] + - - [1024, 3064, 1, 4096, 1024, 1024, 1024, 4096] + - [5, 11638.0] + - - [4096, 3259, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12116.0] + - - [4096, 3308, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12296.0] + - - [1024, 3533, 1, 4096, 1024, 1024, 1024, 4096] + - [23, 11388.0] + - - [1024, 3344, 1, 4096, 1024, 1024, 1024, 4096] + - [8, 11646.0] + - - [1024, 4030, 1, 1024, 1024, 1024, 1024, 1024] + - [8, 11594.0] + - - [4096, 3459, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12033.0] + - - [1024, 3572, 1, 4096, 1024, 1024, 1024, 4096] + - [8, 11488.0] + - - [1024, 3925, 1, 1024, 1024, 1024, 1024, 1024] + - [22, 11432.0] + - - [4096, 3435, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12202.0] + - - [1024, 3956, 1, 4096, 1024, 1024, 1024, 4096] + - [23, 11672.0] + - - [1024, 3463, 1, 4096, 1024, 1024, 1024, 4096] + - [22, 11582.0] + - - [4096, 3182, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12340.0] + - - [4096, 3976, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 11981.0] + - - [1024, 3417, 1, 4096, 1024, 1024, 1024, 4096] + - [23, 11904.0] + - - [1024, 3528, 1, 4096, 1024, 1024, 1024, 4096] + - [26, 10510.0] + - - [4096, 3446, 1, 1024, 4096, 4096, 4096, 1024] + - [8, 12220.0] + - - [1024, 3543, 1, 4096, 1024, 1024, 1024, 4096] + - [8, 11409.0] + - - [4096, 3287, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12193.0] + - - [1024, 3499, 1, 4096, 1024, 1024, 1024, 4096] + - [22, 11683.0] + - - [1024, 3231, 1, 4096, 1024, 1024, 1024, 4096] + - [8, 11332.0] + - - [4096, 3519, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12166.0] + - - [4096, 3552, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12336.0] + - - [1024, 3458, 1, 4096, 1024, 1024, 1024, 4096] + - [22, 11525.0] + - - [1024, 3374, 1, 4096, 1024, 1024, 1024, 4096] + - [8, 11717.0] + - - [1024, 3396, 1, 4096, 1024, 1024, 1024, 4096] + - [26, 10998.0] + - - [1024, 2967, 1, 4096, 1024, 1024, 1024, 4096] + - [40, 11013.0] + - - [4096, 3482, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12069.0] + - - [1024, 3226, 1, 4096, 1024, 1024, 1024, 4096] + - [8, 11320.0] + - - [4096, 3377, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12027.0] + - - [4096, 3426, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12193.0] + - - [4096, 2935, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12347.0] + - - [1024, 3439, 1, 4096, 1024, 1024, 1024, 4096] + - [23, 12011.0] + - - [4096, 3267, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12145.0] + - - [4096, 3499, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12129.0] + - - [4096, 3356, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 11935.0] + - - [4096, 3939, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12277.0] + - - [1024, 3526, 1, 4096, 1024, 1024, 1024, 4096] + - [8, 11361.0] + - - [1024, 3859, 1, 33708, 1024, 1024, 1024, 33708] + - [26, 11465.0] + - - [1024, 3385, 1, 4096, 1024, 1024, 1024, 4096] + - [8, 11771.0] + - - [1024, 3496, 1, 4096, 1024, 1024, 1024, 4096] + - [26, 10444.0] + - - [4096, 3141, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12202.0] + - - [4096, 3510, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12216.0] + - - [1024, 3434, 1, 4096, 1024, 1024, 1024, 4096] + - [23, 11977.0] + - - [4096, 3969, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 11940.0] + - - [1024, 3121, 1, 4096, 1024, 1024, 1024, 4096] + - [3, 11753.0] + - - [1024, 3232, 1, 4096, 1024, 1024, 1024, 4096] + - [40, 10732.0] + - - [1024, 4030, 1, 33708, 1024, 1024, 1024, 33708] + - [1, 11898.0] + - - [1024, 3780, 1, 33708, 1024, 1024, 1024, 33708] + - [5, 12119.0] + - - [1024, 3969, 1, 1024, 1024, 1024, 1024, 1024] + - [23, 11392.0] + - - [4096, 3527, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12273.0] + - - [4096, 3336, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 11919.0] + - - [4096, 3290, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12273.0] + - - [1024, 3469, 1, 4096, 1024, 1024, 1024, 4096] + - [7, 11546.0] + - - [4096, 3490, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12121.0] + - - [4096, 3064, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12265.0] + - - [4096, 3582, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12398.0] + - - [1024, 3956, 1, 1024, 1024, 1024, 1024, 1024] + - [22, 11504.0] + - - [4096, 3417, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12157.0] + - - [1024, 2736, 1, 4096, 1024, 1024, 1024, 4096] + - [3, 11460.0] + - - [1024, 3205, 1, 4096, 1024, 1024, 1024, 4096] + - [26, 10660.0] + - - [1024, 3143, 1, 4096, 1024, 1024, 1024, 4096] + - [20, 11191.0] + - - [1024, 4020, 1, 4096, 1024, 1024, 1024, 4096] + - [40, 11824.0] + - - [1024, 3318, 1, 4096, 1024, 1024, 1024, 4096] + - [23, 11614.0] + - - [4096, 3364, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 11939.0] + - - [1024, 3353, 1, 4096, 1024, 1024, 1024, 4096] + - [8, 11670.0] + - - [1024, 3464, 1, 4096, 1024, 1024, 1024, 4096] + - [26, 10481.0] + - - [4096, 3205, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 11927.0] + - - [4096, 3318, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12328.0] + - - [1024, 3402, 1, 4096, 1024, 1024, 1024, 4096] + - [8, 11828.0] + - - [4096, 3181, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12385.0] + - - [4096, 3550, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12326.0] + - - [4096, 3445, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12253.0] + - - [1024, 3138, 1, 4096, 1024, 1024, 1024, 4096] + - [3, 11851.0] + - - [4096, 3079, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 11996.0] + - - [4096, 3144, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12245.0] + - - [4096, 3860, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12085.0] + - - [1024, 3515, 1, 4096, 1024, 1024, 1024, 4096] + - [7, 11679.0] + - - [4096, 3408, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12113.0] + - - [1024, 3181, 1, 4096, 1024, 1024, 1024, 4096] + - [3, 12012.0] + - - [4096, 3298, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12247.0] + - - [4096, 3585, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 11943.0] + - - [1024, 3550, 1, 4096, 1024, 1024, 1024, 4096] + - [8, 11423.0] + - - [1024, 4020, 1, 1024, 1024, 1024, 1024, 1024] + - [26, 11558.0] + - - [4096, 3481, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12062.0] + - - [4096, 3530, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12273.0] + - - [4096, 3425, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12191.0] + - - [4096, 4026, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12121.0] + - - [1024, 3860, 1, 1024, 1024, 1024, 1024, 1024] + - [36, 11261.0] + - - [4096, 3975, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 11952.0] + - - [1024, 3286, 1, 4096, 1024, 1024, 1024, 4096] + - [7, 11494.0] + - - [1024, 3176, 1, 4096, 1024, 1024, 1024, 4096] + - [40, 11334.0] + - - [1024, 3894, 1, 4096, 1024, 1024, 1024, 4096] + - [23, 11533.0] + - - [4096, 3355, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 11917.0] + - - [4096, 3404, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12129.0] + - - [1024, 3501, 1, 4096, 1024, 1024, 1024, 4096] + - [22, 11653.0] + - - [4096, 3245, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12033.0] + - - [1024, 3431, 1, 4096, 1024, 1024, 1024, 4096] + - [23, 11942.0] + - - [1024, 4000, 1, 1024, 1024, 1024, 1024, 1024] + - [26, 11333.0] + - - [4096, 3509, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12161.0] + - - [4096, 3558, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12353.0] + - - [1024, 3535, 1, 4096, 1024, 1024, 1024, 4096] + - [8, 11381.0] + - - [1024, 3414, 1, 4096, 1024, 1024, 1024, 4096] + - [23, 11917.0] + - - [1024, 3445, 1, 4096, 1024, 1024, 1024, 4096] + - [26, 10942.0] + - - [1024, 3436, 1, 4096, 1024, 1024, 1024, 4096] + - [40, 10980.0] + - - [4096, 3472, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12023.0] + - - [1024, 3211, 1, 4096, 1024, 1024, 1024, 4096] + - [8, 11297.0] + - - [4096, 3383, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12019.0] + - - [4096, 3448, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12295.0] + - - [1024, 3343, 1, 4096, 1024, 1024, 1024, 4096] + - [23, 11683.0] + - - [1024, 3518, 1, 4096, 1024, 1024, 1024, 4096] + - [20, 10537.0] + - - [4096, 3289, 1, 1024, 4096, 4096, 4096, 1024] + - [8, 12213.0] + - - [1024, 3440, 1, 4096, 1024, 1024, 1024, 4096] + - [8, 11961.0] + - - [1024, 4032, 1, 33708, 1024, 1024, 1024, 33708] + - [1, 11905.0] + - - [4096, 3489, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12123.0] + - - [4096, 3346, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 11944.0] + - - [1024, 3534, 1, 4096, 1024, 1024, 1024, 4096] + - [8, 11384.0] + - - [1024, 3079, 1, 4096, 1024, 1024, 1024, 4096] + - [3, 11632.0] + - - [1024, 3955, 1, 4096, 1024, 1024, 1024, 4096] + - [37, 11648.0] + - - [4096, 3236, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12010.0] + - - [1024, 3545, 1, 4096, 1024, 1024, 1024, 4096] + - [8, 11409.0] + - - [1024, 3144, 1, 4096, 1024, 1024, 1024, 4096] + - [3, 11889.0] + - - [4096, 3780, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12275.0] + - - [4096, 3163, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12297.0] + - - [4096, 3468, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12049.0] + - - [1024, 3539, 1, 4096, 1024, 1024, 1024, 4096] + - [8, 11406.0] + - - [1024, 3541, 1, 4096, 1024, 1024, 1024, 4096] + - [8, 11424.0] + - - [4096, 3363, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 11964.0] + - - [1024, 3475, 1, 4096, 1024, 1024, 1024, 4096] + - [7, 11582.0] + - - [4096, 3110, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12116.0] + - - [1024, 3509, 1, 4096, 1024, 1024, 1024, 4096] + - [22, 11700.0] + - - [1024, 3413, 1, 4096, 1024, 1024, 1024, 4096] + - [8, 11906.0] + - - [1024, 3975, 1, 1024, 1024, 1024, 1024, 1024] + - [26, 11288.0] + - - [4096, 3549, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12316.0] + - - [4096, 3342, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 11911.0] + - - [1024, 2985, 1, 4096, 1024, 1024, 1024, 4096] + - [22, 11523.0] + - - [1024, 3876, 1, 33708, 1024, 1024, 1024, 33708] + - [8, 11457.0] + - - [4096, 3280, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12190.0] + - - [4096, 3191, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12461.0] + - - [4096, 3512, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12242.0] + - - [1024, 3560, 1, 4096, 1024, 1024, 1024, 4096] + - [8, 11468.0] + - - [4096, 2499, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12129.0] + - - [1024, 3248, 1, 4096, 1024, 1024, 1024, 4096] + - [8, 11384.0] + - - [4096, 3423, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12176.0] + - - [4096, 3297, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12278.0] + - - [4096, 3154, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12333.0] + - - [1024, 3303, 1, 4096, 1024, 1024, 1024, 4096] + - [23, 11577.0] + - - [1024, 3222, 1, 4096, 1024, 1024, 1024, 4096] + - [26, 10756.0] + - - [1024, 3978, 1, 1024, 1024, 1024, 1024, 1024] + - [3, 11310.0] + - - [4096, 3529, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12202.0] + - - [4096, 3386, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12063.0] + - - [1024, 3451, 1, 4096, 1024, 1024, 1024, 4096] + - [23, 12005.0] + - - [4096, 3562, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12336.0] + - - [4096, 3276, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12175.0] + - - [1024, 3894, 1, 33708, 1024, 1024, 1024, 33708] + - [1, 11513.0] + - - [4096, 3540, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12282.0] + - - [1024, 3416, 1, 4096, 1024, 1024, 1024, 4096] + - [23, 11911.0] + - - [1024, 4005, 1, 33708, 1024, 1024, 1024, 33708] + - [1, 11831.0] + - - [1024, 3942, 1, 4096, 1024, 1024, 1024, 4096] + - [23, 11613.0] + - - [4096, 3403, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12070.0] + - - [4096, 3381, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12046.0] + - - [1024, 3492, 1, 4096, 1024, 1024, 1024, 4096] + - [22, 11644.0] + - - [4096, 3101, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12073.0] + - - [1024, 3430, 1, 4096, 1024, 1024, 1024, 4096] + - [8, 11921.0] + - - [1024, 3977, 1, 4096, 1024, 1024, 1024, 4096] + - [26, 11695.0] + - - [1024, 3640, 1, 4096, 1024, 1024, 1024, 4096] + - [28, 11625.0] + - - [4096, 3557, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12291.0] + - - [4096, 3414, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12162.0] + - - [1024, 3391, 1, 4096, 1024, 1024, 1024, 4096] + - [8, 11793.0] + - - [1024, 3356, 1, 4096, 1024, 1024, 1024, 4096] + - [26, 10859.0] + - - [4096, 3320, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12318.0] + - - [4096, 2765, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 11993.0] + - - [1024, 3411, 1, 4096, 1024, 1024, 1024, 4096] + - [23, 11855.0] + - - [1024, 3978, 1, 4096, 1024, 1024, 1024, 4096] + - [23, 11829.0] + - - [4096, 3487, 1, 1024, 4096, 4096, 4096, 1024] + - [8, 12058.0] + - - [4096, 3520, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12233.0] + - - [4096, 3942, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12294.0] + - - [4096, 3431, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12173.0] + - - [1024, 3271, 1, 4096, 1024, 1024, 1024, 4096] + - [8, 11466.0] + - - [4096, 4020, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12089.0] + - - [1024, 3481, 1, 4096, 1024, 1024, 1024, 4096] + - [22, 11571.0] + - - [1024, 3419, 1, 4096, 1024, 1024, 1024, 4096] + - [23, 11926.0] + - - [1024, 4059, 1, 4096, 1024, 1024, 1024, 4096] + - [40, 11929.0] + - - [4096, 3345, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 11889.0] + - - [4096, 3394, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12086.0] + - - [1024, 3298, 1, 4096, 1024, 1024, 1024, 4096] + - [7, 11502.0] + - - [4096, 3235, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12031.0] + - - [1024, 3681, 1, 33708, 1024, 1024, 1024, 33708] + - [5, 11802.0] + - - [1024, 3362, 1, 4096, 1024, 1024, 1024, 4096] + - [23, 11706.0] + - - [4096, 3467, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 11983.0] + - - [1024, 3349, 1, 4096, 1024, 1024, 1024, 4096] + - [8, 11667.0] + - - [1024, 3460, 1, 4096, 1024, 1024, 1024, 4096] + - [26, 10394.0] + - - [4096, 3214, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 11979.0] + - - [1024, 3398, 1, 4096, 1024, 1024, 1024, 4096] + - [23, 11868.0] + - - [4096, 3478, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12042.0] + - - [1024, 4050, 1, 33708, 1024, 1024, 1024, 33708] + - [1, 11943.0] + - - [1024, 3244, 1, 4096, 1024, 1024, 1024, 4096] + - [7, 11363.0] + - - [4096, 3341, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 11836.0] + - - [4096, 3454, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12229.0] + - - [1024, 3166, 1, 4096, 1024, 1024, 1024, 4096] + - [3, 11946.0] + - - [1024, 3425, 1, 4096, 1024, 1024, 1024, 4096] + - [26, 11020.0] + - - [4096, 3295, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12186.0] + - - [4096, 3072, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12334.0] + - - [4096, 3822, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12360.0] + - - [1024, 3681, 1, 4096, 1024, 1024, 1024, 4096] + - [28, 11743.0] + - - [1024, 4050, 1, 4096, 1024, 1024, 1024, 4096] + - [40, 11902.0] + - - [4096, 3495, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12110.0] + - - [4096, 3560, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12299.0] + - - [1024, 3524, 1, 4096, 1024, 1024, 1024, 4096] + - [8, 11372.0] + - - [1024, 3942, 1, 33708, 1024, 1024, 1024, 33708] + - [40, 11707.0] + - - [1024, 3304, 1, 4096, 1024, 1024, 1024, 4096] + - [8, 11507.0] + - - [1024, 3387, 1, 4096, 1024, 1024, 1024, 4096] + - [26, 10943.0] + - - [1024, 3498, 1, 4096, 1024, 1024, 1024, 4096] + - [11, 10515.0] + - - [4096, 3458, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 11988.0] + - - [4096, 2967, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 11919.0] + - - [4096, 3385, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12048.0] + - - [4096, 3434, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12235.0] + - - [1024, 3519, 1, 4096, 1024, 1024, 1024, 4096] + - [22, 11702.0] + - - [1024, 3511, 1, 4096, 1024, 1024, 1024, 4096] + - [22, 11735.0] + - - [1024, 3288, 1, 4096, 1024, 1024, 1024, 4096] + - [8, 11520.0] + - - [1024, 2918, 1, 4096, 1024, 1024, 1024, 4096] + - [28, 10947.0] + - - [4096, 3573, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12357.0] + - - [1024, 3822, 1, 33708, 1024, 1024, 1024, 33708] + - [5, 12248.0] + - - [4096, 3539, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12308.0] + - - [4096, 3332, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 11890.0] + - - [4096, 3286, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12260.0] + - - [1024, 4026, 1, 4096, 1024, 1024, 1024, 4096] + - [26, 11842.0] + - - [1024, 3277, 1, 4096, 1024, 1024, 1024, 4096] + - [3, 11338.0] + - - [1024, 3471, 1, 4096, 1024, 1024, 1024, 4096] + - [11, 10404.0] + - - [4096, 3518, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12188.0] + - - [1024, 3393, 1, 4096, 1024, 1024, 1024, 4096] + - [8, 11821.0] + - - [4096, 3413, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12132.0] + - - [4096, 3303, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12266.0] + - - [1024, 3207, 1, 4096, 1024, 1024, 1024, 4096] + - [7, 11259.0] + - - [1024, 3894, 1, 1024, 1024, 1024, 1024, 1024] + - [36, 11205.0] + - - [1024, 3977, 1, 1024, 1024, 1024, 1024, 1024] + - [3, 11251.0] + - - [4096, 3535, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12246.0] + - - [4096, 3376, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12012.0] + - - [1024, 3355, 1, 4096, 1024, 1024, 1024, 4096] + - [8, 11691.0] + - - [1024, 3466, 1, 4096, 1024, 1024, 1024, 4096] + - [26, 10478.0] + - - [4096, 3266, 1, 1024, 4096, 4096, 4096, 1024] + - [8, 12102.0] + - - [1024, 3404, 1, 4096, 1024, 1024, 1024, 4096] + - [23, 11857.0] + - - [1024, 3999, 1, 1024, 1024, 1024, 1024, 1024] + - [26, 11277.0] + - - [4096, 3498, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12124.0] + - - [1024, 4032, 1, 1024, 1024, 1024, 1024, 1024] + - [8, 11556.0] + - - [1024, 3410, 1, 4096, 1024, 1024, 1024, 4096] + - [23, 11931.0] + - - [4096, 3393, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12088.0] + - - [1024, 3140, 1, 4096, 1024, 1024, 1024, 4096] + - [3, 11802.0] + - - [1024, 3910, 1, 33708, 1024, 1024, 1024, 33708] + - [17, 11551.0] + - - [1024, 3334, 1, 4096, 1024, 1024, 1024, 4096] + - [8, 11627.0] + - - [4096, 3140, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12202.0] + - - [1024, 4005, 1, 4096, 1024, 1024, 1024, 4096] + - [26, 11780.0] + - - [1024, 3579, 1, 4096, 1024, 1024, 1024, 4096] + - [8, 11515.0] + - - [4096, 3372, 1, 1024, 4096, 4096, 4096, 1024] + - [8, 11973.0] + - - [1024, 3245, 1, 4096, 1024, 1024, 1024, 4096] + - [8, 11353.0] + - - [4096, 3956, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12432.0] + - - [4096, 3213, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12041.0] + - - [1024, 3361, 1, 4096, 1024, 1024, 1024, 4096] + - [8, 11704.0] + - - [1024, 3536, 1, 4096, 1024, 1024, 1024, 4096] + - [26, 10444.0] + - - [4096, 3477, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12015.0] + - - [4096, 3526, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12221.0] + - - [1024, 4005, 1, 1024, 1024, 1024, 1024, 1024] + - [8, 11504.0] + - - [1024, 3530, 1, 4096, 1024, 1024, 1024, 4096] + - [23, 11388.0] + - - [1024, 3944, 1, 4096, 1024, 1024, 1024, 4096] + - [23, 11654.0] + - - [4096, 3453, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12271.0] + - - [4096, 3184, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12430.0] + - - [4096, 3579, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12443.0] + - - [4096, 3351, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 11975.0] + - - [4096, 3416, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12181.0] + - - [1024, 3822, 1, 4096, 1024, 1024, 1024, 4096] + - [28, 12183.0] + - - [1024, 3796, 1, 4096, 1024, 1024, 1024, 4096] + - [41, 12124.0] + - - [4096, 3257, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12096.0] + - - [4096, 3306, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12318.0] + - - [1024, 3505, 1, 4096, 1024, 1024, 1024, 4096] + - [22, 11685.0] + - - [1024, 3315, 1, 4096, 1024, 1024, 1024, 4096] + - [8, 11543.0] + - - [1024, 3486, 1, 4096, 1024, 1024, 1024, 4096] + - [20, 10437.0] + - - [4096, 3457, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 11994.0] + - - [4096, 3870, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12117.0] + - - [1024, 3447, 1, 4096, 1024, 1024, 1024, 4096] + - [23, 12005.0] + - - [1024, 3558, 1, 4096, 1024, 1024, 1024, 4096] + - [8, 11449.0] + - - [4096, 3433, 1, 1024, 4096, 4096, 4096, 1024] + - [8, 12172.0] + - - [4096, 3180, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12398.0] + - - [1024, 3213, 1, 4096, 1024, 1024, 1024, 4096] + - [3, 11135.0] + - - [1024, 3900, 1, 4096, 1024, 1024, 1024, 4096] + - [23, 11545.0] + - - [4096, 3444, 1, 1024, 4096, 4096, 4096, 1024] + - [3, 12188.0] + - - [1024, 3504, 1, 4096, 1024, 1024, 1024, 4096] + - [22, 11654.0] + - - [4096, 4059, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12181.0] + - - [1024, 3442, 1, 4096, 1024, 1024, 1024, 4096] + - [23, 12025.0] + - - [4096, 3517, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12204.0] + - - [1024, 3566, 1, 4096, 1024, 1024, 1024, 4096] + - [8, 11506.0] + - - [4096, 3248, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12082.0] + - - [1024, 3547, 1, 4096, 1024, 1024, 1024, 4096] + - [8, 11420.0] + - - [1024, 3340, 1, 4096, 1024, 1024, 1024, 4096] + - [23, 11658.0] + - - [4096, 3480, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12077.0] + - - [4096, 3424, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12199.0] + - - [1024, 3906, 1, 1024, 1024, 1024, 1024, 1024] + - [7, 11383.0] + - - [4096, 3265, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12161.0] + - - [1024, 3384, 1, 4096, 1024, 1024, 1024, 4096] + - [23, 11789.0] + - - [1024, 3494, 1, 4096, 1024, 1024, 1024, 4096] + - [11, 10499.0] + - - [1024, 3236, 1, 4096, 1024, 1024, 1024, 4096] + - [26, 10726.0] + - - [4096, 3497, 1, 1024, 4096, 4096, 4096, 1024] + - [8, 12098.0] + - - [4096, 3354, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 11942.0] + - - [4096, 3055, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12231.0] + - - [4096, 3244, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12097.0] + - - [4096, 3139, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12258.0] + - - [4096, 3508, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12185.0] + - - [4096, 4050, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12186.0] + - - [1024, 3472, 1, 4096, 1024, 1024, 1024, 4096] + - [22, 11589.0] + - - [1024, 3861, 1, 1024, 1024, 1024, 1024, 1024] + - [22, 11008.0] + - - [1024, 3910, 1, 1024, 1024, 1024, 1024, 1024] + - [22, 11136.0] + - - [4096, 3371, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 11956.0] + - - [1024, 3751, 1, 4096, 1024, 1024, 1024, 4096] + - [28, 11960.0] + - - [4096, 3325, 1, 1024, 4096, 4096, 4096, 1024] + - [3, 12296.0] + - - [1024, 3321, 1, 4096, 1024, 1024, 1024, 4096] + - [8, 11566.0] + - - [1024, 3944, 1, 1024, 1024, 1024, 1024, 1024] + - [7, 11488.0] + - - [4096, 3525, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12234.0] + - - [4096, 3382, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12031.0] + - - [1024, 3453, 1, 4096, 1024, 1024, 1024, 4096] + - [23, 12018.0] + - - [4096, 3564, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12372.0] + - - [4096, 3288, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12281.0] + - - [1024, 3925, 1, 4096, 1024, 1024, 1024, 4096] + - [23, 11615.0] + - - [1024, 3057, 1, 4096, 1024, 1024, 1024, 4096] + - [8, 11619.0] + - - [4096, 3488, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12073.0] + - - [4096, 3046, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12196.0] + - - [1024, 3189, 1, 4096, 1024, 1024, 1024, 4096] + - [8, 12094.0] + - - [4096, 3399, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12082.0] + - - [1024, 3383, 1, 4096, 1024, 1024, 1024, 4096] + - [8, 11779.0] + - - [1024, 3415, 1, 4096, 1024, 1024, 1024, 4096] + - [26, 11012.0] + - - [1024, 3388, 1, 4096, 1024, 1024, 1024, 4096] + - [26, 11002.0] + - - [1024, 3376, 1, 4096, 1024, 1024, 1024, 4096] + - [26, 10917.0] + - - [1024, 3473, 1, 4096, 1024, 1024, 1024, 4096] + - [26, 10399.0] + - - [4096, 3162, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12255.0] + - - [1024, 3448, 1, 4096, 1024, 1024, 1024, 4096] + - [23, 12000.0] + - - [4096, 3362, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 11951.0] + - - [1024, 3262, 1, 4096, 1024, 1024, 1024, 4096] + - [7, 11349.0] + - - [1024, 3184, 1, 4096, 1024, 1024, 1024, 4096] + - [26, 11229.0] + - - [1024, 3378, 1, 4096, 1024, 1024, 1024, 4096] + - [26, 10962.0] + - - [4096, 3548, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12277.0] + - - [4096, 2977, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 11942.0] + - - [4096, 3443, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12232.0] + - - [1024, 3289, 1, 4096, 1024, 1024, 1024, 4096] + - [23, 11536.0] + - - [1024, 3483, 1, 4096, 1024, 1024, 1024, 4096] + - [26, 10390.0] + - - [4096, 3190, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12397.0] + - - [1024, 3421, 1, 4096, 1024, 1024, 1024, 4096] + - [23, 11940.0] + - - [1024, 3514, 1, 4096, 1024, 1024, 1024, 4096] + - [7, 11641.0] + - - [1024, 3532, 1, 4096, 1024, 1024, 1024, 4096] + - [8, 11407.0] + - - [1024, 3565, 1, 4096, 1024, 1024, 1024, 4096] + - [23, 11499.0] + - - [4096, 3422, 1, 1024, 4096, 4096, 4096, 1024] + - [8, 12124.0] + - - [4096, 3263, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12136.0] + - - [4096, 3296, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12277.0] + - - [4096, 3640, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12130.0] + - - [4096, 3463, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12037.0] + - - [4096, 3528, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12216.0] + - - [1024, 3351, 1, 4096, 1024, 1024, 1024, 4096] + - [23, 11670.0] + - - [1024, 3462, 1, 4096, 1024, 1024, 1024, 4096] + - [26, 10413.0] + - - [4096, 3226, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 11997.0] + - - [4096, 3439, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12240.0] + - - [4096, 3121, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12175.0] + - - [1024, 4059, 1, 33708, 1024, 1024, 1024, 33708] + - [1, 11980.0] + - - [1024, 3311, 1, 4096, 1024, 1024, 1024, 4096] + - [8, 11556.0] + - - [1024, 3230, 1, 4096, 1024, 1024, 1024, 4096] + - [20, 10725.0] + - - [4096, 3353, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 11901.0] + - - [4096, 3402, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12122.0] + - - [1024, 3427, 1, 4096, 1024, 1024, 1024, 4096] + - [23, 11945.0] + - - [1024, 3346, 1, 4096, 1024, 1024, 1024, 4096] + - [23, 11701.0] + - - [1024, 3126, 1, 4096, 1024, 1024, 1024, 4096] + - [40, 11164.0] + - - [1024, 3796, 1, 1024, 1024, 1024, 1024, 1024] + - [11, 11525.0] + - - [1024, 3990, 1, 4096, 1024, 1024, 1024, 4096] + - [40, 11723.0] + - - [1024, 3257, 1, 4096, 1024, 1024, 1024, 4096] + - [8, 11415.0] + - - [4096, 3996, 1, 1024, 4096, 4096, 4096, 1024] + - [3, 11973.0] + - - [1024, 3306, 1, 4096, 1024, 1024, 1024, 4096] + - [23, 11581.0] + - - [1024, 3389, 1, 4096, 1024, 1024, 1024, 4096] + - [8, 11791.0] + - - [1024, 3500, 1, 4096, 1024, 1024, 1024, 4096] + - [7, 11631.0] + - - [1024, 3999, 1, 33708, 1024, 1024, 1024, 33708] + - [1, 11820.0] + - - [4096, 3486, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12151.0] + - - [1024, 3438, 1, 4096, 1024, 1024, 1024, 4096] + - [23, 12003.0] + - - [4096, 3616, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12069.0] + - - [1024, 3955, 1, 1024, 1024, 1024, 1024, 1024] + - [22, 11528.0] + - - [4096, 3430, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12226.0] + - - [4096, 3271, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12208.0] + - - [1024, 3364, 1, 4096, 1024, 1024, 1024, 4096] + - [23, 11731.0] + - - [1024, 3497, 1, 4096, 1024, 1024, 1024, 4096] + - [26, 10492.0] + - - [4096, 3503, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12144.0] + - - [4096, 3344, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 11930.0] + - - [1024, 3457, 1, 4096, 1024, 1024, 1024, 4096] + - [22, 11538.0] + - - [4096, 3466, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12043.0] + - - [1024, 3976, 1, 33708, 1024, 1024, 1024, 33708] + - [17, 11748.0] + - - [1024, 3395, 1, 4096, 1024, 1024, 1024, 4096] + - [8, 11822.0] + - - [4096, 3361, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 11930.0] + - - [1024, 3751, 1, 33708, 1024, 1024, 1024, 33708] + - [5, 12029.0] + - - [1024, 3822, 1, 1024, 1024, 1024, 1024, 1024] + - [8, 11739.0] + - - [4096, 3315, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12368.0] + - - [1024, 3163, 1, 4096, 1024, 1024, 1024, 4096] + - [8, 12007.0] + - - [4096, 3547, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12292.0] + - - [4096, 3340, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 11916.0] + - - [1024, 3296, 1, 4096, 1024, 1024, 1024, 4096] + - [23, 11556.0] + - - [1024, 3468, 1, 4096, 1024, 1024, 1024, 4096] + - [25, 11205.0] + - - [4096, 3294, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12247.0] + - - [1024, 3406, 1, 4096, 1024, 1024, 1024, 4096] + - [8, 11886.0] + - - [1024, 3860, 1, 33708, 1024, 1024, 1024, 33708] + - [31, 11419.0] + - - [1024, 3584, 1, 4096, 1024, 1024, 1024, 4096] + - [23, 11544.0] + - - [4096, 3189, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12380.0] + - - [4096, 3494, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12151.0] + - - [1024, 3093, 1, 4096, 1024, 1024, 1024, 4096] + - [8, 11717.0] + - - [4096, 3421, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12166.0] + - - [1024, 3479, 1, 4096, 1024, 1024, 1024, 4096] + - [22, 11631.0] + - - [1024, 3433, 1, 4096, 1024, 1024, 1024, 4096] + - [23, 11992.0] + - - [4096, 3311, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12279.0] + - - [1024, 3381, 1, 4096, 1024, 1024, 1024, 4096] + - [8, 11791.0] + - - [1024, 3996, 1, 4096, 1024, 1024, 1024, 4096] + - [26, 11750.0] + - - [4096, 3384, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12025.0] + - - [1024, 3247, 1, 4096, 1024, 1024, 1024, 4096] + - [7, 11400.0] + - - [1024, 3169, 1, 4096, 1024, 1024, 1024, 4096] + - [40, 11358.0] + - - [1024, 3088, 1, 4096, 1024, 1024, 1024, 4096] + - [26, 10954.0] + - - [1024, 3363, 1, 4096, 1024, 1024, 1024, 4096] + - [26, 10931.0] + - - [1024, 3538, 1, 4096, 1024, 1024, 1024, 4096] + - [26, 10506.0] + - - [1024, 3996, 1, 1024, 1024, 1024, 1024, 1024] + - [8, 11476.0] + - - [4096, 3169, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12315.0] + - - [4096, 3538, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12299.0] + - - [4096, 3401, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12099.0] + - - [4096, 3581, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12436.0] + - - [1024, 3180, 1, 4096, 1024, 1024, 1024, 4096] + - [8, 12032.0] + - - [1024, 3870, 1, 1024, 1024, 1024, 1024, 1024] + - [7, 11088.0] + - - [4096, 3555, 1, 1024, 4096, 4096, 4096, 1024] + - [3, 12277.0] + - - [4096, 3412, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12154.0] + - - [4096, 3302, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12268.0] + - - [1024, 3561, 1, 4096, 1024, 1024, 1024, 4096] + - [8, 11455.0] + - - [1024, 3302, 1, 4096, 1024, 1024, 1024, 4096] + - [7, 11550.0] + - - [1024, 3976, 1, 4096, 1024, 1024, 1024, 4096] + - [40, 11715.0] + - - [4096, 3485, 1, 1024, 4096, 4096, 4096, 1024] + - [3, 12046.0] + - - [4096, 3534, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12282.0] + - - [1024, 3110, 1, 4096, 1024, 1024, 1024, 4096] + - [3, 11761.0] + - - [1024, 3401, 1, 4096, 1024, 1024, 1024, 4096] + - [26, 11071.0] + - - [4096, 3216, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 11957.0] + - - [1024, 4020, 1, 33708, 1024, 1024, 1024, 33708] + - [17, 11865.0] + - - [1024, 3215, 1, 4096, 1024, 1024, 1024, 4096] + - [8, 11266.0] + - - [4096, 3566, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12320.0] + - - [1024, 3137, 1, 4096, 1024, 1024, 1024, 4096] + - [8, 11879.0] + - - [4096, 3359, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 11907.0] + - - [4096, 3392, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12051.0] + - - [1024, 3506, 1, 4096, 1024, 1024, 1024, 4096] + - [22, 11691.0] + - - [4096, 3233, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12001.0] + - - [1024, 3444, 1, 4096, 1024, 1024, 1024, 4096] + - [23, 11998.0] + - - [1024, 3975, 1, 4096, 1024, 1024, 1024, 4096] + - [40, 11690.0] + - - [1024, 3870, 1, 33708, 1024, 1024, 1024, 33708] + - [1, 11441.0] + - - [4096, 3465, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12064.0] + - - [1024, 3523, 1, 4096, 1024, 1024, 1024, 4096] + - [8, 11337.0] + - - [4096, 3990, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12012.0] + - - [1024, 3549, 1, 4096, 1024, 1024, 1024, 4096] + - [8, 11436.0] + - - [1024, 3342, 1, 4096, 1024, 1024, 1024, 4096] + - [23, 11687.0] + - - [4096, 3476, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12033.0] + - - [1024, 3418, 1, 4096, 1024, 1024, 1024, 4096] + - [23, 11917.0] + - - [1024, 3859, 1, 1024, 1024, 1024, 1024, 1024] + - [22, 11025.0] + - - [4096, 3339, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 11870.0] + - - [4096, 3452, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12230.0] + - - [4096, 3293, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12261.0] + - - [1024, 3369, 1, 4096, 1024, 1024, 1024, 4096] + - [23, 11741.0] + - - [1024, 3544, 1, 4096, 1024, 1024, 1024, 4096] + - [3, 10447.0] + - - [4096, 3493, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12117.0] + - - [4096, 3350, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 11929.0] + - - [4096, 3256, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12131.0] + - - [1024, 3870, 1, 4096, 1024, 1024, 1024, 4096] + - [8, 11465.0] + - - [4096, 4012, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12060.0] + - - [1024, 3280, 1, 4096, 1024, 1024, 1024, 4096] + - [7, 11466.0] + - - [4096, 3456, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12312.0] + - - [1024, 3555, 1, 4096, 1024, 1024, 1024, 4096] + - [8, 11452.0] + - - [4096, 3014, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12057.0] + - - [1024, 3474, 1, 4096, 1024, 1024, 1024, 4096] + - [22, 11613.0] + - - [4096, 3367, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 11983.0] + - - [4096, 3432, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12197.0] + - - [4096, 3273, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12172.0] + - - [4096, 3130, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12183.0] + - - [1024, 2984, 1, 4096, 1024, 1024, 1024, 4096] + - [5, 11388.0] + - - [1024, 3995, 1, 1024, 1024, 1024, 1024, 1024] + - [11, 11279.0] + - - [1024, 3517, 1, 4096, 1024, 1024, 1024, 4096] + - [20, 10589.0] + - - [1024, 3455, 1, 4096, 1024, 1024, 1024, 4096] + - [26, 10866.0] + - - [1024, 3939, 1, 4096, 1024, 1024, 1024, 4096] + - [23, 11642.0] + - - [4096, 3147, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12243.0] + - - [4096, 3516, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12223.0] + - - [1024, 3876, 1, 4096, 1024, 1024, 1024, 4096] + - [23, 11468.0] + - - [1024, 3191, 1, 4096, 1024, 1024, 1024, 4096] + - [3, 12028.0] + - - [4096, 3411, 1, 1024, 4096, 4096, 4096, 1024] + - [3, 12079.0] + - - [1024, 3337, 1, 4096, 1024, 1024, 1024, 4096] + - [8, 11604.0] + - - [1024, 3512, 1, 4096, 1024, 1024, 1024, 4096] + - [26, 10421.0] + - - [4096, 3301, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12244.0] + - - [1024, 3450, 1, 4096, 1024, 1024, 1024, 4096] + - [23, 12003.0] + - - [4096, 3533, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12238.0] + - - [4096, 3390, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12061.0] + - - [4096, 3231, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12033.0] + - - [1024, 2499, 1, 4096, 1024, 1024, 1024, 4096] + - [5, 11811.0] + - - [1024, 3186, 1, 4096, 1024, 1024, 1024, 4096] + - [40, 11356.0] + - - [1024, 3380, 1, 4096, 1024, 1024, 1024, 4096] + - [20, 11059.0] + - - [4096, 3496, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12127.0] + - - [1024, 3956, 1, 33708, 1024, 1024, 1024, 33708] + - [1, 11682.0] + - - [1024, 3976, 1, 1024, 1024, 1024, 1024, 1024] + - [23, 11429.0] + - - [4096, 2736, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 11897.0] + - - [1024, 3291, 1, 4096, 1024, 1024, 1024, 4096] + - [8, 11511.0] + - - [1024, 3944, 1, 33708, 1024, 1024, 1024, 33708] + - [1, 11659.0] + - - [1024, 3485, 1, 4096, 1024, 1024, 1024, 4096] + - [7, 11586.0] + - - [4096, 3138, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12172.0] + - - [1024, 3423, 1, 4096, 1024, 1024, 1024, 4096] + - [23, 11915.0] + - - [1024, 3491, 1, 4096, 1024, 1024, 1024, 4096] + - [26, 10444.0] + - - [1024, 3860, 1, 4096, 1024, 1024, 1024, 4096] + - [8, 11413.0] + - - [4096, 3211, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 11925.0] + - - [1024, 3221, 1, 4096, 1024, 1024, 1024, 4096] + - [8, 11306.0] + - - [1024, 2917, 1, 4096, 1024, 1024, 1024, 4096] + - [28, 10902.0] + - - [4096, 3475, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12054.0] + - - [4096, 3524, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12236.0] + - - [4096, 2985, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 11968.0] + - - [1024, 3480, 1, 4096, 1024, 1024, 1024, 4096] + - [22, 11563.0] + - - [4096, 3222, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 11977.0] + - - [4096, 3451, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12257.0] + - - [1024, 3969, 1, 33708, 1024, 1024, 1024, 33708] + - [1, 11713.0] + - - [1024, 3640, 1, 1024, 1024, 1024, 1024, 1024] + - [7, 11370.0] + - - [1024, 3297, 1, 4096, 1024, 1024, 1024, 4096] + - [8, 11510.0] + - - [4096, 3944, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12240.0] + - - [1024, 3216, 1, 4096, 1024, 1024, 1024, 4096] + - [8, 11286.0] + - - [4096, 3349, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 11906.0] + - - [4096, 3398, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12087.0] + - - [1024, 3154, 1, 4096, 1024, 1024, 1024, 4096] + - [8, 11933.0] + - - [1024, 3978, 1, 33708, 1024, 1024, 1024, 33708] + - [1, 11739.0] + - - [1024, 3348, 1, 4096, 1024, 1024, 1024, 4096] + - [8, 11643.0] + - - [4096, 3304, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12226.0] + - - [4096, 4030, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12138.0] + - - [1024, 4026, 1, 1024, 1024, 1024, 1024, 1024] + - [8, 11541.0] + - - [4096, 3471, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12047.0] + - - [1024, 3259, 1, 4096, 1024, 1024, 1024, 4096] + - [7, 11414.0] + - - [1024, 3308, 1, 4096, 1024, 1024, 1024, 4096] + - [26, 10807.0] + - - [4096, 3391, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12029.0] + - - [1024, 3312, 1, 4096, 1024, 1024, 1024, 4096] + - [8, 11540.0] + - - [1024, 3502, 1, 4096, 1024, 1024, 1024, 4096] + - [11, 10400.0] + - - [1024, 3968, 1, 33708, 1024, 1024, 1024, 33708] + - [26, 11778.0] + - - [1024, 3424, 1, 4096, 1024, 1024, 1024, 4096] + - [23, 11929.0] + - - [4096, 4032, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12088.0] + - - [1024, 3900, 1, 1024, 1024, 1024, 1024, 1024] + - [36, 11370.0] + - - [4096, 3442, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12227.0] + - - [1024, 3366, 1, 4096, 1024, 1024, 1024, 4096] + - [23, 11753.0] + - - [4096, 3999, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12055.0] + - - [1024, 3477, 1, 4096, 1024, 1024, 1024, 4096] + - [22, 11594.0] + - - [1024, 2505, 1, 4096, 1024, 1024, 1024, 4096] + - [8, 11770.0] + - - [4096, 3515, 1, 1024, 4096, 4096, 4096, 1024] + - [8, 12151.0] + - - [1024, 3564, 1, 4096, 1024, 1024, 1024, 4096] + - [8, 11477.0] + - - [4096, 3057, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12219.0] + - - [1024, 3339, 1, 4096, 1024, 1024, 1024, 4096] + - [8, 11585.0] + - - [4096, 3262, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12114.0] + - - [1024, 4030, 1, 4096, 1024, 1024, 1024, 4096] + - [26, 11838.0] + - - [1024, 3265, 1, 4096, 1024, 1024, 1024, 4096] + - [7, 11430.0] + - - [1024, 3459, 1, 4096, 1024, 1024, 1024, 4096] + - [11, 10412.0] + - - [4096, 3462, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 11992.0] + - - [1024, 3513, 1, 4096, 1024, 1024, 1024, 4096] + - [22, 11736.0] + - - [1024, 3397, 1, 4096, 1024, 1024, 1024, 4096] + - [8, 11847.0] + - - [4096, 3572, 1, 1024, 4096, 4096, 4096, 1024] + - [3, 12326.0] + - - [4096, 3389, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12087.0] + - - [4096, 3438, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12251.0] + - - [1024, 3640, 1, 33708, 1024, 1024, 1024, 33708] + - [5, 11674.0] + - - [1024, 3995, 1, 33708, 1024, 1024, 1024, 33708] + - [8, 11793.0] + - - [1024, 3165, 1, 4096, 1024, 1024, 1024, 4096] + - [8, 12002.0] + - - [4096, 3543, 1, 1024, 4096, 4096, 4096, 1024] + - [3, 12229.0] + - - [4096, 3352, 1, 1024, 4096, 4096, 4096, 1024] + - [8, 11930.0] + - - [1024, 3359, 1, 4096, 1024, 1024, 1024, 4096] + - [23, 11766.0] + - - [1024, 3470, 1, 4096, 1024, 1024, 1024, 4096] + - [26, 10442.0] + - - [1024, 3392, 1, 4096, 1024, 1024, 1024, 4096] + - [40, 10996.0] + - - [4096, 3137, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12231.0] + - - [4096, 3506, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12167.0] + - - [1024, 3095, 1, 4096, 1024, 1024, 1024, 4096] + - [8, 11717.0] + - - [1024, 3859, 1, 4096, 1024, 1024, 1024, 4096] + - [8, 11450.0] + - - [4096, 3369, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 11971.0] + - - [1024, 3435, 1, 4096, 1024, 1024, 1024, 4096] + - [23, 11994.0] + - - [1024, 3354, 1, 4096, 1024, 1024, 1024, 4096] + - [8, 11685.0] + - - [1024, 3055, 1, 4096, 1024, 1024, 1024, 4096] + - [40, 11124.0] + - - [4096, 3523, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12210.0] + - - [4096, 3380, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12030.0] + - - [1024, 3233, 1, 4096, 1024, 1024, 1024, 4096] + - [11, 11168.0] + - - [4096, 3221, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 11946.0] + - - [4096, 3270, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12179.0] + - - [4096, 3593, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 11979.0] + - - [1024, 3358, 1, 4096, 1024, 1024, 1024, 4096] + - [23, 11672.0] + - - [1024, 3540, 1, 4096, 1024, 1024, 1024, 4096] + - [26, 10500.0] + - - [4096, 3502, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12139.0] + - - [4096, 2505, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12154.0] + - - [4096, 3397, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12099.0] + - - [1024, 3300, 1, 4096, 1024, 1024, 1024, 4096] + - [8, 11525.0] + - - [4096, 3095, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12031.0] + - - [1024, 3182, 1, 4096, 1024, 1024, 1024, 4096] + - [3, 12039.0] + - - [1024, 3299, 1, 4096, 1024, 1024, 1024, 4096] + - [26, 10910.0] + - - [1024, 3276, 1, 4096, 1024, 1024, 1024, 4096] + - [40, 10963.0] + - - [1024, 3360, 1, 4096, 1024, 1024, 1024, 4096] + - [26, 10882.0] + - - [4096, 3360, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 11928.0] + - - [4096, 2918, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12280.0] + - - [1024, 3939, 1, 33708, 1024, 1024, 1024, 33708] + - [1, 11635.0] + - - [4096, 3314, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12327.0] + - - [1024, 3319, 1, 4096, 1024, 1024, 1024, 4096] + - [7, 11564.0] + - - [1024, 3942, 1, 1024, 1024, 1024, 1024, 1024] + - [22, 11212.0] + - - [1024, 3465, 1, 4096, 1024, 1024, 1024, 4096] + - [26, 10503.0] + - - [4096, 3546, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12274.0] + - - [1024, 3403, 1, 4096, 1024, 1024, 1024, 4096] + - [8, 11838.0] + - - [1024, 3948, 1, 1024, 1024, 1024, 1024, 1024] + - [36, 11186.0] + - - [4096, 3441, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12190.0] + - - [1024, 3139, 1, 4096, 1024, 1024, 1024, 4096] + - [3, 11854.0] + - - [1024, 3563, 1, 4096, 1024, 1024, 1024, 4096] + - [8, 11457.0] + - - [1024, 3508, 1, 4096, 1024, 1024, 1024, 4096] + - [22, 11699.0] + - - [1024, 3975, 1, 33708, 1024, 1024, 1024, 33708] + - [1, 11737.0] + - - [1024, 3446, 1, 4096, 1024, 1024, 1024, 4096] + - [23, 12002.0] + - - [1024, 3529, 1, 4096, 1024, 1024, 1024, 4096] + - [26, 10485.0] + - - [4096, 3461, 1, 1024, 4096, 4096, 4096, 1024] + - [8, 11966.0] + - - [1024, 3574, 1, 4096, 1024, 1024, 1024, 4096] + - [23, 11519.0] + - - [1024, 3101, 1, 4096, 1024, 1024, 1024, 4096] + - [3, 11713.0] + - - [1024, 3927, 1, 1024, 1024, 1024, 1024, 1024] + - [36, 11234.0] + - - [4096, 3224, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 11973.0] + - - [4096, 3437, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12235.0] + - - [4096, 3900, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12182.0] + - - [1024, 3495, 1, 4096, 1024, 1024, 1024, 4096] + - [7, 11646.0] + - - [1024, 3977, 1, 33708, 1024, 1024, 1024, 33708] + - [1, 11748.0] + - - [1024, 3328, 1, 4096, 1024, 1024, 1024, 4096] + - [8, 11597.0] + - - [4096, 3168, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12294.0] + - - [1024, 4026, 1, 33708, 1024, 1024, 1024, 33708] + - [1, 11876.0] + - - [1024, 3292, 1, 4096, 1024, 1024, 1024, 4096] + - [8, 11500.0] + - - [1024, 3294, 1, 4096, 1024, 1024, 1024, 4096] + - [26, 10855.0] + - - [4096, 3335, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 11821.0] + - - [4096, 3400, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12112.0] + - - [1024, 3287, 1, 4096, 1024, 1024, 1024, 4096] + - [8, 11492.0] + - - [1024, 3910, 1, 4096, 1024, 1024, 1024, 4096] + - [23, 11548.0] + - - [1024, 3780, 1, 1024, 1024, 1024, 1024, 1024] + - [8, 11658.0] + - - [4096, 3098, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12045.0] + - - [1024, 3584, 1, 33708, 1024, 1024, 1024, 33708] + - [5, 11533.0] + - - [1024, 3371, 1, 4096, 1024, 1024, 1024, 4096] + - [8, 11731.0] + - - [1024, 3546, 1, 4096, 1024, 1024, 1024, 4096] + - [20, 10503.0] + - - [1024, 4012, 1, 1024, 1024, 1024, 1024, 1024] + - [26, 11376.0] + - - [4096, 3505, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12158.0] + - - [4096, 3554, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12350.0] + - - [4096, 3063, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12257.0] + - - [1024, 3900, 1, 33708, 1024, 1024, 1024, 33708] + - [1, 11524.0] + - - [1024, 3345, 1, 4096, 1024, 1024, 1024, 4096] + - [8, 11644.0] + - - [1024, 3357, 1, 4096, 1024, 1024, 1024, 4096] + - [26, 10908.0] + - - [1024, 3282, 1, 4096, 1024, 1024, 1024, 4096] + - [26, 10863.0] + - - [4096, 3484, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12011.0] + - - [1024, 3557, 1, 4096, 1024, 1024, 1024, 4096] + - [8, 11450.0] + - - [1024, 3476, 1, 4096, 1024, 1024, 1024, 4096] + - [22, 11475.0] + - - [1024, 3751, 1, 1024, 1024, 1024, 1024, 1024] + - [8, 11607.0] + - - [4096, 3379, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 11975.0] + - - [4096, 3428, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12209.0] + - - [4096, 3126, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12208.0] + - - [1024, 3325, 1, 4096, 1024, 1024, 1024, 4096] + - [8, 11574.0] + - - [4096, 3501, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12120.0] + - - [4096, 3358, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 11960.0] + - - [1024, 3441, 1, 4096, 1024, 1024, 1024, 4096] + - [23, 12002.0] + - - [1024, 3552, 1, 4096, 1024, 1024, 1024, 4096] + - [41, 11361.0] + - - [4096, 3232, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12040.0] + - - [1024, 3412, 1, 4096, 1024, 1024, 1024, 4096] + - [8, 11887.0] + - - [1024, 3372, 1, 4096, 1024, 1024, 1024, 4096] + - [23, 11743.0] + - - [1024, 3585, 1, 4096, 1024, 1024, 1024, 4096] + - [23, 11507.0] + - - [4096, 3143, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12239.0] + - - [4096, 3464, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12059.0] + - - [1024, 3145, 1, 4096, 1024, 1024, 1024, 4096] + - [8, 11939.0] + - - [4096, 3375, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 11997.0] + - - [4096, 2917, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12288.0] + - - [4096, 3978, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 11991.0] + - - [1024, 2765, 1, 4096, 1024, 1024, 1024, 4096] + - [3, 11548.0] + - - [1024, 3452, 1, 4096, 1024, 1024, 1024, 4096] + - [23, 11978.0] + - - [4096, 3584, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12384.0] + - - [4096, 3545, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12305.0] + - - [1024, 3352, 1, 4096, 1024, 1024, 1024, 4096] + - [8, 11649.0] + - - [4096, 3292, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12213.0] + - - [1024, 3525, 1, 4096, 1024, 1024, 1024, 4096] + - [23, 11356.0] + - - [1024, 3266, 1, 4096, 1024, 1024, 1024, 4096] + - [0, 11297.0] + - - [1024, 3382, 1, 4096, 1024, 1024, 1024, 4096] + - [26, 10979.0] + - - [4096, 3492, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12130.0] + - - [4096, 3419, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12179.0] + - - [1024, 3796, 1, 33708, 1024, 1024, 1024, 33708] + - [21, 12161.0] + - - [1024, 3293, 1, 4096, 1024, 1024, 1024, 4096] + - [7, 11523.0] + - - [4096, 3796, 1, 1024, 4096, 4096, 4096, 1024] + - [3, 12262.0] + - - [1024, 3487, 1, 4096, 1024, 1024, 1024, 4096] + - [7, 11615.0] + - - [4096, 3166, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12302.0] + - - [1024, 3409, 1, 4096, 1024, 1024, 1024, 4096] + - [23, 11899.0] + - - [1024, 3520, 1, 4096, 1024, 1024, 1024, 4096] + - [26, 10481.0] + - - [1024, 3573, 1, 4096, 1024, 1024, 1024, 4096] + - [11, 10601.0] + - - [4096, 3366, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 11940.0] + - - [4096, 3720, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12110.0] + - - [4096, 3207, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 11933.0] + - - [4096, 3272, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12192.0] + - - [1024, 3390, 1, 4096, 1024, 1024, 1024, 4096] + - [8, 11783.0] + - - [4096, 3183, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12373.0] + - - [4096, 3536, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12264.0] + - - [4096, 3563, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12364.0] + - - [1024, 3482, 1, 4096, 1024, 1024, 1024, 4096] + - [7, 11612.0] + - - [4096, 3447, 1, 1024, 4096, 4096, 4096, 1024] + - [8, 12207.0] + - - [4096, 3955, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12347.0] + - - [4096, 4005, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12029.0] + - - [1024, 3493, 1, 4096, 1024, 1024, 1024, 4096] + - [22, 11614.0] + - - [4096, 3410, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12115.0] + - - [1024, 3422, 1, 4096, 1024, 1024, 1024, 4096] + - [23, 11919.0] + - - [1024, 3350, 1, 4096, 1024, 1024, 1024, 4096] + - [26, 10891.0] + - - [4096, 3300, 1, 1024, 4096, 4096, 4096, 1024] + - [8, 12206.0] + - - [4096, 3910, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12225.0] + - - [1024, 3489, 1, 4096, 1024, 1024, 1024, 4096] + - [22, 11650.0] + - - [4096, 3483, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12066.0] + - - [4096, 3532, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12269.0] + - - [4096, 3230, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12031.0] + - - [4096, 3427, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12203.0] + - - [1024, 3377, 1, 4096, 1024, 1024, 1024, 4096] + - [8, 11697.0] + - - [1024, 3488, 1, 4096, 1024, 1024, 1024, 4096] + - [7, 11559.0] + - - [1024, 3616, 1, 4096, 1024, 1024, 1024, 4096] + - [8, 11581.0] + - - [1024, 3426, 1, 4096, 1024, 1024, 1024, 4096] + - [8, 11910.0] + - - [4096, 3357, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 11909.0] + - - [4096, 3406, 1, 1024, 4096, 4096, 4096, 1024] + - [8, 12124.0] + - - [1024, 3046, 1, 4096, 1024, 1024, 1024, 4096] + - [5, 11629.0] + - - [1024, 3272, 1, 4096, 1024, 1024, 1024, 4096] + - [26, 10818.0] + - - [1024, 3256, 1, 4096, 1024, 1024, 1024, 4096] + - [26, 10793.0] + - - [4096, 3247, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12096.0] + - - [4096, 3088, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12097.0] + - - [1024, 3531, 1, 4096, 1024, 1024, 1024, 4096] + - [23, 11388.0] + - - [4096, 3511, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12200.0] + - - [1024, 3720, 1, 33708, 1024, 1024, 1024, 33708] + - [5, 11928.0] + - - [1024, 3267, 1, 4096, 1024, 1024, 1024, 4096] + - [8, 11443.0] + - - [1024, 3270, 1, 4096, 1024, 1024, 1024, 4096] + - [26, 10881.0] + - - [1024, 3461, 1, 4096, 1024, 1024, 1024, 4096] + - [26, 10436.0] + - - [4096, 3474, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12035.0] + - - [4096, 2984, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 11944.0] + - - [1024, 3399, 1, 4096, 1024, 1024, 1024, 4096] + - [23, 11890.0] + - - [4096, 3574, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12372.0] + - - [1024, 3876, 1, 1024, 1024, 1024, 1024, 1024] + - [36, 11303.0] + - - [4096, 3337, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 11878.0] + - - [4096, 3450, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12256.0] + - - [1024, 3720, 1, 1024, 1024, 1024, 1024, 1024] + - [26, 11407.0] + - - [1024, 4059, 1, 1024, 1024, 1024, 1024, 1024] + - [8, 11670.0] + - - [4096, 3291, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12262.0] + - - [4096, 3995, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12032.0] + - - [4096, 3491, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12139.0] + - - [4096, 3348, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 11951.0] + - - [4096, 3925, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12261.0] + - - [4096, 3894, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12160.0] + - - [1024, 3456, 1, 4096, 1024, 1024, 1024, 4096] + - [23, 12052.0] + - - [1024, 3394, 1, 4096, 1024, 1024, 1024, 4096] + - [26, 11051.0] + - - [4096, 3165, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12303.0] + - - [4096, 3470, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12090.0] + - - [1024, 3014, 1, 4096, 1024, 1024, 1024, 4096] + - [5, 11545.0] + - - [1024, 3375, 1, 4096, 1024, 1024, 1024, 4096] + - [40, 10963.0] + - - [4096, 3859, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12071.0] + - - [4096, 3365, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 11962.0] + - - [1024, 3162, 1, 4096, 1024, 1024, 1024, 4096] + - [8, 11978.0] + - - [1024, 3840, 1, 33708, 1024, 1024, 1024, 33708] + - [5, 12308.0] + - - [1024, 3437, 1, 4096, 1024, 1024, 1024, 4096] + - [23, 11959.0] + - - [4096, 3319, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12278.0] + - - [1024, 3320, 1, 4096, 1024, 1024, 1024, 4096] + - [23, 11601.0] + - - [4096, 3328, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12409.0] + - - [1024, 3235, 1, 4096, 1024, 1024, 1024, 4096] + - [8, 11338.0] + - - [4096, 3282, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12184.0] + - - [1024, 3367, 1, 4096, 1024, 1024, 1024, 4096] + - [23, 11777.0] + - - [1024, 3542, 1, 4096, 1024, 1024, 1024, 4096] + - [23, 11403.0] + - - [4096, 3145, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12212.0] + - - [4096, 3514, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12204.0] + - - [1024, 3432, 1, 4096, 1024, 1024, 1024, 4096] + - [23, 11950.0] + - - [4096, 3409, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12105.0] + - - [1024, 4012, 1, 33708, 1024, 1024, 1024, 33708] + - [1, 11843.0] + - - [4096, 3876, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12132.0] + - - [4096, 3299, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12289.0] + - - [1024, 3168, 1, 4096, 1024, 1024, 1024, 4096] + - [8, 11991.0] + - - [4096, 3681, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12222.0] + - - [4096, 3531, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12248.0] + - - [4096, 3388, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12058.0] + - - [1024, 3720, 1, 4096, 1024, 1024, 1024, 4096] + - [28, 11857.0] + - - [1024, 3332, 1, 4096, 1024, 1024, 1024, 4096] + - [8, 11626.0] + - - [1024, 3273, 1, 4096, 1024, 1024, 1024, 4096] + - [26, 10925.0] + - - [1024, 2935, 1, 4096, 1024, 1024, 1024, 4096] + - [28, 11054.0] + - - [1024, 3467, 1, 4096, 1024, 1024, 1024, 4096] + - [26, 10429.0] + - - [4096, 3542, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12266.0] + - - [1024, 3130, 1, 4096, 1024, 1024, 1024, 4096] + - [3, 11832.0] + - - [1024, 3405, 1, 4096, 1024, 1024, 1024, 4096] + - [26, 11005.0] + - - [1024, 3960, 1, 1024, 1024, 1024, 1024, 1024] + - [7, 11522.0] + - - [4096, 3405, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12105.0] + - - [1024, 10080, 1, 1024, 1024, 1024, 1024, 1024] + - [23, 12183.0] + - - [36548, 1216, 1, 1024, 36548, 36548, 36548, 1024] + - [23, 11926.0] + - - [1024, 2592, 1, 1024, 1024, 1024, 1024, 1024] + - [7, 10940.0] + - - [1024, 1568, 1, 1024, 1024, 1024, 1024, 1024] + - [7, 10917.0] + - - [1024, 4445, 1, 1024, 1024, 1024, 1024, 1024] + - [8, 11917.0] + - - [1024, 6272, 1, 1024, 1024, 1024, 1024, 1024] + - [3, 12010.0] + - - [36548, 3584, 1, 1024, 36548, 36548, 36548, 1024] + - [23, 12434.0] + - - [1024, 1827, 1, 1024, 1024, 1024, 1024, 1024] + - [9, 10799.0] + - - [1024, 3220, 1, 1024, 1024, 1024, 1024, 1024] + - [7, 11124.0] + - - [1024, 1856, 1, 1024, 1024, 1024, 1024, 1024] + - [8, 11115.0] + - - [1024, 1760, 1, 1024, 1024, 1024, 1024, 1024] + - [24, 10327.0] + - - [36548, 4235, 1, 1024, 36548, 36548, 36548, 1024] + - [23, 12138.0] + - - [1024, 1984, 1, 1024, 1024, 1024, 1024, 1024] + - [22, 10947.0] + - - [1024, 14720, 1, 1024, 1024, 1024, 1024, 1024] + - [11, 12347.0] + - - [1024, 1152, 1, 1024, 1024, 1024, 1024, 1024] + - [9, 10187.0] + - - [36548, 14976, 1, 1024, 36548, 36548, 36548, 1024] + - [23, 12361.0] + - - [36548, 1152, 1, 1024, 36548, 36548, 36548, 1024] + - [20, 12478.0] + - - [1024, 3392, 1, 1024, 1024, 1024, 1024, 1024] + - [8, 11469.0] + - - [1024, 1408, 1, 1024, 1024, 1024, 1024, 1024] + - [7, 10864.0] + - - [1024, 2080, 1, 1024, 1024, 1024, 1024, 1024] + - [8, 10578.0] + - - [1024, 1824, 1, 1024, 1024, 1024, 1024, 1024] + - [8, 10629.0] + - - [36548, 2432, 1, 1024, 36548, 36548, 36548, 1024] + - [23, 12577.0] + - - [36548, 1827, 1, 1024, 36548, 36548, 36548, 1024] + - [23, 11958.0] + - - [1024, 10176, 1, 1024, 1024, 1024, 1024, 1024] + - [8, 12349.0] + - - [1024, 1952, 1, 1024, 1024, 1024, 1024, 1024] + - [22, 10754.0] + - - [1024, 17024, 1, 1024, 1024, 1024, 1024, 1024] + - [26, 12265.0] + - - [1024, 1472, 1, 1024, 1024, 1024, 1024, 1024] + - [22, 10416.0] + - - [36548, 4459, 1, 1024, 36548, 36548, 36548, 1024] + - [23, 12425.0] + - - [1024, 3712, 1, 1024, 1024, 1024, 1024, 1024] + - [8, 11593.0] + - - [36548, 12928, 1, 1024, 36548, 36548, 36548, 1024] + - [23, 12336.0] + - - [1024, 1632, 1, 1024, 1024, 1024, 1024, 1024] + - [22, 9874.0] + - - [1024, 1696, 1, 1024, 1024, 1024, 1024, 1024] + - [22, 10568.0] + - - [36548, 1764, 1, 1024, 36548, 36548, 36548, 1024] + - [23, 12340.0] + - - [1024, 2944, 1, 1024, 1024, 1024, 1024, 1024] + - [7, 11303.0] + - - [36548, 14080, 1, 1024, 36548, 36548, 36548, 1024] + - [23, 12393.0] + - - [1024, 1280, 1, 1024, 1024, 1024, 1024, 1024] + - [24, 10437.0] + - - [1024, 13440, 1, 1024, 1024, 1024, 1024, 1024] + - [11, 12351.0] + - - [36548, 9120, 1, 1024, 36548, 36548, 36548, 1024] + - [23, 12360.0] + - - [1024, 3008, 1, 1024, 1024, 1024, 1024, 1024] + - [22, 11052.0] + - - [1024, 2560, 1, 1024, 1024, 1024, 1024, 1024] + - [3, 11493.0] + - - [1024, 2208, 1, 1024, 1024, 1024, 1024, 1024] + - [36, 10676.0] + - - [1024, 1920, 1, 1024, 1024, 1024, 1024, 1024] + - [11, 11170.0] + - - [36548, 2496, 1, 1024, 36548, 36548, 36548, 1024] + - [23, 12283.0] + - - [1024, 2016, 1, 1024, 1024, 1024, 1024, 1024] + - [22, 10979.0] + - - [1024, 1184, 1, 1024, 1024, 1024, 1024, 1024] + - [9, 10493.0] + - - [1024, 1664, 1, 1024, 1024, 1024, 1024, 1024] + - [22, 10419.0] + - - [1024, 11424, 1, 1024, 1024, 1024, 1024, 1024] + - [11, 12150.0] + - - [1024, 1216, 1, 1024, 1024, 1024, 1024, 1024] + - [9, 10615.0] + - - [36548, 3185, 1, 1024, 36548, 36548, 36548, 1024] + - [40, 12458.0] + - - [36548, 9216, 1, 1024, 36548, 36548, 36548, 1024] + - [23, 12485.0] + - - [1024, 3200, 1, 1024, 1024, 1024, 1024, 1024] + - [8, 11216.0] + - - [1024, 2656, 1, 1024, 1024, 1024, 1024, 1024] + - [22, 10664.0] + - - [1024, 2368, 1, 1024, 1024, 1024, 1024, 1024] + - [22, 10985.0] + - - [1024, 4459, 1, 1024, 1024, 1024, 1024, 1024] + - [3, 11946.0] + - - [1024, 3808, 1, 1024, 1024, 1024, 1024, 1024] + - [8, 11810.0] + - - [1024, 2336, 1, 1024, 1024, 1024, 1024, 1024] + - [7, 11120.0] + - - [1024, 2304, 1, 1024, 1024, 1024, 1024, 1024] + - [22, 10807.0] + - - [1024, 1560, 1, 1024, 1024, 1024, 1024, 1024] + - [22, 10579.0] + - - [1024, 2496, 1, 1024, 1024, 1024, 1024, 1024] + - [26, 11137.0] + - - [1024, 1504, 1, 1024, 1024, 1024, 1024, 1024] + - [23, 10341.0] + - - [1024, 3232, 1, 1024, 1024, 1024, 1024, 1024] + - [22, 10967.0] + - - [36548, 1015, 1, 1024, 36548, 36548, 36548, 1024] + - [37, 12364.0] + - - [1024, 2000, 1, 1024, 1024, 1024, 1024, 1024] + - [22, 10923.0] + - - [36548, 243, 1, 1024, 36548, 36548, 36548, 1024] + - [20, 11494.0] + - - [1024, 13184, 1, 1024, 1024, 1024, 1024, 1024] + - [8, 12302.0] + - - [1024, 2688, 1, 1024, 1024, 1024, 1024, 1024] + - [7, 11461.0] + - - [36548, 950, 1, 1024, 36548, 36548, 36548, 1024] + - [16, 11788.0] + - - [1024, 1764, 1, 1024, 1024, 1024, 1024, 1024] + - [9, 10544.0] + - - [1024, 1376, 1, 1024, 1024, 1024, 1024, 1024] + - [22, 10552.0] + - - [36548, 774, 1, 1024, 36548, 36548, 36548, 1024] + - [30, 11076.0] + - - [1024, 4256, 1, 1024, 1024, 1024, 1024, 1024] + - [8, 11519.0] + - - [36548, 3712, 1, 1024, 36548, 36548, 36548, 1024] + - [23, 12445.0] + - - [1024, 3360, 1, 1024, 1024, 1024, 1024, 1024] + - [23, 11271.0] + - - [1024, 2784, 1, 1024, 1024, 1024, 1024, 1024] + - [8, 11347.0] + - - [1024, 4992, 1, 1024, 1024, 1024, 1024, 1024] + - [23, 11901.0] + - - [36548, 1102, 1, 1024, 36548, 36548, 36548, 1024] + - [23, 11962.0] + - - [1024, 1536, 1, 1024, 1024, 1024, 1024, 1024] + - [8, 11003.0] + - - [1024, 2720, 1, 1024, 1024, 1024, 1024, 1024] + - [8, 11003.0] + - - [1024, 2752, 1, 1024, 1024, 1024, 1024, 1024] + - [8, 11224.0] + - - [1024, 2816, 1, 1024, 1024, 1024, 1024, 1024] + - [3, 11626.0] + - - [1024, 2624, 1, 1024, 1024, 1024, 1024, 1024] + - [22, 11184.0] + - - [1024, 2144, 1, 1024, 1024, 1024, 1024, 1024] + - [8, 11091.0] + - - [36548, 1131, 1, 1024, 36548, 36548, 36548, 1024] + - [20, 12265.0] + - - [1024, 3296, 1, 1024, 1024, 1024, 1024, 1024] + - [7, 11355.0] + - - [36548, 4992, 1, 1024, 36548, 36548, 36548, 1024] + - [40, 12525.0] + - - [1024, 1344, 1, 1024, 1024, 1024, 1024, 1024] + - [22, 10378.0] + - - [36548, 2401, 1, 1024, 36548, 36548, 36548, 1024] + - [23, 12416.0] + - - [1024, 15744, 1, 1024, 1024, 1024, 1024, 1024] + - [23, 12367.0] + - - [1024, 15232, 1, 1024, 1024, 1024, 1024, 1024] + - [23, 12423.0] + - - [1024, 1888, 1, 1024, 1024, 1024, 1024, 1024] + - [23, 11300.0] + - - [1024, 1792, 1, 1024, 1024, 1024, 1024, 1024] + - [5, 10840.0] + - - [36548, 1073, 1, 1024, 36548, 36548, 36548, 1024] + - [23, 11661.0] + - - [36548, 15488, 1, 1024, 36548, 36548, 36548, 1024] + - [23, 12358.0] + - - [1024, 2464, 1, 1024, 1024, 1024, 1024, 1024] + - [36, 10459.0] + - - [1024, 2272, 1, 1024, 1024, 1024, 1024, 1024] + - [36, 10573.0] + - - [1024, 2432, 1, 1024, 1024, 1024, 1024, 1024] + - [3, 10979.0] + - - [1024, 3936, 1, 1024, 1024, 1024, 1024, 1024] + - [22, 11426.0] + - - [36548, 13824, 1, 1024, 36548, 36548, 36548, 1024] + - [23, 12314.0] + - - [1024, 2401, 1, 1024, 1024, 1024, 1024, 1024] + - [7, 10333.0] + - - [1024, 2176, 1, 1024, 1024, 1024, 1024, 1024] + - [8, 11087.0] + - - [1024, 2240, 1, 1024, 1024, 1024, 1024, 1024] + - [22, 11170.0] + - - [1024, 1728, 1, 1024, 1024, 1024, 1024, 1024] + - [22, 10775.0] + - - [1024, 2528, 1, 1024, 1024, 1024, 1024, 1024] + - [23, 11271.0] + - - [1024, 2400, 1, 1024, 1024, 1024, 1024, 1024] + - [8, 10795.0] + - - [1024, 1440, 1, 1024, 1024, 1024, 1024, 1024] + - [8, 9971.0] + - - [1024, 2912, 1, 1024, 1024, 1024, 1024, 1024] + - [36, 10838.0] + - - [1024, 2880, 1, 1024, 1024, 1024, 1024, 1024] + - [7, 11232.0] + - - [1024, 4064, 1, 1024, 1024, 1024, 1024, 1024] + - [3, 11523.0] + - - [1024, 4655, 1, 1024, 1024, 1024, 1024, 1024] + - [3, 11745.0] + - - [36548, 6272, 1, 1024, 36548, 36548, 36548, 1024] + - [23, 12521.0] + - - [768, 2048, 1, 3072, 768, 768, 768, 3072] + - [8, 11257.0] + - - [768, 4096, 1, 3072, 768, 768, 768, 3072] + - [8, 11821.0] + - - [6272, 256, 1, 528, 6272, 6272, 6272, 528] + - [1, 11319.0] + - - [3136, 2048, 1, 1024, 3136, 3136, 3136, 1024] + - [23, 12041.0] + - - [50176, 128, 1, 256, 50176, 50176, 50176, 256] + - [3, 11556.0] + - - [12544, 1024, 1, 256, 12544, 12544, 12544, 256] + - [8, 11954.0] + - - [12544, 256, 1, 1024, 12544, 12544, 12544, 1024] + - [1, 11845.0] + - - [3136, 512, 1, 1024, 3136, 3136, 3136, 1024] + - [2, 11057.0] + - - [3136, 2048, 1, 512, 3136, 3136, 3136, 512] + - [1, 11793.0] + - - [289, 384, 32, 1024, 289, 289, 289, 1024] + - [11, 8655.0] + - - [4096, 512, 1, 4096, 4096, 4096, 4096, 4096] + - [16, 11464.0] + - - [50176, 512, 1, 256, 50176, 50176, 50176, 256] + - [3, 12228.0] + - - [12544, 1024, 1, 512, 12544, 12544, 12544, 512] + - [1, 12221.0] + - - [12544, 256, 1, 512, 12544, 12544, 12544, 512] + - [1, 11761.0] + - - [784, 128, 32, 256, 784, 784, 784, 256] + - [0, 9647.0] + - - [4096, 512, 1, 9216, 4096, 4096, 4096, 9216] + - [22, 11356.0] + - - [3136, 512, 1, 2048, 3136, 3136, 3136, 2048] + - [2, 11450.0] + - - [1225, 192, 32, 384, 1225, 1225, 1225, 384] + - [16, 11023.0] + - - [8192, 320, 1, 1280, 8192, 8192, 8192, 1280] + - [16, 11545.0] + - - [8192, 320, 1, 2048, 8192, 8192, 8192, 2048] + - [16, 11635.0] + - - [8192, 384, 1, 1280, 8192, 8192, 8192, 1280] + - [1, 11696.0] + - - [8192, 384, 1, 2048, 8192, 8192, 8192, 2048] + - [1, 11746.0] + - - [8192, 448, 1, 2048, 8192, 8192, 8192, 2048] + - [16, 11485.0] + - - [8192, 448, 1, 1280, 8192, 8192, 8192, 1280] + - [16, 11467.0] + - - [256, 6400, 1, 4096, 256, 256, 256, 4096] + - [11, 11474.0] + - - [512, 3433, 1, 2048, 512, 512, 512, 2048] + - [7, 11162.0] + - - [512, 3439, 1, 2048, 512, 512, 512, 2048] + - [22, 10904.0] + - - [512, 3461, 1, 2048, 512, 512, 512, 2048] + - [7, 10944.0] + - - [512, 3479, 1, 2048, 512, 512, 512, 2048] + - [22, 11094.0] + - - [512, 3494, 1, 2048, 512, 512, 512, 2048] + - [7, 11093.0] + - - [512, 3520, 1, 2048, 512, 512, 512, 2048] + - [7, 11240.0] + - - [512, 3530, 1, 2048, 512, 512, 512, 2048] + - [28, 10767.0] + - - [512, 3541, 1, 2048, 512, 512, 512, 2048] + - [13, 10767.0] + - - [512, 3564, 1, 2048, 512, 512, 512, 2048] + - [23, 10806.0] + - - [512, 3776, 1, 2048, 512, 512, 512, 2048] + - [41, 11382.0] + - - [512, 3859, 1, 512, 512, 512, 512, 512] + - [36, 10100.0] + - - [512, 3925, 1, 2048, 512, 512, 512, 2048] + - [36, 10702.0] + - - [512, 3944, 1, 2048, 512, 512, 512, 2048] + - [22, 10730.0] + - - [512, 3955, 1, 2048, 512, 512, 512, 2048] + - [22, 10726.0] + - - [512, 3969, 1, 2048, 512, 512, 512, 2048] + - [22, 10805.0] + - - [512, 3976, 1, 2048, 512, 512, 512, 2048] + - [7, 10835.0] + - - [2048, 1232, 1, 512, 2048, 2048, 2048, 512] + - [16, 10664.0] + - - [2048, 3165, 1, 512, 2048, 2048, 2048, 512] + - [11, 11720.0] + - - [512, 2387, 1, 512, 512, 512, 512, 512] + - [9, 10151.0] + - - [512, 2418, 1, 512, 512, 512, 512, 512] + - [9, 10214.0] + - - [512, 2418, 1, 2048, 512, 512, 512, 2048] + - [28, 10722.0] + - - [512, 2496, 1, 512, 512, 512, 512, 512] + - [22, 10233.0] + - - [512, 2496, 1, 2048, 512, 512, 512, 2048] + - [28, 11023.0] + - - [512, 2790, 1, 2048, 512, 512, 512, 2048] + - [7, 10530.0] + - - [512, 2864, 1, 2048, 512, 512, 512, 2048] + - [7, 10831.0] + - - [512, 3092, 1, 2048, 512, 512, 512, 2048] + - [8, 10990.0] + - - [512, 3113, 1, 2048, 512, 512, 512, 2048] + - [8, 11097.0] + - - [512, 3137, 1, 2048, 512, 512, 512, 2048] + - [8, 11120.0] + - - [512, 3165, 1, 2048, 512, 512, 512, 2048] + - [8, 11204.0] + - - [512, 3166, 1, 2048, 512, 512, 512, 2048] + - [8, 11219.0] + - - [512, 3194, 1, 2048, 512, 512, 512, 2048] + - [8, 11286.0] + - - [512, 3219, 1, 2048, 512, 512, 512, 2048] + - [7, 10318.0] + - - [512, 3222, 1, 2048, 512, 512, 512, 2048] + - [7, 10378.0] + - - [512, 3234, 1, 2048, 512, 512, 512, 2048] + - [7, 10306.0] + - - [512, 3237, 1, 2048, 512, 512, 512, 2048] + - [7, 10334.0] + - - [512, 3242, 1, 2048, 512, 512, 512, 2048] + - [22, 10316.0] + - - [512, 3246, 1, 2048, 512, 512, 512, 2048] + - [7, 10390.0] + - - [512, 3249, 1, 2048, 512, 512, 512, 2048] + - [7, 10316.0] + - - [512, 3251, 1, 2048, 512, 512, 512, 2048] + - [7, 10336.0] + - - [512, 3257, 1, 2048, 512, 512, 512, 2048] + - [7, 10386.0] + - - [512, 3262, 1, 2048, 512, 512, 512, 2048] + - [22, 10393.0] + - - [512, 3268, 1, 2048, 512, 512, 512, 2048] + - [22, 10343.0] + - - [512, 3282, 1, 2048, 512, 512, 512, 2048] + - [7, 10449.0] + - - [512, 3286, 1, 2048, 512, 512, 512, 2048] + - [7, 10499.0] + - - [512, 3287, 1, 2048, 512, 512, 512, 2048] + - [7, 10472.0] + - - [512, 3293, 1, 2048, 512, 512, 512, 2048] + - [22, 10457.0] + - - [512, 3297, 1, 2048, 512, 512, 512, 2048] + - [7, 10517.0] + - - [512, 3307, 1, 2048, 512, 512, 512, 2048] + - [7, 10566.0] + - - [512, 3314, 1, 2048, 512, 512, 512, 2048] + - [22, 10599.0] + - - [512, 3315, 1, 2048, 512, 512, 512, 2048] + - [7, 10529.0] + - - [512, 3319, 1, 2048, 512, 512, 512, 2048] + - [7, 10598.0] + - - [512, 3322, 1, 2048, 512, 512, 512, 2048] + - [22, 10574.0] + - - [512, 3323, 1, 2048, 512, 512, 512, 2048] + - [22, 10577.0] + - - [512, 3324, 1, 2048, 512, 512, 512, 2048] + - [7, 10599.0] + - - [512, 3325, 1, 2048, 512, 512, 512, 2048] + - [22, 10768.0] + - - [512, 3327, 1, 2048, 512, 512, 512, 2048] + - [22, 10644.0] + - - [512, 3329, 1, 2048, 512, 512, 512, 2048] + - [7, 10656.0] + - - [512, 3332, 1, 2048, 512, 512, 512, 2048] + - [36, 10796.0] + - - [512, 3336, 1, 2048, 512, 512, 512, 2048] + - [22, 10568.0] + - - [512, 3339, 1, 2048, 512, 512, 512, 2048] + - [7, 10623.0] + - - [512, 3342, 1, 2048, 512, 512, 512, 2048] + - [36, 10875.0] + - - [512, 3344, 1, 2048, 512, 512, 512, 2048] + - [7, 10798.0] + - - [512, 3358, 1, 2048, 512, 512, 512, 2048] + - [22, 10680.0] + - - [512, 3360, 1, 2048, 512, 512, 512, 2048] + - [22, 10781.0] + - - [512, 3364, 1, 2048, 512, 512, 512, 2048] + - [22, 10774.0] + - - [512, 3365, 1, 2048, 512, 512, 512, 2048] + - [7, 10759.0] + - - [512, 3369, 1, 2048, 512, 512, 512, 2048] + - [7, 10802.0] + - - [512, 3370, 1, 2048, 512, 512, 512, 2048] + - [7, 10756.0] + - - [512, 3371, 1, 2048, 512, 512, 512, 2048] + - [7, 10768.0] + - - [512, 3374, 1, 2048, 512, 512, 512, 2048] + - [7, 10800.0] + - - [512, 3376, 1, 2048, 512, 512, 512, 2048] + - [22, 10812.0] + - - [512, 3377, 1, 2048, 512, 512, 512, 2048] + - [36, 10764.0] + - - [512, 3378, 1, 2048, 512, 512, 512, 2048] + - [22, 10799.0] + - - [512, 3381, 1, 2048, 512, 512, 512, 2048] + - [22, 10756.0] + - - [512, 3382, 1, 2048, 512, 512, 512, 2048] + - [7, 10806.0] + - - [512, 3383, 1, 2048, 512, 512, 512, 2048] + - [36, 10718.0] + - - [512, 3384, 1, 2048, 512, 512, 512, 2048] + - [36, 10833.0] + - - [512, 3385, 1, 2048, 512, 512, 512, 2048] + - [7, 10811.0] + - - [512, 3386, 1, 2048, 512, 512, 512, 2048] + - [7, 10818.0] + - - [512, 3388, 1, 2048, 512, 512, 512, 2048] + - [7, 10763.0] + - - [512, 3390, 1, 2048, 512, 512, 512, 2048] + - [22, 10821.0] + - - [512, 3391, 1, 2048, 512, 512, 512, 2048] + - [36, 10820.0] + - - [512, 3396, 1, 2048, 512, 512, 512, 2048] + - [7, 10849.0] + - - [512, 3399, 1, 2048, 512, 512, 512, 2048] + - [7, 10868.0] + - - [512, 3402, 1, 2048, 512, 512, 512, 2048] + - [22, 10852.0] + - - [512, 3410, 1, 2048, 512, 512, 512, 2048] + - [7, 10854.0] + - - [512, 3412, 1, 2048, 512, 512, 512, 2048] + - [22, 10903.0] + - - [512, 3414, 1, 2048, 512, 512, 512, 2048] + - [22, 10898.0] + - - [512, 3415, 1, 2048, 512, 512, 512, 2048] + - [22, 10904.0] + - - [512, 3418, 1, 2048, 512, 512, 512, 2048] + - [7, 10852.0] + - - [512, 3420, 1, 2048, 512, 512, 512, 2048] + - [7, 10919.0] + - - [512, 3422, 1, 2048, 512, 512, 512, 2048] + - [7, 10917.0] + - - [512, 3425, 1, 2048, 512, 512, 512, 2048] + - [36, 10872.0] + - - [512, 3426, 1, 2048, 512, 512, 512, 2048] + - [36, 10840.0] + - - [512, 3427, 1, 2048, 512, 512, 512, 2048] + - [7, 10961.0] + - - [512, 3428, 1, 2048, 512, 512, 512, 2048] + - [7, 10936.0] + - - [512, 3430, 1, 2048, 512, 512, 512, 2048] + - [7, 10822.0] + - - [512, 3431, 1, 2048, 512, 512, 512, 2048] + - [36, 10813.0] + - - [512, 3432, 1, 2048, 512, 512, 512, 2048] + - [36, 10771.0] + - - [512, 3438, 1, 2048, 512, 512, 512, 2048] + - [7, 10982.0] + - - [512, 3440, 1, 2048, 512, 512, 512, 2048] + - [7, 10945.0] + - - [512, 3443, 1, 2048, 512, 512, 512, 2048] + - [7, 10908.0] + - - [512, 3445, 1, 2048, 512, 512, 512, 2048] + - [22, 10935.0] + - - [512, 3447, 1, 2048, 512, 512, 512, 2048] + - [22, 10990.0] + - - [512, 3448, 1, 2048, 512, 512, 512, 2048] + - [7, 10926.0] + - - [512, 3450, 1, 2048, 512, 512, 512, 2048] + - [22, 10996.0] + - - [512, 3451, 1, 2048, 512, 512, 512, 2048] + - [7, 11005.0] + - - [512, 3452, 1, 2048, 512, 512, 512, 2048] + - [7, 11042.0] + - - [512, 3453, 1, 2048, 512, 512, 512, 2048] + - [22, 10969.0] + - - [512, 3455, 1, 2048, 512, 512, 512, 2048] + - [22, 10994.0] + - - [512, 3456, 1, 2048, 512, 512, 512, 2048] + - [22, 11025.0] + - - [512, 3457, 1, 2048, 512, 512, 512, 2048] + - [22, 10904.0] + - - [512, 3458, 1, 2048, 512, 512, 512, 2048] + - [7, 10965.0] + - - [512, 3459, 1, 2048, 512, 512, 512, 2048] + - [7, 10999.0] + - - [512, 3460, 1, 2048, 512, 512, 512, 2048] + - [7, 11022.0] + - - [512, 3462, 1, 2048, 512, 512, 512, 2048] + - [7, 10966.0] + - - [512, 3466, 1, 2048, 512, 512, 512, 2048] + - [7, 10987.0] + - - [512, 3467, 1, 2048, 512, 512, 512, 2048] + - [7, 10967.0] + - - [512, 3468, 1, 2048, 512, 512, 512, 2048] + - [22, 10996.0] + - - [512, 3470, 1, 2048, 512, 512, 512, 2048] + - [7, 10999.0] + - - [512, 3471, 1, 2048, 512, 512, 512, 2048] + - [7, 11001.0] + - - [512, 3472, 1, 2048, 512, 512, 512, 2048] + - [7, 10960.0] + - - [512, 3475, 1, 2048, 512, 512, 512, 2048] + - [7, 10970.0] + - - [512, 3476, 1, 2048, 512, 512, 512, 2048] + - [7, 11014.0] + - - [512, 3477, 1, 2048, 512, 512, 512, 2048] + - [7, 11036.0] + - - [512, 3478, 1, 2048, 512, 512, 512, 2048] + - [7, 11012.0] + - - [512, 3480, 1, 2048, 512, 512, 512, 2048] + - [36, 10949.0] + - - [512, 3481, 1, 2048, 512, 512, 512, 2048] + - [7, 10986.0] + - - [512, 3483, 1, 2048, 512, 512, 512, 2048] + - [22, 11049.0] + - - [512, 3484, 1, 2048, 512, 512, 512, 2048] + - [7, 11074.0] + - - [512, 3487, 1, 2048, 512, 512, 512, 2048] + - [7, 11065.0] + - - [512, 3489, 1, 2048, 512, 512, 512, 2048] + - [22, 11032.0] + - - [512, 3490, 1, 2048, 512, 512, 512, 2048] + - [7, 11049.0] + - - [512, 3491, 1, 2048, 512, 512, 512, 2048] + - [22, 11037.0] + - - [512, 3493, 1, 2048, 512, 512, 512, 2048] + - [22, 10982.0] + - - [512, 3495, 1, 2048, 512, 512, 512, 2048] + - [7, 11079.0] + - - [512, 3497, 1, 2048, 512, 512, 512, 2048] + - [7, 11065.0] + - - [512, 3498, 1, 2048, 512, 512, 512, 2048] + - [7, 11075.0] + - - [512, 3499, 1, 2048, 512, 512, 512, 2048] + - [7, 11060.0] + - - [512, 3501, 1, 2048, 512, 512, 512, 2048] + - [7, 11100.0] + - - [512, 3503, 1, 2048, 512, 512, 512, 2048] + - [22, 11059.0] + - - [512, 3505, 1, 2048, 512, 512, 512, 2048] + - [7, 11084.0] + - - [512, 3507, 1, 2048, 512, 512, 512, 2048] + - [7, 11031.0] + - - [512, 3508, 1, 2048, 512, 512, 512, 2048] + - [7, 11073.0] + - - [512, 3509, 1, 2048, 512, 512, 512, 2048] + - [7, 11030.0] + - - [512, 3510, 1, 2048, 512, 512, 512, 2048] + - [7, 11179.0] + - - [512, 3511, 1, 2048, 512, 512, 512, 2048] + - [7, 11197.0] + - - [512, 3513, 1, 2048, 512, 512, 512, 2048] + - [22, 11200.0] + - - [512, 3514, 1, 2048, 512, 512, 512, 2048] + - [7, 11238.0] + - - [512, 3515, 1, 2048, 512, 512, 512, 2048] + - [22, 11201.0] + - - [512, 3517, 1, 2048, 512, 512, 512, 2048] + - [7, 11154.0] + - - [512, 3518, 1, 2048, 512, 512, 512, 2048] + - [7, 11204.0] + - - [512, 3519, 1, 2048, 512, 512, 512, 2048] + - [22, 11193.0] + - - [512, 3523, 1, 2048, 512, 512, 512, 2048] + - [41, 10733.0] + - - [512, 3528, 1, 2048, 512, 512, 512, 2048] + - [41, 10735.0] + - - [512, 3529, 1, 2048, 512, 512, 512, 2048] + - [23, 10729.0] + - - [512, 3531, 1, 2048, 512, 512, 512, 2048] + - [8, 10789.0] + - - [512, 3532, 1, 2048, 512, 512, 512, 2048] + - [41, 10781.0] + - - [512, 3533, 1, 2048, 512, 512, 512, 2048] + - [23, 10736.0] + - - [512, 3534, 1, 2048, 512, 512, 512, 2048] + - [8, 10743.0] + - - [512, 3538, 1, 2048, 512, 512, 512, 2048] + - [28, 10722.0] + - - [512, 3539, 1, 2048, 512, 512, 512, 2048] + - [23, 10849.0] + - - [512, 3540, 1, 2048, 512, 512, 512, 2048] + - [28, 10725.0] + - - [512, 3547, 1, 2048, 512, 512, 512, 2048] + - [37, 10696.0] + - - [512, 3548, 1, 2048, 512, 512, 512, 2048] + - [8, 10757.0] + - - [512, 3552, 1, 2048, 512, 512, 512, 2048] + - [8, 10753.0] + - - [512, 3575, 1, 2048, 512, 512, 512, 2048] + - [13, 10812.0] + - - [512, 3598, 1, 2048, 512, 512, 512, 2048] + - [28, 10951.0] + - - [512, 3599, 1, 2048, 512, 512, 512, 2048] + - [28, 10934.0] + - - [512, 3608, 1, 2048, 512, 512, 512, 2048] + - [13, 10941.0] + - - [512, 3776, 1, 512, 512, 512, 512, 512] + - [7, 10341.0] + - - [512, 3780, 1, 512, 512, 512, 512, 512] + - [36, 10411.0] + - - [512, 3780, 1, 2048, 512, 512, 512, 2048] + - [23, 11406.0] + - - [512, 3780, 1, 33708, 512, 512, 512, 33708] + - [5, 12073.0] + - - [512, 3796, 1, 512, 512, 512, 512, 512] + - [9, 10861.0] + - - [512, 3796, 1, 2048, 512, 512, 512, 2048] + - [8, 11711.0] + - - [512, 3796, 1, 33708, 512, 512, 512, 33708] + - [5, 12122.0] + - - [512, 3822, 1, 512, 512, 512, 512, 512] + - [9, 10879.0] + - - [512, 3822, 1, 2048, 512, 512, 512, 2048] + - [8, 11734.0] + - - [512, 3822, 1, 33708, 512, 512, 512, 33708] + - [5, 12199.0] + - - [512, 3835, 1, 512, 512, 512, 512, 512] + - [9, 10927.0] + - - [512, 3835, 1, 2048, 512, 512, 512, 2048] + - [8, 11746.0] + - - [512, 3840, 1, 512, 512, 512, 512, 512] + - [7, 11197.0] + - - [512, 3840, 1, 2048, 512, 512, 512, 2048] + - [28, 11648.0] + - - [512, 3840, 1, 33708, 512, 512, 512, 33708] + - [5, 12260.0] + - - [512, 3859, 1, 2048, 512, 512, 512, 2048] + - [36, 10774.0] + - - [512, 3859, 1, 33708, 512, 512, 512, 33708] + - [30, 10894.0] + - - [512, 3864, 1, 512, 512, 512, 512, 512] + - [36, 10402.0] + - - [512, 3864, 1, 2048, 512, 512, 512, 2048] + - [22, 10786.0] + - - [512, 3870, 1, 512, 512, 512, 512, 512] + - [36, 10522.0] + - - [512, 3870, 1, 2048, 512, 512, 512, 2048] + - [36, 10810.0] + - - [512, 3870, 1, 33708, 512, 512, 512, 33708] + - [30, 10913.0] + - - [512, 3876, 1, 512, 512, 512, 512, 512] + - [36, 10451.0] + - - [512, 3876, 1, 2048, 512, 512, 512, 2048] + - [22, 10827.0] + - - [512, 3876, 1, 33708, 512, 512, 512, 33708] + - [30, 10932.0] + - - [512, 3906, 1, 512, 512, 512, 512, 512] + - [36, 10513.0] + - - [512, 3906, 1, 2048, 512, 512, 512, 2048] + - [7, 10901.0] + - - [512, 3906, 1, 33708, 512, 512, 512, 33708] + - [30, 10986.0] + - - [512, 3910, 1, 512, 512, 512, 512, 512] + - [36, 10487.0] + - - [512, 3910, 1, 2048, 512, 512, 512, 2048] + - [22, 10863.0] + - - [512, 3910, 1, 33708, 512, 512, 512, 33708] + - [36, 10991.0] + - - [512, 3925, 1, 512, 512, 512, 512, 512] + - [36, 10579.0] + - - [512, 3925, 1, 33708, 512, 512, 512, 33708] + - [36, 11037.0] + - - [512, 3927, 1, 512, 512, 512, 512, 512] + - [36, 10565.0] + - - [512, 3942, 1, 512, 512, 512, 512, 512] + - [36, 10660.0] + - - [512, 3942, 1, 2048, 512, 512, 512, 2048] + - [36, 11009.0] + - - [512, 3942, 1, 33708, 512, 512, 512, 33708] + - [30, 11065.0] + - - [512, 3944, 1, 512, 512, 512, 512, 512] + - [36, 10619.0] + - - [512, 3944, 1, 33708, 512, 512, 512, 33708] + - [30, 11088.0] + - - [512, 3955, 1, 512, 512, 512, 512, 512] + - [36, 10618.0] + - - [512, 3955, 1, 33708, 512, 512, 512, 33708] + - [30, 11125.0] + - - [512, 3968, 1, 512, 512, 512, 512, 512] + - [7, 10824.0] + - - [512, 3968, 1, 2048, 512, 512, 512, 2048] + - [36, 11047.0] + - - [512, 3968, 1, 33708, 512, 512, 512, 33708] + - [30, 11153.0] + - - [512, 3969, 1, 512, 512, 512, 512, 512] + - [36, 10656.0] + - - [512, 3969, 1, 33708, 512, 512, 512, 33708] + - [30, 11158.0] + - - [512, 3976, 1, 512, 512, 512, 512, 512] + - [36, 10723.0] + - - [512, 3976, 1, 33708, 512, 512, 512, 33708] + - [30, 11155.0] + - - [512, 3977, 1, 512, 512, 512, 512, 512] + - [36, 10697.0] + - - [512, 3977, 1, 2048, 512, 512, 512, 2048] + - [22, 11093.0] + - - [512, 3977, 1, 33708, 512, 512, 512, 33708] + - [30, 11175.0] + - - [512, 3978, 1, 512, 512, 512, 512, 512] + - [36, 10788.0] + - - [512, 3978, 1, 2048, 512, 512, 512, 2048] + - [22, 11061.0] + - - [512, 3978, 1, 33708, 512, 512, 512, 33708] + - [30, 11184.0] + - - [512, 3990, 1, 512, 512, 512, 512, 512] + - [36, 10726.0] + - - [512, 3990, 1, 2048, 512, 512, 512, 2048] + - [22, 11078.0] + - - [512, 3990, 1, 33708, 512, 512, 512, 33708] + - [30, 11212.0] + - - [512, 3995, 1, 512, 512, 512, 512, 512] + - [36, 10746.0] + - - [512, 3995, 1, 2048, 512, 512, 512, 2048] + - [36, 11138.0] + - - [512, 3995, 1, 33708, 512, 512, 512, 33708] + - [30, 11230.0] + - - [512, 3996, 1, 512, 512, 512, 512, 512] + - [36, 10748.0] + - - [512, 3996, 1, 2048, 512, 512, 512, 2048] + - [36, 11140.0] + - - [512, 3996, 1, 33708, 512, 512, 512, 33708] + - [30, 11235.0] + - - [512, 3999, 1, 512, 512, 512, 512, 512] + - [36, 10710.0] + - - [512, 3999, 1, 2048, 512, 512, 512, 2048] + - [7, 11129.0] + - - [512, 3999, 1, 33708, 512, 512, 512, 33708] + - [30, 11243.0] + - - [512, 4005, 1, 512, 512, 512, 512, 512] + - [36, 10757.0] + - - [512, 4005, 1, 2048, 512, 512, 512, 2048] + - [22, 11166.0] + - - [512, 4005, 1, 33708, 512, 512, 512, 33708] + - [30, 11251.0] + - - [512, 4012, 1, 512, 512, 512, 512, 512] + - [36, 10734.0] + - - [512, 4012, 1, 2048, 512, 512, 512, 2048] + - [36, 11192.0] + - - [512, 4012, 1, 33708, 512, 512, 512, 33708] + - [30, 11275.0] + - - [512, 4020, 1, 512, 512, 512, 512, 512] + - [36, 10777.0] + - - [512, 4020, 1, 2048, 512, 512, 512, 2048] + - [7, 11213.0] + - - [512, 4020, 1, 33708, 512, 512, 512, 33708] + - [30, 11295.0] + - - [512, 4026, 1, 512, 512, 512, 512, 512] + - [36, 10754.0] + - - [512, 4026, 1, 2048, 512, 512, 512, 2048] + - [7, 11211.0] + - - [512, 4026, 1, 33708, 512, 512, 512, 33708] + - [30, 11308.0] + - - [512, 4030, 1, 512, 512, 512, 512, 512] + - [36, 10754.0] + - - [512, 4030, 1, 2048, 512, 512, 512, 2048] + - [22, 11235.0] + - - [512, 4030, 1, 33708, 512, 512, 512, 33708] + - [30, 11330.0] + - - [512, 4032, 1, 512, 512, 512, 512, 512] + - [7, 10983.0] + - - [512, 4032, 1, 2048, 512, 512, 512, 2048] + - [7, 11234.0] + - - [512, 4032, 1, 33708, 512, 512, 512, 33708] + - [30, 11337.0] + - - [512, 4050, 1, 512, 512, 512, 512, 512] + - [36, 10836.0] + - - [512, 4059, 1, 512, 512, 512, 512, 512] + - [36, 10911.0] + - - [2048, 644, 1, 512, 2048, 2048, 2048, 512] + - [16, 9787.0] + - - [2048, 668, 1, 512, 2048, 2048, 2048, 512] + - [16, 10073.0] + - - [2048, 714, 1, 512, 2048, 2048, 2048, 512] + - [30, 8864.0] + - - [2048, 720, 1, 512, 2048, 2048, 2048, 512] + - [30, 10013.0] + - - [2048, 722, 1, 512, 2048, 2048, 2048, 512] + - [30, 10078.0] + - - [2048, 781, 1, 512, 2048, 2048, 2048, 512] + - [0, 9975.0] + - - [2048, 848, 1, 512, 2048, 2048, 2048, 512] + - [0, 10007.0] + - - [2048, 872, 1, 512, 2048, 2048, 2048, 512] + - [0, 10251.0] + - - [2048, 936, 1, 512, 2048, 2048, 2048, 512] + - [30, 9928.0] + - - [2048, 980, 1, 512, 2048, 2048, 2048, 512] + - [16, 9634.0] + - - [2048, 1139, 1, 512, 2048, 2048, 2048, 512] + - [16, 9587.0] + - - [2048, 1184, 1, 512, 2048, 2048, 2048, 512] + - [33, 9736.0] + - - [2048, 1186, 1, 512, 2048, 2048, 2048, 512] + - [16, 9981.0] + - - [2048, 1279, 1, 512, 2048, 2048, 2048, 512] + - [16, 11323.0] + - - [2048, 1290, 1, 512, 2048, 2048, 2048, 512] + - [30, 10780.0] + - - [2048, 1327, 1, 512, 2048, 2048, 2048, 512] + - [30, 11049.0] + - - [2048, 1331, 1, 512, 2048, 2048, 2048, 512] + - [30, 11010.0] + - - [2048, 1341, 1, 512, 2048, 2048, 2048, 512] + - [30, 11060.0] + - - [2048, 1350, 1, 512, 2048, 2048, 2048, 512] + - [16, 10718.0] + - - [2048, 1359, 1, 512, 2048, 2048, 2048, 512] + - [16, 10784.0] + - - [2048, 1391, 1, 512, 2048, 2048, 2048, 512] + - [16, 11031.0] + - - [2048, 1424, 1, 512, 2048, 2048, 2048, 512] + - [16, 10772.0] + - - [2048, 1458, 1, 512, 2048, 2048, 2048, 512] + - [16, 10996.0] + - - [2048, 1462, 1, 512, 2048, 2048, 2048, 512] + - [16, 11010.0] + - - [2048, 1467, 1, 512, 2048, 2048, 2048, 512] + - [16, 11044.0] + - - [2048, 1472, 1, 512, 2048, 2048, 2048, 512] + - [16, 11173.0] + - - [2048, 1520, 1, 512, 2048, 2048, 2048, 512] + - [3, 11012.0] + - - [2048, 1596, 1, 512, 2048, 2048, 2048, 512] + - [16, 11295.0] + - - [2048, 1599, 1, 512, 2048, 2048, 2048, 512] + - [16, 11279.0] + - - [2048, 1615, 1, 512, 2048, 2048, 2048, 512] + - [16, 11061.0] + - - [2048, 1680, 1, 512, 2048, 2048, 2048, 512] + - [16, 10994.0] + - - [2048, 1709, 1, 512, 2048, 2048, 2048, 512] + - [0, 11164.0] + - - [2048, 1902, 1, 512, 2048, 2048, 2048, 512] + - [20, 11364.0] + - - [2048, 1917, 1, 512, 2048, 2048, 2048, 512] + - [30, 11441.0] + - - [2048, 2076, 1, 512, 2048, 2048, 2048, 512] + - [16, 11134.0] + - - [2048, 2195, 1, 512, 2048, 2048, 2048, 512] + - [16, 11347.0] + - - [2048, 2205, 1, 512, 2048, 2048, 2048, 512] + - [16, 11436.0] + - - [2048, 2418, 1, 512, 2048, 2048, 2048, 512] + - [16, 11322.0] + - - [2048, 2496, 1, 512, 2048, 2048, 2048, 512] + - [16, 11426.0] + - - [2048, 2790, 1, 512, 2048, 2048, 2048, 512] + - [3, 11532.0] + - - [2048, 2864, 1, 512, 2048, 2048, 2048, 512] + - [16, 11511.0] + - - [2048, 3092, 1, 512, 2048, 2048, 2048, 512] + - [3, 11588.0] + - - [2048, 3113, 1, 512, 2048, 2048, 2048, 512] + - [3, 11634.0] + - - [2048, 3137, 1, 512, 2048, 2048, 2048, 512] + - [3, 11693.0] + - - [2048, 3166, 1, 512, 2048, 2048, 2048, 512] + - [3, 11737.0] + - - [2048, 3194, 1, 512, 2048, 2048, 2048, 512] + - [11, 11807.0] + - - [2048, 3219, 1, 512, 2048, 2048, 2048, 512] + - [16, 11504.0] + - - [2048, 3222, 1, 512, 2048, 2048, 2048, 512] + - [16, 11527.0] + - - [2048, 3234, 1, 512, 2048, 2048, 2048, 512] + - [16, 11552.0] + - - [2048, 3237, 1, 512, 2048, 2048, 2048, 512] + - [16, 11556.0] + - - [2048, 3242, 1, 512, 2048, 2048, 2048, 512] + - [16, 11597.0] + - - [2048, 3246, 1, 512, 2048, 2048, 2048, 512] + - [3, 11596.0] + - - [2048, 3249, 1, 512, 2048, 2048, 2048, 512] + - [3, 11635.0] + - - [2048, 3251, 1, 512, 2048, 2048, 2048, 512] + - [3, 11590.0] + - - [2048, 3257, 1, 512, 2048, 2048, 2048, 512] + - [3, 11601.0] + - - [2048, 3262, 1, 512, 2048, 2048, 2048, 512] + - [16, 11612.0] + - - [2048, 3268, 1, 512, 2048, 2048, 2048, 512] + - [3, 11714.0] + - - [2048, 3282, 1, 512, 2048, 2048, 2048, 512] + - [3, 11702.0] + - - [2048, 3286, 1, 512, 2048, 2048, 2048, 512] + - [3, 11693.0] + - - [2048, 3287, 1, 512, 2048, 2048, 2048, 512] + - [3, 11719.0] + - - [2048, 3293, 1, 512, 2048, 2048, 2048, 512] + - [3, 11734.0] + - - [2048, 3297, 1, 512, 2048, 2048, 2048, 512] + - [1, 11710.0] + - - [2048, 3307, 1, 512, 2048, 2048, 2048, 512] + - [3, 11752.0] + - - [2048, 3314, 1, 512, 2048, 2048, 2048, 512] + - [3, 11763.0] + - - [2048, 3315, 1, 512, 2048, 2048, 2048, 512] + - [3, 11794.0] + - - [2048, 3319, 1, 512, 2048, 2048, 2048, 512] + - [3, 11814.0] + - - [2048, 3322, 1, 512, 2048, 2048, 2048, 512] + - [1, 11766.0] + - - [2048, 3323, 1, 512, 2048, 2048, 2048, 512] + - [3, 11778.0] + - - [2048, 3324, 1, 512, 2048, 2048, 2048, 512] + - [11, 11765.0] + - - [2048, 3325, 1, 512, 2048, 2048, 2048, 512] + - [3, 11774.0] + - - [2048, 3327, 1, 512, 2048, 2048, 2048, 512] + - [3, 11788.0] + - - [2048, 3329, 1, 512, 2048, 2048, 2048, 512] + - [3, 11447.0] + - - [2048, 3332, 1, 512, 2048, 2048, 2048, 512] + - [3, 11468.0] + - - [2048, 3336, 1, 512, 2048, 2048, 2048, 512] + - [3, 11468.0] + - - [2048, 3339, 1, 512, 2048, 2048, 2048, 512] + - [3, 11479.0] + - - [2048, 3342, 1, 512, 2048, 2048, 2048, 512] + - [3, 11496.0] + - - [2048, 3344, 1, 512, 2048, 2048, 2048, 512] + - [3, 11507.0] + - - [2048, 3358, 1, 512, 2048, 2048, 2048, 512] + - [3, 11530.0] + - - [2048, 3360, 1, 512, 2048, 2048, 2048, 512] + - [3, 11564.0] + - - [2048, 3364, 1, 512, 2048, 2048, 2048, 512] + - [3, 11570.0] + - - [2048, 3365, 1, 512, 2048, 2048, 2048, 512] + - [3, 11566.0] + - - [2048, 3369, 1, 512, 2048, 2048, 2048, 512] + - [3, 11588.0] + - - [2048, 3370, 1, 512, 2048, 2048, 2048, 512] + - [3, 11600.0] + - - [2048, 3371, 1, 512, 2048, 2048, 2048, 512] + - [1, 11582.0] + - - [2048, 3374, 1, 512, 2048, 2048, 2048, 512] + - [3, 11591.0] + - - [2048, 3376, 1, 512, 2048, 2048, 2048, 512] + - [3, 11602.0] + - - [2048, 3377, 1, 512, 2048, 2048, 2048, 512] + - [3, 11607.0] + - - [2048, 3378, 1, 512, 2048, 2048, 2048, 512] + - [3, 11615.0] + - - [2048, 3381, 1, 512, 2048, 2048, 2048, 512] + - [3, 11637.0] + - - [2048, 3382, 1, 512, 2048, 2048, 2048, 512] + - [3, 11610.0] + - - [2048, 3383, 1, 512, 2048, 2048, 2048, 512] + - [3, 11599.0] + - - [2048, 3384, 1, 512, 2048, 2048, 2048, 512] + - [3, 11642.0] + - - [2048, 3385, 1, 512, 2048, 2048, 2048, 512] + - [3, 11639.0] + - - [2048, 3386, 1, 512, 2048, 2048, 2048, 512] + - [3, 11633.0] + - - [2048, 3388, 1, 512, 2048, 2048, 2048, 512] + - [3, 11637.0] + - - [2048, 3390, 1, 512, 2048, 2048, 2048, 512] + - [1, 11646.0] + - - [2048, 3391, 1, 512, 2048, 2048, 2048, 512] + - [3, 11659.0] + - - [2048, 3396, 1, 512, 2048, 2048, 2048, 512] + - [3, 11684.0] + - - [2048, 3399, 1, 512, 2048, 2048, 2048, 512] + - [3, 11667.0] + - - [2048, 3402, 1, 512, 2048, 2048, 2048, 512] + - [3, 11692.0] + - - [2048, 3410, 1, 512, 2048, 2048, 2048, 512] + - [3, 11705.0] + - - [2048, 3412, 1, 512, 2048, 2048, 2048, 512] + - [3, 11723.0] + - - [2048, 3414, 1, 512, 2048, 2048, 2048, 512] + - [3, 11724.0] + - - [2048, 3415, 1, 512, 2048, 2048, 2048, 512] + - [3, 11738.0] + - - [2048, 3418, 1, 512, 2048, 2048, 2048, 512] + - [3, 11756.0] + - - [2048, 3420, 1, 512, 2048, 2048, 2048, 512] + - [3, 11743.0] + - - [2048, 3422, 1, 512, 2048, 2048, 2048, 512] + - [1, 11763.0] + - - [2048, 3425, 1, 512, 2048, 2048, 2048, 512] + - [1, 11781.0] + - - [2048, 3426, 1, 512, 2048, 2048, 2048, 512] + - [1, 11740.0] + - - [2048, 3427, 1, 512, 2048, 2048, 2048, 512] + - [1, 11766.0] + - - [2048, 3428, 1, 512, 2048, 2048, 2048, 512] + - [1, 11826.0] + - - [2048, 3430, 1, 512, 2048, 2048, 2048, 512] + - [1, 11802.0] + - - [2048, 3431, 1, 512, 2048, 2048, 2048, 512] + - [1, 11783.0] + - - [2048, 3432, 1, 512, 2048, 2048, 2048, 512] + - [3, 11786.0] + - - [2048, 3433, 1, 512, 2048, 2048, 2048, 512] + - [3, 11789.0] + - - [2048, 3438, 1, 512, 2048, 2048, 2048, 512] + - [3, 11786.0] + - - [2048, 3439, 1, 512, 2048, 2048, 2048, 512] + - [1, 11861.0] + - - [2048, 3440, 1, 512, 2048, 2048, 2048, 512] + - [3, 11797.0] + - - [2048, 3443, 1, 512, 2048, 2048, 2048, 512] + - [3, 11816.0] + - - [2048, 3445, 1, 512, 2048, 2048, 2048, 512] + - [3, 11813.0] + - - [2048, 3447, 1, 512, 2048, 2048, 2048, 512] + - [3, 11811.0] + - - [2048, 3448, 1, 512, 2048, 2048, 2048, 512] + - [3, 11815.0] + - - [2048, 3450, 1, 512, 2048, 2048, 2048, 512] + - [3, 11806.0] + - - [2048, 3451, 1, 512, 2048, 2048, 2048, 512] + - [1, 11823.0] + - - [2048, 3452, 1, 512, 2048, 2048, 2048, 512] + - [1, 11826.0] + - - [2048, 3453, 1, 512, 2048, 2048, 2048, 512] + - [3, 11830.0] + - - [2048, 3455, 1, 512, 2048, 2048, 2048, 512] + - [3, 11861.0] + - - [2048, 3456, 1, 512, 2048, 2048, 2048, 512] + - [3, 11951.0] + - - [2048, 3457, 1, 512, 2048, 2048, 2048, 512] + - [16, 11517.0] + - - [2048, 3458, 1, 512, 2048, 2048, 2048, 512] + - [16, 11506.0] + - - [2048, 3459, 1, 512, 2048, 2048, 2048, 512] + - [16, 11506.0] + - - [2048, 3460, 1, 512, 2048, 2048, 2048, 512] + - [16, 11534.0] + - - [2048, 3461, 1, 512, 2048, 2048, 2048, 512] + - [16, 11524.0] + - - [2048, 3462, 1, 512, 2048, 2048, 2048, 512] + - [16, 11511.0] + - - [2048, 3466, 1, 512, 2048, 2048, 2048, 512] + - [16, 11557.0] + - - [2048, 3467, 1, 512, 2048, 2048, 2048, 512] + - [16, 11551.0] + - - [2048, 3468, 1, 512, 2048, 2048, 2048, 512] + - [0, 11527.0] + - - [2048, 3470, 1, 512, 2048, 2048, 2048, 512] + - [16, 11558.0] + - - [2048, 3471, 1, 512, 2048, 2048, 2048, 512] + - [16, 11549.0] + - - [2048, 3472, 1, 512, 2048, 2048, 2048, 512] + - [16, 11549.0] + - - [2048, 3475, 1, 512, 2048, 2048, 2048, 512] + - [16, 11554.0] + - - [2048, 3476, 1, 512, 2048, 2048, 2048, 512] + - [16, 11575.0] + - - [2048, 3477, 1, 512, 2048, 2048, 2048, 512] + - [16, 11594.0] + - - [2048, 3478, 1, 512, 2048, 2048, 2048, 512] + - [16, 11575.0] + - - [2048, 3479, 1, 512, 2048, 2048, 2048, 512] + - [22, 11564.0] + - - [2048, 3480, 1, 512, 2048, 2048, 2048, 512] + - [0, 11567.0] + - - [2048, 3481, 1, 512, 2048, 2048, 2048, 512] + - [16, 11590.0] + - - [2048, 3483, 1, 512, 2048, 2048, 2048, 512] + - [16, 11585.0] + - - [2048, 3484, 1, 512, 2048, 2048, 2048, 512] + - [0, 11581.0] + - - [2048, 3487, 1, 512, 2048, 2048, 2048, 512] + - [16, 11588.0] + - - [2048, 3489, 1, 512, 2048, 2048, 2048, 512] + - [16, 11617.0] + - - [2048, 3490, 1, 512, 2048, 2048, 2048, 512] + - [16, 11626.0] + - - [2048, 3491, 1, 512, 2048, 2048, 2048, 512] + - [0, 11593.0] + - - [2048, 3493, 1, 512, 2048, 2048, 2048, 512] + - [16, 11613.0] + - - [2048, 3494, 1, 512, 2048, 2048, 2048, 512] + - [16, 11604.0] + - - [2048, 3495, 1, 512, 2048, 2048, 2048, 512] + - [16, 11616.0] + - - [2048, 3497, 1, 512, 2048, 2048, 2048, 512] + - [16, 11611.0] + - - [2048, 3498, 1, 512, 2048, 2048, 2048, 512] + - [30, 11610.0] + - - [2048, 3501, 1, 512, 2048, 2048, 2048, 512] + - [16, 11630.0] + - - [2048, 3503, 1, 512, 2048, 2048, 2048, 512] + - [16, 11627.0] + - - [2048, 3505, 1, 512, 2048, 2048, 2048, 512] + - [16, 11651.0] + - - [2048, 3507, 1, 512, 2048, 2048, 2048, 512] + - [16, 11659.0] + - - [2048, 3508, 1, 512, 2048, 2048, 2048, 512] + - [16, 11663.0] + - - [2048, 3509, 1, 512, 2048, 2048, 2048, 512] + - [16, 11682.0] + - - [2048, 3510, 1, 512, 2048, 2048, 2048, 512] + - [16, 11669.0] + - - [2048, 3511, 1, 512, 2048, 2048, 2048, 512] + - [16, 11658.0] + - - [2048, 3513, 1, 512, 2048, 2048, 2048, 512] + - [16, 11682.0] + - - [2048, 3514, 1, 512, 2048, 2048, 2048, 512] + - [16, 11652.0] + - - [2048, 3515, 1, 512, 2048, 2048, 2048, 512] + - [16, 11667.0] + - - [2048, 3517, 1, 512, 2048, 2048, 2048, 512] + - [16, 11681.0] + - - [2048, 3518, 1, 512, 2048, 2048, 2048, 512] + - [16, 11680.0] + - - [2048, 3519, 1, 512, 2048, 2048, 2048, 512] + - [16, 11685.0] + - - [2048, 3520, 1, 512, 2048, 2048, 2048, 512] + - [0, 11738.0] + - - [2048, 3523, 1, 512, 2048, 2048, 2048, 512] + - [3, 11620.0] + - - [2048, 3528, 1, 512, 2048, 2048, 2048, 512] + - [3, 11633.0] + - - [2048, 3529, 1, 512, 2048, 2048, 2048, 512] + - [3, 11646.0] + - - [2048, 3530, 1, 512, 2048, 2048, 2048, 512] + - [1, 11629.0] + - - [2048, 3531, 1, 512, 2048, 2048, 2048, 512] + - [3, 11651.0] + - - [2048, 3532, 1, 512, 2048, 2048, 2048, 512] + - [3, 11634.0] + - - [2048, 3533, 1, 512, 2048, 2048, 2048, 512] + - [3, 11638.0] + - - [2048, 3534, 1, 512, 2048, 2048, 2048, 512] + - [3, 11643.0] + - - [2048, 3538, 1, 512, 2048, 2048, 2048, 512] + - [3, 11649.0] + - - [2048, 3539, 1, 512, 2048, 2048, 2048, 512] + - [3, 11669.0] + - - [2048, 3540, 1, 512, 2048, 2048, 2048, 512] + - [3, 11669.0] + - - [2048, 3541, 1, 512, 2048, 2048, 2048, 512] + - [3, 11672.0] + - - [2048, 3547, 1, 512, 2048, 2048, 2048, 512] + - [3, 11691.0] + - - [2048, 3548, 1, 512, 2048, 2048, 2048, 512] + - [3, 11688.0] + - - [2048, 3552, 1, 512, 2048, 2048, 2048, 512] + - [3, 11692.0] + - - [2048, 3564, 1, 512, 2048, 2048, 2048, 512] + - [3, 11733.0] + - - [2048, 3575, 1, 512, 2048, 2048, 2048, 512] + - [3, 11755.0] + - - [2048, 3598, 1, 512, 2048, 2048, 2048, 512] + - [0, 11477.0] + - - [2048, 3599, 1, 512, 2048, 2048, 2048, 512] + - [16, 11468.0] + - - [2048, 3608, 1, 512, 2048, 2048, 2048, 512] + - [16, 11497.0] + - - [2048, 3776, 1, 512, 2048, 2048, 2048, 512] + - [3, 11809.0] + - - [2048, 3780, 1, 512, 2048, 2048, 2048, 512] + - [3, 11838.0] + - - [2048, 3796, 1, 512, 2048, 2048, 2048, 512] + - [3, 11849.0] + - - [2048, 3822, 1, 512, 2048, 2048, 2048, 512] + - [3, 11860.0] + - - [2048, 3835, 1, 512, 2048, 2048, 2048, 512] + - [3, 11910.0] + - - [2048, 3840, 1, 512, 2048, 2048, 2048, 512] + - [3, 12162.0] + - - [2048, 3859, 1, 512, 2048, 2048, 2048, 512] + - [1, 11713.0] + - - [2048, 3864, 1, 512, 2048, 2048, 2048, 512] + - [3, 11699.0] + - - [2048, 3870, 1, 512, 2048, 2048, 2048, 512] + - [1, 11735.0] + - - [2048, 3876, 1, 512, 2048, 2048, 2048, 512] + - [1, 11779.0] + - - [2048, 3906, 1, 512, 2048, 2048, 2048, 512] + - [3, 11815.0] + - - [2048, 3910, 1, 512, 2048, 2048, 2048, 512] + - [1, 11868.0] + - - [2048, 3925, 1, 512, 2048, 2048, 2048, 512] + - [3, 11861.0] + - - [2048, 3942, 1, 512, 2048, 2048, 2048, 512] + - [1, 11885.0] + - - [2048, 3944, 1, 512, 2048, 2048, 2048, 512] + - [3, 11889.0] + - - [2048, 3955, 1, 512, 2048, 2048, 2048, 512] + - [3, 11933.0] + - - [2048, 3968, 1, 512, 2048, 2048, 2048, 512] + - [3, 12104.0] + - - [2048, 3969, 1, 512, 2048, 2048, 2048, 512] + - [3, 11638.0] + - - [2048, 3976, 1, 512, 2048, 2048, 2048, 512] + - [1, 11620.0] + - - [2048, 3977, 1, 512, 2048, 2048, 2048, 512] + - [3, 11626.0] + - - [2048, 3978, 1, 512, 2048, 2048, 2048, 512] + - [3, 11635.0] + - - [2048, 3990, 1, 512, 2048, 2048, 2048, 512] + - [1, 11695.0] + - - [2048, 3995, 1, 512, 2048, 2048, 2048, 512] + - [1, 11682.0] + - - [2048, 3996, 1, 512, 2048, 2048, 2048, 512] + - [3, 11677.0] + - - [2048, 3999, 1, 512, 2048, 2048, 2048, 512] + - [3, 11691.0] + - - [2048, 4005, 1, 512, 2048, 2048, 2048, 512] + - [3, 11697.0] + - - [2048, 4012, 1, 512, 2048, 2048, 2048, 512] + - [3, 11735.0] + - - [2048, 4020, 1, 512, 2048, 2048, 2048, 512] + - [1, 11780.0] + - - [2048, 4026, 1, 512, 2048, 2048, 2048, 512] + - [1, 11799.0] + - - [2048, 4030, 1, 512, 2048, 2048, 2048, 512] + - [3, 11800.0] + - - [2048, 4032, 1, 512, 2048, 2048, 2048, 512] + - [3, 11789.0] + - - [1024, 4096, 1, 3072, 1024, 1024, 1024, 3072] + - [23, 12127.0] + - - [1024, 3840, 1, 1024, 1024, 1024, 1024, 1024] + - [3, 12004.0] + - - [1024, 3840, 1, 4096, 1024, 1024, 1024, 4096] + - [13, 12264.0] + - - [1024, 3968, 1, 1024, 1024, 1024, 1024, 1024] + - [7, 11580.0] + - - [1024, 3968, 1, 4096, 1024, 1024, 1024, 4096] + - [26, 11707.0] + - - [1024, 3968, 1, 42720, 1024, 1024, 1024, 42720] + - [23, 11744.0] + - - [1024, 7200, 1, 1024, 1024, 1024, 1024, 1024] + - [8, 11936.0] + - - [1024, 7200, 1, 4096, 1024, 1024, 1024, 4096] + - [26, 12112.0] + - - [1024, 7200, 1, 42720, 1024, 1024, 1024, 42720] + - [17, 12148.0] + - - [1024, 8160, 1, 1024, 1024, 1024, 1024, 1024] + - [23, 12041.0] + - - [1024, 8160, 1, 4096, 1024, 1024, 1024, 4096] + - [26, 12146.0] + - - [1024, 9520, 1, 1024, 1024, 1024, 1024, 1024] + - [8, 12230.0] + - - [1024, 9520, 1, 4096, 1024, 1024, 1024, 4096] + - [40, 12319.0] + - - [1024, 9520, 1, 42720, 1024, 1024, 1024, 42720] + - [1, 12223.0] + - - [1024, 10200, 1, 1024, 1024, 1024, 1024, 1024] + - [23, 12314.0] + - - [1024, 10200, 1, 4096, 1024, 1024, 1024, 4096] + - [26, 12365.0] + - - [4096, 3840, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12503.0] + - - [4096, 3968, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12445.0] + - - [4096, 7200, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12305.0] + - - [4096, 8160, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12447.0] + - - [4096, 9520, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12474.0] + - - [4096, 10200, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12527.0] + - - [1024, 2048, 1, 4096, 1024, 1024, 1024, 4096] + - [7, 11429.0] + - - [1024, 2048, 1, 30528, 1024, 1024, 1024, 30528] + - [7, 11467.0] + - - [1024, 4096, 1, 30528, 1024, 1024, 1024, 30528] + - [1, 12082.0] + - - [1024, 10240, 1, 256, 1024, 1024, 1024, 256] + - [3, 11814.0] + - - [1024, 10496, 1, 256, 1024, 1024, 1024, 256] + - [3, 11918.0] + - - [1024, 11008, 1, 256, 1024, 1024, 1024, 256] + - [3, 11870.0] + - - [1024, 11264, 1, 256, 1024, 1024, 1024, 256] + - [1, 11829.0] + - - [1024, 11520, 1, 256, 1024, 1024, 1024, 256] + - [3, 12063.0] + - - [1024, 12288, 1, 256, 1024, 1024, 1024, 256] + - [3, 11945.0] + - - [1024, 13312, 1, 256, 1024, 1024, 1024, 256] + - [3, 12040.0] + - - [1024, 13568, 1, 256, 1024, 1024, 1024, 256] + - [1, 11987.0] + - - [1024, 14336, 1, 256, 1024, 1024, 1024, 256] + - [3, 12089.0] + - - [1024, 14592, 1, 256, 1024, 1024, 1024, 256] + - [3, 12045.0] + - - [1024, 14848, 1, 256, 1024, 1024, 1024, 256] + - [1, 12024.0] + - - [1024, 15104, 1, 256, 1024, 1024, 1024, 256] + - [26, 12027.0] + - - [1024, 1600, 1, 1024, 1024, 1024, 1024, 1024] + - [7, 11308.0] + - - [1024, 1600, 1, 1, 1024, 1024, 1024, 1] + - [22, 175.0] + - - [1024, 16128, 1, 256, 1024, 1024, 1024, 256] + - [11, 12114.0] + - - [1024, 17152, 1, 256, 1024, 1024, 1024, 256] + - [1, 12188.0] + - - [1024, 1792, 1, 256, 1024, 1024, 1024, 256] + - [1, 10388.0] + - - [1024, 18944, 1, 256, 1024, 1024, 1024, 256] + - [11, 12104.0] + - - [1024, 19712, 1, 256, 1024, 1024, 1024, 256] + - [3, 12167.0] + - - [1024, 19968, 1, 256, 1024, 1024, 1024, 256] + - [1, 12173.0] + - - [1024, 20480, 1, 256, 1024, 1024, 1024, 256] + - [3, 12213.0] + - - [1024, 2048, 1, 256, 1024, 1024, 1024, 256] + - [16, 10864.0] + - - [1024, 20992, 1, 256, 1024, 1024, 1024, 256] + - [3, 12179.0] + - - [1024, 21504, 1, 256, 1024, 1024, 1024, 256] + - [3, 12127.0] + - - [1024, 22016, 1, 256, 1024, 1024, 1024, 256] + - [11, 12184.0] + - - [1024, 23552, 1, 256, 1024, 1024, 1024, 256] + - [3, 12176.0] + - - [1024, 2560, 1, 256, 1024, 1024, 1024, 256] + - [16, 11133.0] + - - [1024, 28672, 1, 256, 1024, 1024, 1024, 256] + - [3, 12253.0] + - - [1024, 3072, 1, 256, 1024, 1024, 1024, 256] + - [1, 10891.0] + - - [1024, 3328, 1, 256, 1024, 1024, 1024, 256] + - [0, 11193.0] + - - [1024, 33536, 1, 256, 1024, 1024, 1024, 256] + - [3, 12299.0] + - - [1024, 3840, 1, 256, 1024, 1024, 1024, 256] + - [1, 11442.0] + - - [1024, 40448, 1, 256, 1024, 1024, 1024, 256] + - [3, 12330.0] + - - [1024, 4096, 1, 256, 1024, 1024, 1024, 256] + - [1, 11272.0] + - - [1024, 4608, 1, 256, 1024, 1024, 1024, 256] + - [0, 11387.0] + - - [1024, 4864, 1, 256, 1024, 1024, 1024, 256] + - [0, 11264.0] + - - [1024, 5120, 1, 256, 1024, 1024, 1024, 256] + - [11, 11635.0] + - - [1024, 5632, 1, 256, 1024, 1024, 1024, 256] + - [1, 11495.0] + - - [1024, 6144, 1, 256, 1024, 1024, 1024, 256] + - [3, 11426.0] + - - [1024, 6400, 1, 256, 1024, 1024, 1024, 256] + - [3, 11778.0] + - - [1024, 7168, 1, 256, 1024, 1024, 1024, 256] + - [1, 11606.0] + - - [1024, 7424, 1, 256, 1024, 1024, 1024, 256] + - [3, 11579.0] + - - [1024, 7680, 1, 256, 1024, 1024, 1024, 256] + - [3, 11890.0] + - - [1024, 7936, 1, 256, 1024, 1024, 1024, 256] + - [3, 11835.0] + - - [1024, 8192, 1, 256, 1024, 1024, 1024, 256] + - [3, 11757.0] + - - [1024, 8448, 1, 256, 1024, 1024, 1024, 256] + - [3, 11740.0] + - - [1024, 8704, 1, 256, 1024, 1024, 1024, 256] + - [1, 11729.0] + - - [1024, 8960, 1, 256, 1024, 1024, 1024, 256] + - [3, 11975.0] + - - [1024, 9728, 1, 256, 1024, 1024, 1024, 256] + - [3, 11828.0] + - - [1024, 9984, 1, 256, 1024, 1024, 1024, 256] + - [3, 11804.0] + - - [2048, 1024, 1, 1, 2048, 2048, 2048, 1] + - [16, 183.0] + - - [2048, 1024, 1, 256, 2048, 2048, 2048, 256] + - [0, 10948.0] + - - [256, 8976, 1, 10240, 256, 256, 256, 10240] + - [28, 10666.0] + - - [256, 8976, 1, 10496, 256, 256, 256, 10496] + - [7, 10907.0] + - - [256, 8976, 1, 11008, 256, 256, 256, 11008] + - [13, 10839.0] + - - [256, 8976, 1, 11520, 256, 256, 256, 11520] + - [13, 10842.0] + - - [256, 8976, 1, 12288, 256, 256, 256, 12288] + - [26, 10658.0] + - - [256, 8976, 1, 14336, 256, 256, 256, 14336] + - [26, 10672.0] + - - [256, 8976, 1, 14848, 256, 256, 256, 14848] + - [28, 10850.0] + - - [256, 8976, 1, 15104, 256, 256, 256, 15104] + - [7, 10933.0] + - - [256, 8976, 1, 1536, 256, 256, 256, 1536] + - [36, 10842.0] + - - [256, 8976, 1, 15872, 256, 256, 256, 15872] + - [13, 10846.0] + - - [256, 8976, 1, 17152, 256, 256, 256, 17152] + - [7, 10956.0] + - - [256, 8976, 1, 19712, 256, 256, 256, 19712] + - [7, 10892.0] + - - [256, 8976, 1, 19968, 256, 256, 256, 19968] + - [28, 10789.0] + - - [256, 8976, 1, 20480, 256, 256, 256, 20480] + - [11, 10684.0] + - - [256, 8976, 1, 2048, 256, 256, 256, 2048] + - [36, 10884.0] + - - [256, 8976, 1, 20992, 256, 256, 256, 20992] + - [28, 10853.0] + - - [256, 8976, 1, 22016, 256, 256, 256, 22016] + - [13, 10871.0] + - - [256, 8976, 1, 2304, 256, 256, 256, 2304] + - [36, 10914.0] + - - [256, 8976, 1, 2560, 256, 256, 256, 2560] + - [36, 10954.0] + - - [256, 8976, 1, 26112, 256, 256, 256, 26112] + - [28, 10778.0] + - - [256, 8976, 1, 2816, 256, 256, 256, 2816] + - [36, 10957.0] + - - [256, 8976, 1, 3072, 256, 256, 256, 3072] + - [36, 10963.0] + - - [256, 8976, 1, 33536, 256, 256, 256, 33536] + - [13, 10880.0] + - - [256, 8976, 1, 4352, 256, 256, 256, 4352] + - [28, 10724.0] + - - [256, 8976, 1, 44505, 256, 256, 256, 44505] + - [0, 10965.0] + - - [256, 8976, 1, 4864, 256, 256, 256, 4864] + - [36, 10983.0] + - - [256, 8976, 1, 5376, 256, 256, 256, 5376] + - [36, 10957.0] + - - [256, 8976, 1, 5632, 256, 256, 256, 5632] + - [36, 10970.0] + - - [256, 8976, 1, 5888, 256, 256, 256, 5888] + - [36, 10995.0] + - - [256, 8976, 1, 6144, 256, 256, 256, 6144] + - [22, 10837.0] + - - [256, 8976, 1, 6656, 256, 256, 256, 6656] + - [36, 10954.0] + - - [256, 8976, 1, 7168, 256, 256, 256, 7168] + - [22, 10866.0] + - - [256, 8976, 1, 7424, 256, 256, 256, 7424] + - [7, 10957.0] + - - [256, 8976, 1, 8192, 256, 256, 256, 8192] + - [11, 10623.0] + - - [256, 8976, 1, 8448, 256, 256, 256, 8448] + - [7, 10986.0] + - - [256, 8976, 1, 8960, 256, 256, 256, 8960] + - [36, 10962.0] + - - [256, 8976, 1, 9472, 256, 256, 256, 9472] + - [7, 10945.0] + - - [256, 8976, 1, 9728, 256, 256, 256, 9728] + - [7, 10959.0] + - - [256, 8976, 1, 9984, 256, 256, 256, 9984] + - [7, 10993.0] + - - [3200, 1024, 1, 2048, 3200, 3200, 3200, 2048] + - [1, 12298.0] + - - [4096, 1024, 1, 1, 4096, 4096, 4096, 1] + - [7, 156.0] + - - [1024, 4096, 1, 4096, 1024, 1024, 1024, 4096] + - [40, 12064.0] + - - [1024, 3072, 1, 3072, 1024, 1024, 1024, 3072] + - [8, 11816.0] + - - [1024, 2048, 1, 3072, 1024, 1024, 1024, 3072] + - [7, 11282.0] + - - [30528, 4096, 1, 1024, 30528, 30528, 30528, 1024] + - [37, 12534.0] + - - [30528, 2048, 1, 1024, 30528, 30528, 30528, 1024] + - [23, 12576.0] + - - [512, 32768, 1, 256, 512, 512, 512, 256] + - [23, 12096.0] + - - [256, 32768, 1, 128, 256, 256, 256, 128] + - [16, 11331.0] + - - [1024, 32768, 1, 512, 1024, 1024, 1024, 512] + - [23, 12441.0] + - - [1024, 32768, 1, 1024, 1024, 1024, 1024, 1024] + - [23, 12534.0] + - - [479, 32768, 1, 1024, 479, 479, 479, 1024] + - [40, 11456.0] + - - [289, 128, 64, 768, 289, 289, 289, 768] + - [23, 8442.0] + - - [289, 160, 64, 768, 289, 289, 289, 768] + - [0, 7112.0] + - - [289, 192, 64, 768, 289, 289, 289, 768] + - [19, 8386.0] + - - [3136, 256, 64, 64, 3136, 3136, 3136, 64] + - [0, 10677.0] + - - [784, 512, 64, 128, 784, 784, 784, 128] + - [3, 10164.0] + - - [784, 128, 64, 512, 784, 784, 784, 512] + - [34, 10200.0] + - - [196, 1024, 64, 256, 196, 196, 196, 256] + - [11, 8821.0] + - - [196, 256, 64, 1024, 196, 196, 196, 1024] + - [40, 8948.0] + - - [3136, 256, 32, 64, 3136, 3136, 3136, 64] + - [0, 11047.0] + - - [784, 512, 32, 128, 784, 784, 784, 128] + - [31, 10093.0] + - - [784, 128, 32, 512, 784, 784, 784, 512] + - [7, 9822.0] + - - [196, 1024, 32, 256, 196, 196, 196, 256] + - [3, 8631.0] + - - [256, 6912, 1, 4, 256, 256, 256, 4] + - [7, 685.0] + - - [512, 4096, 1, 256, 512, 512, 512, 256] + - [16, 10846.0] + - - [1024, 4096, 1, 512, 1024, 1024, 1024, 512] + - [3, 11749.0] + - - [480, 4096, 1, 1024, 480, 480, 480, 1024] + - [22, 10535.0] + - - [512, 6912, 1, 256, 512, 512, 512, 256] + - [22, 10701.0] + - - [1024, 6912, 1, 512, 1024, 1024, 1024, 512] + - [26, 11810.0] + - - [1024, 6912, 1, 1024, 1024, 1024, 1024, 1024] + - [23, 12087.0] + - - [480, 6912, 1, 1024, 480, 480, 480, 1024] + - [8, 11061.0] + - - [256, 55296, 1, 128, 256, 256, 256, 128] + - [16, 11370.0] + - - [512, 55296, 1, 256, 512, 512, 512, 256] + - [26, 12190.0] + - - [1920, 2048, 1, 2048, 1920, 1920, 1920, 2048] + - [1, 12348.0] + - - [2880, 3072, 1, 3072, 2880, 2880, 2880, 3072] + - [8, 12068.0] + - - [3840, 4096, 1, 4096, 3840, 3840, 3840, 4096] + - [3, 12517.0] + - - [7680, 8192, 1, 8192, 7680, 7680, 7680, 8192] + - [11, 12295.0] + - - [2048, 2048, 1, 2048, 2048, 2048, 2048, 2048] + - [8, 12110.0] + - - [3072, 3072, 1, 3072, 3072, 3072, 3072, 3072] + - [3, 12386.0] + - - [4096, 4096, 1, 4096, 4096, 4096, 4096, 4096] + - [8, 12462.0] + - - [8192, 8192, 1, 8192, 8192, 8192, 8192, 8192] + - [3, 12346.0] + - - [1152, 1152, 1, 1152, 1152, 1152, 1152, 1152] + - [16, 10162.0] + - - [1536, 1536, 1, 1536, 1536, 1536, 1536, 1536] + - [30, 11174.0] + - - [1920, 1920, 1, 1920, 1920, 1920, 1920, 1920] + - [17, 11650.0] + - - [2304, 2304, 1, 2304, 2304, 2304, 2304, 2304] + - [1, 11946.0] + - - [2688, 2688, 1, 2688, 2688, 2688, 2688, 2688] + - [17, 12077.0] + - - [3456, 3456, 1, 3456, 3456, 3456, 3456, 3456] + - [17, 12445.0] + - - [3840, 3840, 1, 3840, 3840, 3840, 3840, 3840] + - [3, 12499.0] + - - [4224, 4224, 1, 4224, 4224, 4224, 4224, 4224] + - [1, 12549.0] + - - [4608, 4608, 1, 4608, 4608, 4608, 4608, 4608] + - [1, 12591.0] + - - [4992, 4992, 1, 4992, 4992, 4992, 4992, 4992] + - [31, 12514.0] + - - [5376, 5376, 1, 5376, 5376, 5376, 5376, 5376] + - [37, 12507.0] + - - [5760, 5760, 1, 5760, 5760, 5760, 5760, 5760] + - [31, 12567.0] + - - [6144, 6144, 1, 6144, 6144, 6144, 6144, 6144] + - [40, 12498.0] + - - [6528, 6528, 1, 6528, 6528, 6528, 6528, 6528] + - [17, 12469.0] + - - [6912, 6912, 1, 6912, 6912, 6912, 6912, 6912] + - [17, 12528.0] + - - [7296, 7296, 1, 7296, 7296, 7296, 7296, 7296] + - [34, 12551.0] + - - [7680, 7680, 1, 7680, 7680, 7680, 7680, 7680] + - [20, 12277.0] + - - [1152, 1152, 1, 384, 1152, 1152, 1152, 384] + - [30, 8003.0] + - - [1536, 1536, 1, 384, 1536, 1536, 1536, 384] + - [30, 9551.0] + - - [1920, 1920, 1, 384, 1920, 1920, 1920, 384] + - [30, 10279.0] + - - [2304, 2304, 1, 384, 2304, 2304, 2304, 384] + - [37, 11451.0] + - - [2688, 2688, 1, 384, 2688, 2688, 2688, 384] + - [17, 11774.0] + - - [3072, 3072, 1, 384, 3072, 3072, 3072, 384] + - [3, 12088.0] + - - [3456, 3456, 1, 384, 3456, 3456, 3456, 384] + - [17, 12222.0] + - - [3840, 3840, 1, 384, 3840, 3840, 3840, 384] + - [1, 12334.0] + - - [4224, 4224, 1, 384, 4224, 4224, 4224, 384] + - [17, 12394.0] + - - [4608, 4608, 1, 384, 4608, 4608, 4608, 384] + - [1, 12445.0] + - - [4992, 4992, 1, 384, 4992, 4992, 4992, 384] + - [8, 12406.0] + - - [5376, 5376, 1, 384, 5376, 5376, 5376, 384] + - [1, 12436.0] + - - [5760, 5760, 1, 384, 5760, 5760, 5760, 384] + - [17, 12474.0] + - - [6144, 6144, 1, 384, 6144, 6144, 6144, 384] + - [23, 12471.0] + - - [6528, 6528, 1, 384, 6528, 6528, 6528, 384] + - [37, 12518.0] + - - [6912, 6912, 1, 384, 6912, 6912, 6912, 384] + - [17, 12555.0] + - - [7296, 7296, 1, 384, 7296, 7296, 7296, 384] + - [17, 12566.0] + - - [7680, 7680, 1, 384, 7680, 7680, 7680, 384] + - [1, 12567.0] + - - [8064, 8064, 1, 384, 8064, 8064, 8064, 384] + - [17, 12584.0] + - - [8448, 8448, 1, 384, 8448, 8448, 8448, 384] + - [1, 12584.0] + - - [8832, 8832, 1, 384, 8832, 8832, 8832, 384] + - [17, 12598.0] + - - [9216, 9216, 1, 384, 9216, 9216, 9216, 384] + - [8, 12577.0] + - - [9600, 9600, 1, 384, 9600, 9600, 9600, 384] + - [1, 12611.0] + - - [9984, 9984, 1, 384, 9984, 9984, 9984, 384] + - [1, 12600.0] + - - [10368, 10368, 1, 384, 10368, 10368, 10368, 384] + - [23, 12612.0] + - - [10752, 10752, 1, 384, 10752, 10752, 10752, 384] + - [17, 12605.0] + - - [11136, 11136, 1, 384, 11136, 11136, 11136, 384] + - [17, 12622.0] + - - [11520, 11520, 1, 384, 11520, 11520, 11520, 384] + - [1, 12613.0] + - - [11904, 11904, 1, 384, 11904, 11904, 11904, 384] + - [17, 12632.0] + - - [12288, 12288, 1, 384, 12288, 12288, 12288, 384] + - [1, 12609.0] + - - [12672, 12672, 1, 384, 12672, 12672, 12672, 384] + - [17, 12631.0] + - - [13056, 13056, 1, 384, 13056, 13056, 13056, 384] + - [17, 12615.0] + - - [13440, 13440, 1, 384, 13440, 13440, 13440, 384] + - [23, 12633.0] + - - [13824, 13824, 1, 384, 13824, 13824, 13824, 384] + - [17, 12623.0] + - - [14208, 14208, 1, 384, 14208, 14208, 14208, 384] + - [23, 12631.0] + - - [14592, 14592, 1, 384, 14592, 14592, 14592, 384] + - [17, 12623.0] + - - [14976, 14976, 1, 384, 14976, 14976, 14976, 384] + - [17, 12641.0] + - - [15360, 15360, 1, 384, 15360, 15360, 15360, 384] + - [23, 12629.0] + - - [15744, 15744, 1, 384, 15744, 15744, 15744, 384] + - [17, 12636.0] + - - [16128, 16128, 1, 384, 16128, 16128, 16128, 384] + - [17, 12632.0] + - - [16512, 16512, 1, 384, 16512, 16512, 16512, 384] + - [17, 12654.0] + - - [16896, 16896, 1, 384, 16896, 16896, 16896, 384] + - [23, 12626.0] + - - [17280, 17280, 1, 384, 17280, 17280, 17280, 384] + - [17, 12653.0] + - - [17664, 17664, 1, 384, 17664, 17664, 17664, 384] + - [23, 12458.0] + - - [18048, 18048, 1, 384, 18048, 18048, 18048, 384] + - [1, 12486.0] + - - [18432, 18432, 1, 384, 18432, 18432, 18432, 384] + - [1, 12483.0] + - - [18816, 18816, 1, 384, 18816, 18816, 18816, 384] + - [17, 12506.0] + - - [19200, 19200, 1, 384, 19200, 19200, 19200, 384] + - [8, 12496.0] + - - [19584, 19584, 1, 384, 19584, 19584, 19584, 384] + - [1, 12516.0] + - - [19968, 19968, 1, 384, 19968, 19968, 19968, 384] + - [8, 12508.0] + - - [20352, 20352, 1, 384, 20352, 20352, 20352, 384] + - [17, 12520.0] + - - [20736, 20736, 1, 384, 20736, 20736, 20736, 384] + - [8, 12515.0] + - - [21120, 21120, 1, 384, 21120, 21120, 21120, 384] + - [1, 12527.0] + - - [21504, 21504, 1, 384, 21504, 21504, 21504, 384] + - [1, 12515.0] + - - [21888, 21888, 1, 384, 21888, 21888, 21888, 384] + - [23, 12523.0] + - - [22272, 22272, 1, 384, 22272, 22272, 22272, 384] + - [8, 12526.0] + - - [22656, 22656, 1, 384, 22656, 22656, 22656, 384] + - [8, 12537.0] + - - [23040, 23040, 1, 384, 23040, 23040, 23040, 384] + - [8, 12534.0] + - - [8192, 1024, 1, 1024, 8192, 8192, 8192, 1024] + - [1, 12216.0] + - - [8192, 4096, 1, 1024, 8192, 8192, 8192, 1024] + - [8, 12554.0] + - - [16384, 16384, 1, 16384, 16384, 16384, 16384, 16384] + - [13, 11918.0] + - - [1444, 256, 120, 128, 1444, 1444, 1444, 128] + - [3, 10676.0] + - - [1444, 256, 139, 128, 1444, 1444, 1444, 128] + - [0, 10743.0] + - - [1444, 256, 160, 128, 1444, 1444, 1444, 128] + - [0, 10759.0] + - - [1444, 256, 18, 128, 1444, 1444, 1444, 128] + - [16, 10525.0] + - - [1444, 256, 19, 128, 1444, 1444, 1444, 128] + - [30, 10665.0] + - - [1444, 256, 120, 256, 1444, 1444, 1444, 256] + - [20, 11456.0] + - - [1444, 256, 139, 256, 1444, 1444, 1444, 256] + - [20, 11483.0] + - - [1444, 256, 160, 256, 1444, 1444, 1444, 256] + - [34, 11488.0] + - - [1444, 256, 18, 256, 1444, 1444, 1444, 256] + - [17, 10910.0] + - - [1444, 256, 19, 256, 1444, 1444, 1444, 256] + - [1, 11051.0] + - - [361, 256, 120, 512, 361, 361, 361, 512] + - [26, 10995.0] + - - [361, 256, 139, 512, 361, 361, 361, 512] + - [11, 11107.0] + - - [361, 256, 160, 512, 361, 361, 361, 512] + - [40, 11105.0] + - - [361, 256, 18, 512, 361, 361, 361, 512] + - [19, 9720.0] + - - [361, 256, 19, 512, 361, 361, 361, 512] + - [38, 9985.0] + - - [173280, 128, 1, 64, 173280, 173280, 173280, 64] + - [8, 8137.0] + - - [200716, 128, 1, 64, 200716, 200716, 200716, 64] + - [19, 6774.0] + - - [231040, 128, 1, 64, 231040, 231040, 231040, 64] + - [33, 7972.0] + - - [25992, 128, 1, 64, 25992, 25992, 25992, 64] + - [7, 8517.0] + - - [27436, 128, 1, 64, 27436, 27436, 27436, 64] + - [30, 9420.0] + - - [8192, 7680, 1, 8192, 8192, 8192, 8192, 8192] + - [3, 12313.0] + - - [4096, 3840, 1, 4096, 4096, 4096, 4096, 4096] + - [1, 12586.0] + - - [2048, 1920, 1, 2048, 2048, 2048, 2048, 2048] + - [1, 12221.0] + - - [1024, 1280, 1, 2, 1024, 1024, 1024, 2] + - [7, 315.0] + - - [1024, 1280, 1, 4096, 1024, 1024, 1024, 4096] + - [41, 11760.0] + - - [4096, 1280, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12216.0] + - - [1024, 4992, 1, 2, 1024, 1024, 1024, 2] + - [12, 340.0] + - - [1024, 4992, 1, 4096, 1024, 1024, 1024, 4096] + - [26, 11997.0] + - - [4096, 4992, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12403.0] + - - [1024, 5120, 1, 2, 1024, 1024, 1024, 2] + - [0, 556.0] + - - [1024, 5120, 1, 1024, 1024, 1024, 1024, 1024] + - [8, 12237.0] + - - [1024, 5120, 1, 4096, 1024, 1024, 1024, 4096] + - [23, 12384.0] + - - [4096, 5120, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12607.0] + - - [1024, 5248, 1, 2, 1024, 1024, 1024, 2] + - [0, 528.0] + - - [1024, 5248, 1, 1024, 1024, 1024, 1024, 1024] + - [8, 11907.0] + - - [1024, 5248, 1, 4096, 1024, 1024, 1024, 4096] + - [3, 11887.0] + - - [4096, 5248, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12467.0] + - - [1024, 2560, 1, 2, 1024, 1024, 1024, 2] + - [0, 388.0] + - - [1024, 2560, 1, 4096, 1024, 1024, 1024, 4096] + - [8, 12182.0] + - - [4096, 2560, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12463.0] + - - [1024, 1152, 1, 2, 1024, 1024, 1024, 2] + - [7, 324.0] + - - [1024, 1152, 1, 4096, 1024, 1024, 1024, 4096] + - [5, 10561.0] + - - [4096, 1152, 1, 1024, 4096, 4096, 4096, 1024] + - [16, 11675.0] + - - [1024, 8192, 1, 1024, 1024, 1024, 1024, 1024] + - [23, 12182.0] + - - [1024, 8192, 1, 4096, 1024, 1024, 1024, 4096] + - [26, 12212.0] + - - [1024, 8192, 1, 33712, 1024, 1024, 1024, 33712] + - [17, 12239.0] + - - [1024, 9600, 1, 1024, 1024, 1024, 1024, 1024] + - [3, 12289.0] + - - [1024, 9600, 1, 4096, 1024, 1024, 1024, 4096] + - [26, 12376.0] + - - [1024, 9600, 1, 33712, 1024, 1024, 1024, 33712] + - [17, 12465.0] + - - [4096, 8192, 1, 1024, 4096, 4096, 4096, 1024] + - [3, 12444.0] + - - [4096, 9600, 1, 1024, 4096, 4096, 4096, 1024] + - [3, 12510.0] + - - [1024, 10064, 1, 1024, 1024, 1024, 1024, 1024] + - [23, 12174.0] + - - [1024, 10064, 1, 4096, 1024, 1024, 1024, 4096] + - [23, 12362.0] + - - [1024, 10080, 1, 4096, 1024, 1024, 1024, 4096] + - [23, 12377.0] + - - [1024, 10080, 1, 42720, 1024, 1024, 1024, 42720] + - [8, 12162.0] + - - [1024, 6528, 1, 1024, 1024, 1024, 1024, 1024] + - [8, 11926.0] + - - [1024, 6528, 1, 4096, 1024, 1024, 1024, 4096] + - [26, 12050.0] + - - [1024, 6528, 1, 42720, 1024, 1024, 1024, 42720] + - [17, 12055.0] + - - [1024, 7104, 1, 1024, 1024, 1024, 1024, 1024] + - [23, 11845.0] + - - [1024, 7104, 1, 4096, 1024, 1024, 1024, 4096] + - [26, 11974.0] + - - [1024, 7104, 1, 42720, 1024, 1024, 1024, 42720] + - [17, 12000.0] + - - [1024, 8064, 1, 1024, 1024, 1024, 1024, 1024] + - [8, 12000.0] + - - [1024, 8064, 1, 4096, 1024, 1024, 1024, 4096] + - [11, 12077.0] + - - [1024, 9216, 1, 1024, 1024, 1024, 1024, 1024] + - [23, 12288.0] + - - [1024, 9216, 1, 4096, 1024, 1024, 1024, 4096] + - [26, 12339.0] + - - [4096, 10064, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12488.0] + - - [4096, 10080, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12483.0] + - - [4096, 6528, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12495.0] + - - [4096, 7104, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12397.0] + - - [4096, 8064, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12551.0] + - - [4096, 9216, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12509.0] + - - [480, 32768, 1, 1024, 480, 480, 480, 1024] + - [37, 11500.0] + - - [2048, 960, 1, 2048, 2048, 2048, 2048, 2048] + - [7, 11567.0] + - - [2048, 1024, 1, 30592, 2048, 2048, 2048, 30592] + - [33, 11099.0] + - - [2048, 1024, 1, 6144, 2048, 2048, 2048, 6144] + - [7, 11458.0] + - - [2048, 1024, 1, 8192, 2048, 2048, 2048, 8192] + - [22, 11313.0] + - - [8192, 1024, 1, 2048, 8192, 8192, 8192, 2048] + - [8, 12256.0] + - - [1024, 8192, 1, 30592, 1024, 1024, 1024, 30592] + - [5, 12207.0] + - - [1024, 8192, 1, 3072, 1024, 1024, 1024, 3072] + - [26, 12169.0] + - - [512, 512, 256, 64, 512, 512, 512, 64] + - [14, 7741.0] + - - [1024, 2048, 1, 30592, 1024, 1024, 1024, 30592] + - [33, 11083.0] + - - [1024, 4096, 1, 30592, 1024, 1024, 1024, 30592] + - [20, 11978.0] + - - [512, 512, 128, 64, 512, 512, 512, 64] + - [0, 9590.0] + - - [2560, 2048, 1, 1920, 2560, 2560, 2560, 1920] + - [1, 12433.0] + - - [2560, 2048, 1, 2560, 2560, 2560, 2560, 2560] + - [1, 12464.0] + - - [2560, 2048, 1, 7680, 2560, 2560, 2560, 7680] + - [28, 12456.0] + - - [640, 2048, 1, 2560, 640, 640, 640, 2560] + - [5, 11822.0] + - - [512, 512, 40, 64, 512, 512, 512, 64] + - [16, 10983.0] + - - [1536, 4096, 1, 1536, 1536, 1536, 1536, 1536] + - [1, 11965.0] + - - [1536, 4096, 1, 4608, 1536, 1536, 1536, 4608] + - [23, 11947.0] + - - [1536, 4096, 1, 50304, 1536, 1536, 1536, 50304] + - [5, 11930.0] + - - [1536, 4096, 1, 6144, 1536, 1536, 1536, 6144] + - [41, 11954.0] + - - [6144, 4096, 1, 1536, 6144, 6144, 6144, 1536] + - [1, 12592.0] + - - [1024, 1024, 64, 96, 1024, 1024, 1024, 96] + - [11, 11395.0] + - - [1536, 8192, 1, 1536, 1536, 1536, 1536, 1536] + - [23, 12403.0] + - - [1536, 8192, 1, 4608, 1536, 1536, 1536, 4608] + - [1, 12444.0] + - - [1536, 8192, 1, 50304, 1536, 1536, 1536, 50304] + - [3, 12364.0] + - - [1536, 8192, 1, 6144, 1536, 1536, 1536, 6144] + - [37, 12415.0] + - - [6144, 8192, 1, 1536, 6144, 6144, 6144, 1536] + - [1, 12677.0] + - - [1024, 1024, 128, 96, 1024, 1024, 1024, 96] + - [11, 11511.0] + - - [1024, 16384, 1, 1024, 1024, 1024, 1024, 1024] + - [23, 12372.0] + - - [1024, 16384, 1, 3072, 1024, 1024, 1024, 3072] + - [8, 12467.0] + - - [1024, 16384, 1, 4096, 1024, 1024, 1024, 4096] + - [8, 12460.0] + - - [1024, 16384, 1, 50304, 1024, 1024, 1024, 50304] + - [5, 12264.0] + - - [4096, 16384, 1, 1024, 4096, 4096, 4096, 1024] + - [3, 12537.0] + - - [1024, 1024, 256, 64, 1024, 1024, 1024, 64] + - [25, 8901.0] + - - [1024, 2048, 1, 50304, 1024, 1024, 1024, 50304] + - [20, 11091.0] + - - [1024, 1024, 32, 64, 1024, 1024, 1024, 64] + - [0, 10243.0] + - - [1024, 4096, 1, 50304, 1024, 1024, 1024, 50304] + - [3, 12057.0] + - - [1024, 1024, 64, 64, 1024, 1024, 1024, 64] + - [10, 8886.0] + - - [1024, 8192, 1, 50304, 1024, 1024, 1024, 50304] + - [3, 12060.0] + - - [1024, 1024, 128, 64, 1024, 1024, 1024, 64] + - [10, 8893.0] + - - [128, 128, 1024, 64, 128, 128, 128, 64] + - [29, 6058.0] + - - [1024, 8192, 1, 30528, 1024, 1024, 1024, 30528] + - [13, 12286.0] + - - [1024, 3456, 1, 1024, 1024, 1024, 1024, 1024] + - [3, 11631.0] + - - [1024, 3456, 1, 512, 1024, 1024, 1024, 512] + - [3, 11579.0] + - - [256, 6912, 1, 128, 256, 256, 256, 128] + - [0, 9951.0] + - - [480, 3456, 1, 1024, 480, 480, 480, 1024] + - [22, 10400.0] + - - [512, 3456, 1, 256, 512, 512, 512, 256] + - [22, 9899.0] + - - [1024, 1280, 1, 30528, 1024, 1024, 1024, 30528] + - [5, 12143.0] + - - [1024, 1600, 1, 30528, 1024, 1024, 1024, 30528] + - [0, 11516.0] + - - [1024, 10240, 1, 1024, 1024, 1024, 1024, 1024] + - [23, 12445.0] + - - [1024, 10240, 1, 4096, 1024, 1024, 1024, 4096] + - [23, 12521.0] + - - [4096, 10240, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12647.0] + - - [128, 128, 1280, 64, 128, 128, 128, 64] + - [0, 7484.0] + - - [1024, 1640, 1, 30528, 1024, 1024, 1024, 30528] + - [0, 10800.0] + - - [1024, 10496, 1, 1024, 1024, 1024, 1024, 1024] + - [8, 12366.0] + - - [1024, 10496, 1, 4096, 1024, 1024, 1024, 4096] + - [8, 12445.0] + - - [4096, 10496, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12578.0] + - - [128, 128, 1312, 64, 128, 128, 128, 64] + - [0, 7335.0] + - - [1024, 6144, 1, 4096, 1024, 1024, 1024, 4096] + - [8, 12031.0] + - - [4096, 6144, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12575.0] + - - [1024, 6144, 1, 1024, 1024, 1024, 1024, 1024] + - [8, 11910.0] + - - [512, 512, 192, 64, 512, 512, 512, 64] + - [0, 11218.0] + - - [256, 6912, 1, 1, 256, 256, 256, 1] + - [1, 213.0] + - - [3136, 128, 64, 64, 3136, 3136, 3136, 64] + - [33, 8075.0] + - - [3136, 256, 64, 128, 3136, 3136, 3136, 128] + - [3, 11592.0] + - - [784, 512, 64, 256, 784, 784, 784, 256] + - [3, 10623.0] + - - [3136, 128, 64, 256, 3136, 3136, 3136, 256] + - [3, 11808.0] + - - [3136, 256, 64, 256, 3136, 3136, 3136, 256] + - [34, 12013.0] + - - [196, 1024, 64, 512, 196, 196, 196, 512] + - [40, 9171.0] + - - [784, 256, 64, 512, 784, 784, 784, 512] + - [20, 10639.0] + - - [784, 512, 64, 512, 784, 784, 784, 512] + - [3, 10772.0] + - - [196, 512, 64, 1024, 196, 196, 196, 1024] + - [40, 9143.0] + - - [196, 1024, 64, 1024, 196, 196, 196, 1024] + - [11, 9294.0] + - - [3136, 128, 32, 64, 3136, 3136, 3136, 64] + - [16, 10769.0] + - - [3136, 256, 32, 128, 3136, 3136, 3136, 128] + - [0, 11303.0] + - - [784, 512, 32, 256, 784, 784, 784, 256] + - [31, 10498.0] + - - [3136, 128, 32, 256, 3136, 3136, 3136, 256] + - [20, 11605.0] + - - [3136, 256, 32, 256, 3136, 3136, 3136, 256] + - [3, 11835.0] + - - [196, 1024, 32, 512, 196, 196, 196, 512] + - [11, 9021.0] + - - [784, 256, 32, 512, 784, 784, 784, 512] + - [37, 10265.0] + - - [784, 512, 32, 512, 784, 784, 784, 512] + - [20, 10641.0] + - - [196, 512, 32, 1024, 196, 196, 196, 1024] + - [11, 8993.0] + - - [196, 1024, 32, 1024, 196, 196, 196, 1024] + - [40, 9170.0] + - - [1024, 10224, 1, 1024, 1024, 1024, 1024, 1024] + - [23, 12417.0] + - - [1024, 10192, 1, 1024, 1024, 1024, 1024, 1024] + - [23, 12376.0] + - - [1024, 10208, 1, 1024, 1024, 1024, 1024, 1024] + - [8, 12400.0] + - - [1024, 10224, 1, 4096, 1024, 1024, 1024, 4096] + - [23, 12494.0] + - - [1024, 10224, 1, 3072, 1024, 1024, 1024, 3072] + - [23, 12471.0] + - - [4096, 10224, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12622.0] + - - [1024, 10240, 1, 3072, 1024, 1024, 1024, 3072] + - [23, 12495.0] + - - [1024, 10192, 1, 3072, 1024, 1024, 1024, 3072] + - [23, 12451.0] + - - [4096, 10192, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12594.0] + - - [1024, 10192, 1, 4096, 1024, 1024, 1024, 4096] + - [23, 12472.0] + - - [1024, 10200, 1, 3072, 1024, 1024, 1024, 3072] + - [8, 12446.0] + - - [1024, 10184, 1, 1024, 1024, 1024, 1024, 1024] + - [23, 12362.0] + - - [4096, 10208, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12611.0] + - - [1024, 10208, 1, 3072, 1024, 1024, 1024, 3072] + - [8, 12468.0] + - - [1024, 10208, 1, 4096, 1024, 1024, 1024, 4096] + - [23, 12480.0] + - - [1024, 10224, 1, 2048, 1024, 1024, 1024, 2048] + - [23, 12455.0] + - - [1024, 10240, 1, 2048, 1024, 1024, 1024, 2048] + - [8, 12482.0] + - - [1024, 10120, 1, 1024, 1024, 1024, 1024, 1024] + - [23, 12299.0] + - - [1024, 10192, 1, 2048, 1024, 1024, 1024, 2048] + - [8, 12449.0] + - - [1024, 10152, 1, 1024, 1024, 1024, 1024, 1024] + - [23, 12349.0] + - - [1024, 10080, 1, 3072, 1024, 1024, 1024, 3072] + - [23, 12359.0] + - - [100352, 512, 1, 256, 100352, 100352, 100352, 256] + - [34, 12375.0] + - - [12544, 2048, 1, 1024, 12544, 12544, 12544, 1024] + - [1, 12572.0] + - - [200704, 512, 1, 256, 200704, 200704, 200704, 256] + - [20, 12430.0] + - - [25088, 1024, 1, 512, 25088, 25088, 25088, 512] + - [8, 12464.0] + - - [50176, 1024, 1, 512, 50176, 50176, 50176, 512] + - [20, 12493.0] + - - [6272, 2048, 1, 1024, 6272, 6272, 6272, 1024] + - [1, 12376.0] + - - [196, 1024, 128, 256, 196, 196, 196, 256] + - [11, 9025.0] + - - [196, 1024, 256, 256, 196, 196, 196, 256] + - [3, 9143.0] + - - [196, 256, 128, 1024, 196, 196, 196, 1024] + - [40, 9101.0] + - - [196, 256, 256, 1024, 196, 196, 196, 1024] + - [40, 9234.0] + - - [196, 512, 128, 1024, 196, 196, 196, 1024] + - [26, 9298.0] + - - [196, 512, 256, 1024, 196, 196, 196, 1024] + - [40, 9428.0] + - - [3136, 128, 128, 256, 3136, 3136, 3136, 256] + - [3, 12039.0] + - - [3136, 128, 256, 256, 3136, 3136, 3136, 256] + - [34, 12093.0] + - - [784, 256, 128, 512, 784, 784, 784, 512] + - [3, 10801.0] + - - [784, 256, 256, 512, 784, 784, 784, 512] + - [3, 10857.0] + - - [128, 128, 2048, 64, 128, 128, 128, 64] + - [0, 6029.0] + - - [1024, 2560, 1, 30528, 1024, 1024, 1024, 30528] + - [5, 12360.0] + - - [128, 128, 1536, 64, 128, 128, 128, 64] + - [0, 6790.0] + - - [1024, 12288, 1, 4096, 1024, 1024, 1024, 4096] + - [23, 12417.0] + - - [1024, 12288, 1, 1024, 1024, 1024, 1024, 1024] + - [23, 12332.0] + - - [4096, 12288, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12641.0] + - - [1024, 1920, 1, 30528, 1024, 1024, 1024, 30528] + - [8, 11994.0] + - - [128, 128, 192, 64, 128, 128, 128, 64] + - [18, 8538.0] + - - [384, 384, 144, 64, 384, 384, 384, 64] + - [0, 11328.0] + - - [768, 4608, 1, 2, 768, 768, 768, 2] + - [0, 532.0] + - - [3072, 4608, 1, 768, 3072, 3072, 3072, 768] + - [8, 12323.0] + - - [768, 4608, 1, 3072, 768, 768, 768, 3072] + - [37, 12040.0] + - - [768, 4608, 1, 768, 768, 768, 768, 768] + - [1, 11789.0] + - - [512, 512, 48, 64, 512, 512, 512, 64] + - [3, 8906.0] + - - [128, 128, 256, 64, 128, 128, 128, 64] + - [12, 9026.0] + - - [384, 384, 192, 64, 384, 384, 384, 64] + - [0, 8165.0] + - - [1024, 4608, 1, 2, 1024, 1024, 1024, 2] + - [7, 480.0] + - - [4096, 4608, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12539.0] + - - [1024, 4608, 1, 4096, 1024, 1024, 1024, 4096] + - [8, 11891.0] + - - [1024, 4608, 1, 1024, 1024, 1024, 1024, 1024] + - [8, 11672.0] + - - [3072, 256, 2, 1024, 3072, 3072, 3072, 1024] + - [1, 11103.0] + - - [2852, 256, 2, 1024, 2852, 2852, 2852, 1024] + - [18, 10207.0] + - - [3220, 256, 2, 1024, 3220, 3220, 3220, 1024] + - [16, 10101.0] + - - [850, 2048, 2, 512, 850, 850, 850, 512] + - [16, 10031.0] + - - [768, 2048, 2, 512, 768, 768, 768, 512] + - [0, 10616.0] + - - [2904, 256, 2, 1024, 2904, 2904, 2904, 1024] + - [18, 10157.0] + - - [805, 2048, 2, 512, 805, 805, 805, 512] + - [16, 9779.0] + - - [864, 2048, 2, 512, 864, 864, 864, 512] + - [16, 10575.0] + - - [2992, 256, 2, 1024, 2992, 2992, 2992, 1024] + - [32, 10409.0] + - - [3400, 256, 2, 1024, 3400, 3400, 3400, 1024] + - [30, 10618.0] + - - [4032, 256, 2, 1024, 4032, 4032, 4032, 1024] + - [30, 10815.0] + - - [15200, 128, 2, 512, 15200, 15200, 15200, 512] + - [34, 11386.0] + - - [12288, 128, 2, 512, 12288, 12288, 12288, 512] + - [37, 11442.0] + - - [888, 2048, 2, 512, 888, 888, 888, 512] + - [16, 11157.0] + - - [13600, 128, 2, 512, 13600, 13600, 13600, 512] + - [37, 11348.0] + - - [12880, 128, 2, 512, 12880, 12880, 12880, 512] + - [16, 10972.0] + - - [3456, 256, 2, 1024, 3456, 3456, 3456, 1024] + - [30, 11231.0] + - - [2944, 256, 2, 1024, 2944, 2944, 2944, 1024] + - [40, 10405.0] + - - [2688, 256, 2, 1024, 2688, 2688, 2688, 1024] + - [16, 10242.0] + - - [13824, 128, 2, 512, 13824, 13824, 13824, 512] + - [20, 11630.0] + - - [3036, 256, 2, 1024, 3036, 3036, 3036, 1024] + - [1, 10865.0] + - - [3168, 256, 2, 1024, 3168, 3168, 3168, 1024] + - [2, 10887.0] + - - [3360, 256, 2, 1024, 3360, 3360, 3360, 1024] + - [30, 10498.0] + - - [3552, 256, 2, 1024, 3552, 3552, 3552, 1024] + - [38, 10526.0] + - - [11616, 128, 2, 512, 11616, 11616, 11616, 512] + - [30, 10578.0] + - - [4200, 256, 2, 1024, 4200, 4200, 4200, 1024] + - [18, 10833.0] + - - [840, 2048, 2, 512, 840, 840, 840, 512] + - [16, 10584.0] + - - [14208, 128, 2, 512, 14208, 14208, 14208, 512] + - [30, 11183.0] + - - [11968, 128, 2, 512, 11968, 11968, 11968, 512] + - [30, 11197.0] + - - [3264, 256, 2, 1024, 3264, 3264, 3264, 1024] + - [16, 10518.0] + - - [713, 2048, 2, 512, 713, 713, 713, 512] + - [30, 8811.0] + - - [13600, 256, 2, 512, 13600, 13600, 13600, 512] + - [8, 11619.0] + - - [12880, 256, 2, 512, 12880, 12880, 12880, 512] + - [1, 11584.0] + - - [12288, 256, 2, 512, 12288, 12288, 12288, 512] + - [1, 11756.0] + - - [2816, 256, 2, 1024, 2816, 2816, 2816, 1024] + - [30, 10966.0] + - - [850, 2048, 1, 512, 850, 850, 850, 512] + - [16, 9945.0] + - - [660, 2048, 2, 512, 660, 660, 660, 512] + - [24, 9531.0] + - - [672, 2048, 2, 512, 672, 672, 672, 512] + - [23, 9766.0] + - - [13440, 128, 2, 512, 13440, 13440, 13440, 512] + - [16, 11561.0] + - - [726, 2048, 2, 512, 726, 726, 726, 512] + - [16, 10458.0] + - - [3500, 256, 2, 1024, 3500, 3500, 3500, 1024] + - [1, 10616.0] + - - [13824, 256, 2, 512, 13824, 13824, 13824, 512] + - [3, 11899.0] + - - [15200, 256, 2, 512, 15200, 15200, 15200, 512] + - [40, 11871.0] + - - [3700, 256, 2, 1024, 3700, 3700, 3700, 1024] + - [32, 11118.0] + - - [748, 2048, 2, 512, 748, 748, 748, 512] + - [3, 10809.0] + - - [3600, 256, 2, 1024, 3600, 3600, 3600, 1024] + - [9, 10891.0] + - - [4032, 1024, 2, 256, 4032, 4032, 4032, 256] + - [1, 11618.0] + - - [16128, 128, 2, 512, 16128, 16128, 16128, 512] + - [34, 11455.0] + - - [15200, 128, 1, 512, 15200, 15200, 15200, 512] + - [24, 11068.0] + - - [13600, 128, 1, 512, 13600, 13600, 13600, 512] + - [36, 10638.0] + - - [2904, 1024, 2, 256, 2904, 2904, 2904, 256] + - [30, 11320.0] + - - [2992, 1024, 2, 256, 2992, 2992, 2992, 256] + - [17, 11152.0] + - - [1536, 2048, 1, 1024, 1536, 1536, 1536, 1024] + - [8, 11534.0] + - - [24576, 128, 1, 256, 24576, 24576, 24576, 256] + - [16, 10948.0] + - - [24576, 512, 1, 256, 24576, 24576, 24576, 256] + - [3, 11915.0] + - - [25760, 128, 1, 256, 25760, 25760, 25760, 256] + - [7, 10575.0] + - - [25760, 512, 1, 256, 25760, 25760, 25760, 256] + - [8, 11990.0] + - - [6144, 256, 1, 512, 6144, 6144, 6144, 512] + - [18, 10921.0] + - - [6440, 256, 1, 512, 6440, 6440, 6440, 512] + - [16, 10124.0] + - - [3036, 1024, 2, 256, 3036, 3036, 3036, 256] + - [16, 11275.0] + - - [13600, 512, 1, 128, 13600, 13600, 13600, 128] + - [0, 11028.0] + - - [9408, 512, 2, 128, 9408, 9408, 9408, 128] + - [30, 11315.0] + - - [56000, 256, 2, 64, 56000, 56000, 56000, 64] + - [0, 11320.0] + - - [2852, 1024, 2, 256, 2852, 2852, 2852, 256] + - [16, 11145.0] + - - [2816, 1024, 2, 256, 2816, 2816, 2816, 256] + - [8, 11704.0] + - - [60800, 256, 1, 64, 60800, 60800, 60800, 64] + - [22, 11421.0] + - - [2944, 1024, 2, 256, 2944, 2944, 2944, 256] + - [37, 11683.0] + - - [11776, 512, 2, 128, 11776, 11776, 11776, 128] + - [8, 11683.0] + - - [11616, 512, 2, 128, 11616, 11616, 11616, 128] + - [1, 11621.0] + - - [4200, 1024, 2, 256, 4200, 4200, 4200, 256] + - [1, 11764.0] + - - [54400, 256, 1, 64, 54400, 54400, 54400, 64] + - [30, 11409.0] + - - [15200, 512, 1, 128, 15200, 15200, 15200, 128] + - [0, 11361.0] + - - [2688, 1024, 2, 256, 2688, 2688, 2688, 256] + - [1, 11857.0] + - - [12672, 512, 2, 128, 12672, 12672, 12672, 128] + - [23, 11832.0] + - - [11968, 512, 2, 128, 11968, 11968, 11968, 128] + - [0, 11467.0] + - - [46464, 256, 2, 64, 46464, 46464, 46464, 64] + - [0, 11305.0] + - - [2400, 256, 2, 1024, 2400, 2400, 2400, 1024] + - [38, 10691.0] + - - [2520, 256, 2, 1024, 2520, 2520, 2520, 1024] + - [35, 11204.0] + - - [2400, 1024, 2, 256, 2400, 2400, 2400, 256] + - [0, 11195.0] + - - [10752, 128, 2, 512, 10752, 10752, 10752, 512] + - [30, 11338.0] + - - [45632, 256, 2, 64, 45632, 45632, 45632, 64] + - [16, 10996.0] + - - [2520, 1024, 2, 256, 2520, 2520, 2520, 256] + - [37, 11409.0] + - - [53760, 256, 2, 64, 53760, 53760, 53760, 64] + - [0, 11432.0] + - - [2352, 256, 2, 1024, 2352, 2352, 2352, 1024] + - [18, 10498.0] + - - [47872, 256, 2, 64, 47872, 47872, 47872, 64] + - [1, 10679.0] + - - [47104, 256, 2, 64, 47104, 47104, 47104, 64] + - [16, 10432.0] + - - [50688, 256, 2, 64, 50688, 50688, 50688, 64] + - [0, 9521.0] + - - [45056, 256, 2, 64, 45056, 45056, 45056, 64] + - [16, 10821.0] + - - [13440, 512, 2, 128, 13440, 13440, 13440, 128] + - [37, 11864.0] + - - [2352, 1024, 2, 256, 2352, 2352, 2352, 256] + - [0, 10979.0] + - - [11264, 512, 2, 128, 11264, 11264, 11264, 128] + - [8, 11581.0] + - - [10560, 128, 2, 512, 10560, 10560, 10560, 512] + - [30, 11043.0] + - - [16128, 512, 2, 128, 16128, 16128, 16128, 128] + - [37, 11781.0] + - - [37632, 256, 2, 64, 37632, 37632, 37632, 64] + - [36, 11424.0] + - - [51520, 256, 2, 64, 51520, 51520, 51520, 64] + - [0, 11142.0] + - - [14000, 512, 2, 128, 14000, 14000, 14000, 128] + - [1, 11605.0] + - - [10560, 512, 2, 128, 10560, 10560, 10560, 128] + - [0, 11422.0] + - - [64512, 256, 2, 64, 64512, 64512, 64512, 64] + - [0, 11191.0] + - - [54400, 256, 2, 64, 54400, 54400, 54400, 64] + - [8, 8881.0] + - - [3264, 1024, 2, 256, 3264, 3264, 3264, 256] + - [8, 11547.0] + - - [10752, 512, 2, 128, 10752, 10752, 10752, 128] + - [8, 11632.0] + - - [3168, 1024, 2, 256, 3168, 3168, 3168, 256] + - [1, 11837.0] + - - [950, 2048, 1, 512, 950, 950, 950, 512] + - [16, 10385.0] + - - [55296, 256, 2, 256, 55296, 55296, 55296, 256] + - [26, 12214.0] + - - [51520, 256, 2, 256, 51520, 51520, 51520, 256] + - [34, 12112.0] + - - [11408, 128, 2, 512, 11408, 11408, 11408, 512] + - [24, 11564.0] + - - [60800, 256, 2, 256, 60800, 60800, 60800, 256] + - [34, 12330.0] + - - [54400, 256, 2, 256, 54400, 54400, 54400, 256] + - [20, 12308.0] + - - [3700, 1024, 2, 256, 3700, 3700, 3700, 256] + - [16, 11454.0] + - - [60800, 256, 2, 64, 60800, 60800, 60800, 64] + - [0, 11022.0] + - - [3800, 1024, 1, 256, 3800, 3800, 3800, 256] + - [2, 11304.0] + - - [3400, 1024, 1, 256, 3400, 3400, 3400, 256] + - [1, 11119.0] + - - [3072, 1024, 2, 256, 3072, 3072, 3072, 256] + - [16, 11452.0] + - - [3600, 1024, 2, 256, 3600, 3600, 3600, 256] + - [37, 11223.0] + - - [12288, 512, 2, 128, 12288, 12288, 12288, 128] + - [16, 11452.0] + - - [49152, 256, 2, 256, 49152, 49152, 49152, 256] + - [26, 11974.0] + - - [12880, 512, 2, 128, 12880, 12880, 12880, 128] + - [2, 11480.0] + - - [11408, 512, 2, 128, 11408, 11408, 11408, 128] + - [1, 11554.0] + - - [42240, 256, 2, 64, 42240, 42240, 42240, 64] + - [16, 11394.0] + - - [1008, 2048, 2, 512, 1008, 1008, 1008, 512] + - [1, 11515.0] + - - [3360, 1024, 2, 256, 3360, 3360, 3360, 256] + - [1, 11433.0] + - - [14208, 512, 2, 128, 14208, 14208, 14208, 128] + - [37, 11701.0] + - - [56832, 256, 2, 64, 56832, 56832, 56832, 64] + - [0, 9689.0] + - - [43008, 256, 2, 64, 43008, 43008, 43008, 64] + - [16, 11167.0] + - - [13600, 512, 2, 128, 13600, 13600, 13600, 128] + - [1, 11677.0] + - - [3500, 1024, 2, 256, 3500, 3500, 3500, 256] + - [8, 11459.0] + - - [2640, 1024, 2, 256, 2640, 2640, 2640, 256] + - [1, 11641.0] + - - [13824, 512, 2, 128, 13824, 13824, 13824, 128] + - [37, 11735.0] + - - [3800, 256, 2, 1024, 3800, 3800, 3800, 1024] + - [8, 11550.0] + - - [55296, 256, 2, 64, 55296, 55296, 55296, 64] + - [0, 10045.0] + - - [2640, 256, 2, 1024, 2640, 2640, 2640, 1024] + - [7, 10228.0] + - - [15200, 512, 2, 128, 15200, 15200, 15200, 128] + - [37, 11508.0] + - - [3552, 1024, 2, 256, 3552, 3552, 3552, 256] + - [1, 11576.0] + - - [3220, 1024, 2, 256, 3220, 3220, 3220, 256] + - [3, 11246.0] + - - [3456, 1024, 2, 256, 3456, 3456, 3456, 256] + - [8, 11857.0] + - - [49152, 256, 2, 64, 49152, 49152, 49152, 64] + - [1, 9207.0] + - - [3400, 1024, 2, 256, 3400, 3400, 3400, 256] + - [8, 11533.0] + - - [950, 2048, 2, 512, 950, 950, 950, 512] + - [23, 10893.0] + - - [3800, 1024, 2, 256, 3800, 3800, 3800, 256] + - [17, 11750.0] + - - [1610, 2048, 1, 1024, 1610, 1610, 1610, 1024] + - [22, 11106.0] + - - [6912, 256, 1, 512, 6912, 6912, 6912, 512] + - [16, 11043.0] + - - [6800, 256, 1, 512, 6800, 6800, 6800, 512] + - [16, 10659.0] + - - [27648, 128, 1, 256, 27648, 27648, 27648, 256] + - [23, 11474.0] + - - [27200, 128, 1, 256, 27200, 27200, 27200, 256] + - [32, 10920.0] + - - [30400, 128, 1, 256, 30400, 30400, 30400, 256] + - [23, 11241.0] + - - [7600, 256, 1, 512, 7600, 7600, 7600, 512] + - [9, 11165.0] + - - [6144, 1024, 1, 512, 6144, 6144, 6144, 512] + - [1, 11708.0] + - - [6912, 1024, 1, 512, 6912, 6912, 6912, 512] + - [1, 12015.0] + - - [6440, 1024, 1, 512, 6440, 6440, 6440, 512] + - [1, 11589.0] + - - [27648, 512, 1, 256, 27648, 27648, 27648, 256] + - [40, 11974.0] + - - [1728, 2048, 1, 1024, 1728, 1728, 1728, 1024] + - [16, 10987.0] + - - [27200, 512, 1, 256, 27200, 27200, 27200, 256] + - [1, 11932.0] + - - [6800, 1024, 1, 512, 6800, 6800, 6800, 512] + - [37, 11713.0] + - - [1700, 2048, 1, 1024, 1700, 1700, 1700, 1024] + - [7, 10799.0] + - - [7600, 1024, 1, 512, 7600, 7600, 7600, 512] + - [1, 12188.0] + - - [30400, 512, 1, 256, 30400, 30400, 30400, 256] + - [8, 12101.0] + - - [1900, 2048, 1, 1024, 1900, 1900, 1900, 1024] + - [8, 11967.0] + - - [12544, 1024, 1, 1024, 12544, 12544, 12544, 1024] + - [1, 12313.0] + - - [1024, 1024, 160, 96, 1024, 1024, 1024, 96] + - [11, 11522.0] + - - [1920, 16384, 1, 25216, 1920, 1920, 1920, 25216] + - [17, 12476.0] + - - [3840, 16384, 1, 1920, 3840, 3840, 3840, 1920] + - [1, 12650.0] + - - [1920, 16384, 1, 3840, 1920, 1920, 1920, 3840] + - [3, 12016.0] + - - [960, 16384, 1, 1920, 960, 960, 960, 1920] + - [1, 11627.0] + - - [1920, 16384, 1, 2880, 1920, 1920, 1920, 2880] + - [17, 12702.0] + - - [1024, 1024, 40, 96, 1024, 1024, 1024, 96] + - [0, 11561.0] + - - [1920, 4096, 1, 25216, 1920, 1920, 1920, 25216] + - [17, 12388.0] + - - [3840, 4096, 1, 1920, 3840, 3840, 3840, 1920] + - [17, 12613.0] + - - [1920, 4096, 1, 3840, 1920, 1920, 1920, 3840] + - [20, 12407.0] + - - [960, 4096, 1, 1920, 960, 960, 960, 1920] + - [1, 11392.0] + - - [1920, 4096, 1, 2880, 1920, 1920, 1920, 2880] + - [31, 12571.0] + - - [1024, 1024, 80, 96, 1024, 1024, 1024, 96] + - [11, 11504.0] + - - [1920, 8192, 1, 25216, 1920, 1920, 1920, 25216] + - [3, 12258.0] + - - [3840, 8192, 1, 1920, 3840, 3840, 3840, 1920] + - [17, 12666.0] + - - [1920, 8192, 1, 3840, 1920, 1920, 1920, 3840] + - [20, 12502.0] + - - [960, 8192, 1, 1920, 960, 960, 960, 1920] + - [17, 11528.0] + - - [1920, 8192, 1, 2880, 1920, 1920, 1920, 2880] + - [17, 12661.0] + - - [1024, 1024, 96, 96, 1024, 1024, 1024, 96] + - [11, 11407.0] + - - [2304, 16384, 1, 12672, 2304, 2304, 2304, 12672] + - [17, 12518.0] + - - [2304, 16384, 1, 2304, 2304, 2304, 2304, 2304] + - [8, 12593.0] + - - [576, 16384, 1, 2304, 576, 576, 576, 2304] + - [23, 11172.0] + - - [2304, 16384, 1, 1728, 2304, 2304, 2304, 1728] + - [17, 12627.0] + - - [1024, 1024, 24, 96, 1024, 1024, 1024, 96] + - [7, 11588.0] + - - [2304, 4096, 1, 12672, 2304, 2304, 2304, 12672] + - [17, 12339.0] + - - [2304, 4096, 1, 2304, 2304, 2304, 2304, 2304] + - [17, 12471.0] + - - [576, 4096, 1, 2304, 576, 576, 576, 2304] + - [17, 10922.0] + - - [2304, 4096, 1, 1728, 2304, 2304, 2304, 1728] + - [1, 12522.0] + - - [1024, 1024, 48, 96, 1024, 1024, 1024, 96] + - [11, 11417.0] + - - [2304, 8192, 1, 12672, 2304, 2304, 2304, 12672] + - [17, 12386.0] + - - [2304, 8192, 1, 2304, 2304, 2304, 2304, 2304] + - [23, 12508.0] + - - [576, 8192, 1, 2304, 576, 576, 576, 2304] + - [37, 11147.0] + - - [2304, 8192, 1, 1728, 2304, 2304, 2304, 1728] + - [17, 12606.0] + - - [1024, 1024, 16, 96, 1024, 1024, 1024, 96] + - [8, 11581.0] + - - [3072, 4096, 1, 6400, 3072, 3072, 3072, 6400] + - [1, 12475.0] + - - [1536, 4096, 1, 3072, 1536, 1536, 1536, 3072] + - [23, 12002.0] + - - [3072, 4096, 1, 1536, 3072, 3072, 3072, 1536] + - [1, 12356.0] + - - [384, 4096, 1, 3072, 384, 384, 384, 3072] + - [8, 11380.0] + - - [3072, 4096, 1, 1152, 3072, 3072, 3072, 1152] + - [17, 12354.0] + - - [1024, 1024, 32, 96, 1024, 1024, 1024, 96] + - [11, 11347.0] + - - [3072, 8192, 1, 6400, 3072, 3072, 3072, 6400] + - [1, 12623.0] + - - [1536, 8192, 1, 3072, 1536, 1536, 1536, 3072] + - [26, 12287.0] + - - [3072, 8192, 1, 1536, 3072, 3072, 3072, 1536] + - [3, 12495.0] + - - [384, 8192, 1, 3072, 384, 384, 384, 3072] + - [28, 11688.0] + - - [3072, 8192, 1, 1152, 3072, 3072, 3072, 1152] + - [1, 12576.0] + - - [2048, 4096, 1, 2048, 2048, 2048, 2048, 2048] + - [1, 12331.0] + - - [2048, 4096, 1, 4096, 2048, 2048, 2048, 4096] + - [11, 12242.0] + - - [4096, 4096, 1, 2048, 4096, 4096, 4096, 2048] + - [1, 12354.0] + - - [1024, 2283, 1, 29000, 1024, 1024, 1024, 29000] + - [7, 11162.0] + - - [1024, 2296, 1, 29000, 1024, 1024, 1024, 29000] + - [7, 11217.0] + - - [1024, 2306, 1, 29000, 1024, 1024, 1024, 29000] + - [0, 11261.0] + - - [1024, 2309, 1, 29000, 1024, 1024, 1024, 29000] + - [0, 11272.0] + - - [1024, 2318, 1, 29000, 1024, 1024, 1024, 29000] + - [0, 11316.0] + - - [1024, 2320, 1, 29000, 1024, 1024, 1024, 29000] + - [0, 11329.0] + - - [1024, 2324, 1, 29000, 1024, 1024, 1024, 29000] + - [0, 11316.0] + - - [1024, 2325, 1, 29000, 1024, 1024, 1024, 29000] + - [7, 11330.0] + - - [1024, 2329, 1, 29000, 1024, 1024, 1024, 29000] + - [0, 11327.0] + - - [1024, 2338, 1, 29000, 1024, 1024, 1024, 29000] + - [0, 11362.0] + - - [1024, 2345, 1, 29000, 1024, 1024, 1024, 29000] + - [7, 11418.0] + - - [1024, 2350, 1, 29000, 1024, 1024, 1024, 29000] + - [0, 11449.0] + - - [1024, 2362, 1, 29000, 1024, 1024, 1024, 29000] + - [30, 11482.0] + - - [1024, 2366, 1, 29000, 1024, 1024, 1024, 29000] + - [30, 11501.0] + - - [1024, 2368, 1, 29000, 1024, 1024, 1024, 29000] + - [16, 11512.0] + - - [1024, 2374, 1, 29000, 1024, 1024, 1024, 29000] + - [5, 11480.0] + - - [1024, 2390, 1, 29000, 1024, 1024, 1024, 29000] + - [5, 11554.0] + - - [512, 512, 320, 64, 512, 512, 512, 64] + - [0, 9166.0] + - - [512, 512, 80, 64, 512, 512, 512, 64] + - [16, 10845.0] + - - [2560, 1024, 1, 2560, 2560, 2560, 2560, 2560] + - [1, 12219.0] + - - [2560, 1024, 1, 4096, 2560, 2560, 2560, 4096] + - [5, 12265.0] + - - [4096, 1024, 1, 2560, 4096, 4096, 4096, 2560] + - [8, 12023.0] + - - [1024, 1024, 512, 64, 1024, 1024, 1024, 64] + - [25, 8913.0] + - - [1024, 32768, 1, 3072, 1024, 1024, 1024, 3072] + - [3, 12534.0] + - - [1024, 32768, 1, 4096, 1024, 1024, 1024, 4096] + - [26, 11827.0] + - - [1024, 32768, 1, 50304, 1024, 1024, 1024, 50304] + - [13, 12030.0] + - - [4096, 32768, 1, 1024, 4096, 4096, 4096, 1024] + - [1, 12516.0] + - - [1024, 1024, 24, 128, 1024, 1024, 1024, 128] + - [11, 11708.0] + - - [128, 1024, 24, 1024, 128, 128, 128, 1024] + - [26, 11159.0] + - - [768, 320, 1, 30522, 768, 768, 768, 30522] + - [42, 11110.0] + - - [768, 640, 1, 30522, 768, 768, 768, 30522] + - [45, 11813.0] + - - [768, 1280, 1, 30522, 768, 768, 768, 30522] + - [43, 12220.0] + - - [1024, 780, 1, 30522, 1024, 1024, 1024, 30522] + - [42, 10787.0] + - - [1024, 308, 1, 30522, 1024, 1024, 1024, 30522] + - [42, 10826.0] + - - [1024, 800, 1, 30522, 1024, 1024, 1024, 30522] + - [42, 11043.0] + - - [1024, 820, 1, 30522, 1024, 1024, 1024, 30522] + - [42, 11313.0] + - - [1024, 385, 1, 30522, 1024, 1024, 1024, 30522] + - [47, 9250.0] + - - [1024, 462, 1, 30522, 1024, 1024, 1024, 30522] + - [42, 10207.0] + - - [1024, 640, 1, 30528, 1024, 1024, 1024, 30528] + - [43, 11961.0] + - - [2048, 199, 1, 29000, 2048, 2048, 2048, 29000] + - [47, 8752.0] + - - [2048, 221, 1, 29000, 2048, 2048, 2048, 29000] + - [46, 9718.0] + - - [2048, 224, 1, 29000, 2048, 2048, 2048, 29000] + - [46, 9836.0] + - - [2048, 229, 1, 29000, 2048, 2048, 2048, 29000] + - [46, 10055.0] + - - [2048, 234, 1, 29000, 2048, 2048, 2048, 29000] + - [46, 10263.0] + - - [2048, 242, 1, 29000, 2048, 2048, 2048, 29000] + - [46, 10608.0] + - - [2048, 246, 1, 29000, 2048, 2048, 2048, 29000] + - [47, 10779.0] + - - [2048, 247, 1, 29000, 2048, 2048, 2048, 29000] + - [47, 10820.0] + - - [2048, 256, 1, 29000, 2048, 2048, 2048, 29000] + - [46, 11217.0] + - - [2048, 262, 1, 29000, 2048, 2048, 2048, 29000] + - [47, 9421.0] + - - [2048, 264, 1, 29000, 2048, 2048, 2048, 29000] + - [47, 9490.0] + - - [2048, 265, 1, 29000, 2048, 2048, 2048, 29000] + - [47, 9526.0] + - - [2048, 274, 1, 29000, 2048, 2048, 2048, 29000] + - [47, 9846.0] + - - [2048, 277, 1, 29000, 2048, 2048, 2048, 29000] + - [47, 9954.0] + - - [2048, 279, 1, 29000, 2048, 2048, 2048, 29000] + - [47, 10021.0] + - - [2048, 288, 1, 29000, 2048, 2048, 2048, 29000] + - [47, 10333.0] + - - [2048, 296, 1, 29000, 2048, 2048, 2048, 29000] + - [47, 10623.0] + - - [2048, 315, 1, 29000, 2048, 2048, 2048, 29000] + - [47, 11263.0] + - - [2048, 335, 1, 29000, 2048, 2048, 2048, 29000] + - [48, 10210.0] + - - [1024, 561, 1, 29000, 1024, 1024, 1024, 29000] + - [42, 10828.0] + - - [1024, 574, 1, 29000, 1024, 1024, 1024, 29000] + - [42, 11027.0] + - - [1024, 600, 1, 29000, 1024, 1024, 1024, 29000] + - [43, 11274.0] + - - [1024, 608, 1, 29000, 1024, 1024, 1024, 29000] + - [43, 11403.0] + - - [1024, 615, 1, 29000, 1024, 1024, 1024, 29000] + - [43, 11525.0] + - - [1024, 622, 1, 29000, 1024, 1024, 1024, 29000] + - [43, 11649.0] + - - [1024, 625, 1, 29000, 1024, 1024, 1024, 29000] + - [43, 11712.0] + - - [1024, 626, 1, 29000, 1024, 1024, 1024, 29000] + - [43, 11708.0] + - - [1024, 628, 1, 29000, 1024, 1024, 1024, 29000] + - [43, 11778.0] + - - [1024, 636, 1, 29000, 1024, 1024, 1024, 29000] + - [43, 11909.0] + - - [1024, 651, 1, 29000, 1024, 1024, 1024, 29000] + - [44, 10481.0] + - - [1024, 658, 1, 29000, 1024, 1024, 1024, 29000] + - [42, 10594.0] + - - [1024, 669, 1, 29000, 1024, 1024, 1024, 29000] + - [42, 10752.0] + - - [1024, 670, 1, 29000, 1024, 1024, 1024, 29000] + - [42, 10780.0] + - - [1024, 672, 1, 29000, 1024, 1024, 1024, 29000] + - [42, 10815.0] + - - [1024, 684, 1, 29000, 1024, 1024, 1024, 29000] + - [42, 10996.0] + - - [1024, 716, 1, 29000, 1024, 1024, 1024, 29000] + - [43, 10909.0] + - - [1024, 730, 1, 29000, 1024, 1024, 1024, 29000] + - [43, 11093.0] + - - [1600, 512, 1, 1024, 1600, 1600, 1600, 1024] + - [65, 9003.0] + - - [1024, 512, 1, 1, 1024, 1024, 1024, 1] + - [100, 127.0] + - - [1024, 512, 1, 64, 1024, 1024, 1024, 64] + - [50, 3902.0] + - - [2048, 512, 1, 1, 2048, 2048, 2048, 1] + - [49, 175.0] + - - [768, 640, 1, 768, 768, 768, 768, 768] + - [53, 9185.0] + - - [768, 1024, 1, 2, 768, 768, 768, 2] + - [54, 296.0] + - - [768, 1024, 1, 768, 768, 768, 768, 768] + - [78, 9875.0] + - - [768, 1280, 1, 768, 768, 768, 768, 768] + - [53, 10468.0] + - - [768, 512, 1, 2, 768, 768, 768, 2] + - [107, 161.0] + - - [768, 512, 1, 768, 768, 768, 768, 768] + - [53, 7512.0] + - - [1024, 512, 1, 1024, 1024, 1024, 1024, 1024] + - [87, 8095.0] + - - [1024, 512, 1, 2, 1024, 1024, 1024, 2] + - [66, 256.0] + - - [64, 64, 768, 64, 64, 64, 64, 64] + - [56, 6779.0] + - - [64, 64, 96, 64, 64, 64, 64, 64] + - [98, 4695.0] + - - [704, 1024, 1, 128, 704, 704, 704, 128] + - [51, 7702.0] + - - [1024, 1024, 1, 3328, 1024, 1024, 1024, 3328] + - [53, 10306.0] + - - [1856, 448, 1, 3328, 1856, 1856, 1856, 3328] + - [54, 9720.0] + - - [128, 6784, 1, 3328, 128, 128, 128, 3328] + - [59, 9868.0] + - - [2368, 448, 1, 128, 2368, 2368, 2368, 128] + - [97, 8424.0] + - - [256, 4288, 1, 3328, 256, 256, 256, 3328] + - [53, 10767.0] + - - [704, 1856, 1, 3328, 704, 704, 704, 3328] + - [78, 10038.0] + - - [448, 1024, 1, 1280, 448, 448, 448, 1280] + - [106, 8245.0] + - - [256, 1408, 1, 3328, 256, 256, 256, 3328] + - [53, 7622.0] + - - [704, 1856, 1, 1280, 704, 704, 704, 1280] + - [99, 9742.0] + - - [128, 5056, 1, 128, 128, 128, 128, 128] + - [53, 6846.0] + - - [2368, 128, 1, 256, 2368, 2368, 2368, 256] + - [51, 6643.0] + - - [64, 5056, 1, 256, 64, 64, 64, 256] + - [108, 5523.0] + - - [256, 2944, 1, 256, 256, 256, 256, 256] + - [64, 7792.0] + - - [256, 1856, 1, 1280, 256, 256, 256, 1280] + - [87, 9368.0] + - - [4288, 256, 1, 256, 4288, 4288, 4288, 256] + - [53, 9190.0] + - - [2944, 128, 1, 128, 2944, 2944, 2944, 128] + - [52, 4525.0] + - - [5888, 64, 1, 3328, 5888, 5888, 5888, 3328] + - [69, 7876.0] + - - [2944, 256, 1, 3328, 2944, 2944, 2944, 3328] + - [87, 10106.0] + - - [1408, 448, 1, 1280, 1408, 1408, 1408, 1280] + - [82, 9914.0] + - - [1408, 704, 1, 3328, 1408, 1408, 1408, 3328] + - [64, 9725.0] + - - [1408, 256, 1, 1280, 1408, 1408, 1408, 1280] + - [99, 7261.0] + - - [3072, 128, 1, 1024, 3072, 3072, 3072, 1024] + - [64, 7877.0] + - - [6784, 64, 1, 256, 6784, 6784, 6784, 256] + - [62, 6351.0] + - - [2944, 256, 1, 256, 2944, 2944, 2944, 256] + - [53, 8210.0] + - - [704, 1408, 1, 3328, 704, 704, 704, 3328] + - [53, 9648.0] + - - [2944, 256, 1, 128, 2944, 2944, 2944, 128] + - [53, 6563.0] + - - [2368, 128, 1, 3328, 2368, 2368, 2368, 3328] + - [100, 8672.0] + - - [64, 193600, 1, 64, 64, 64, 64, 64] + - [89, 8367.0] + - - [448, 1408, 1, 256, 448, 448, 448, 256] + - [85, 8544.0] + - - [64, 5056, 1, 3328, 64, 64, 64, 3328] + - [67, 8469.0] + - - [512, 1500, 1, 2816, 512, 512, 512, 2816] + - [64, 10206.0] + - - [1024, 448, 1, 128, 1024, 1024, 1024, 128] + - [97, 5160.0] + - - [256, 3584, 1, 3328, 256, 256, 256, 3328] + - [65, 10436.0] + - - [256, 1408, 1, 256, 256, 256, 256, 256] + - [82, 5992.0] + - - [5056, 64, 1, 1280, 5056, 5056, 5056, 1280] + - [51, 8790.0] + - - [1024, 704, 1, 256, 1024, 1024, 1024, 256] + - [53, 7907.0] + - - [128, 4288, 1, 128, 128, 128, 128, 128] + - [99, 5338.0] + - - [3584, 256, 1, 128, 3584, 3584, 3584, 128] + - [53, 7232.0] + - - [448, 1024, 1, 256, 448, 448, 448, 256] + - [106, 6734.0] + - - [5888, 64, 1, 256, 5888, 5888, 5888, 256] + - [109, 6380.0] + - - [1856, 256, 1, 1280, 1856, 1856, 1856, 1280] + - [87, 9137.0] + - - [64, 5888, 1, 3328, 64, 64, 64, 3328] + - [55, 7123.0] + - - [448, 1856, 1, 128, 448, 448, 448, 128] + - [97, 6711.0] + - - [1024, 704, 1, 1280, 1024, 1024, 1024, 1280] + - [78, 9300.0] + - - [128, 5888, 1, 256, 128, 128, 128, 256] + - [109, 7824.0] + - - [704, 704, 1, 3328, 704, 704, 704, 3328] + - [65, 8340.0] + - - [704, 1408, 1, 1280, 704, 704, 704, 1280] + - [78, 9563.0] + - - [3584, 256, 1, 3328, 3584, 3584, 3584, 3328] + - [65, 10785.0] + - - [704, 1856, 1, 128, 704, 704, 704, 128] + - [97, 8041.0] + - - [128, 3584, 1, 3328, 128, 128, 128, 3328] + - [64, 9642.0] + - - [2944, 448, 1, 128, 2944, 2944, 2944, 128] + - [78, 8024.0] + - - [64, 193600, 1, 256, 64, 64, 64, 256] + - [111, 7886.0] + - - [128, 2944, 1, 1280, 128, 128, 128, 1280] + - [64, 7807.0] + - - [448, 2944, 1, 1280, 448, 448, 448, 1280] + - [54, 9084.0] + - - [3584, 128, 1, 256, 3584, 3584, 3584, 256] + - [53, 6703.0] + - - [448, 1408, 1, 3328, 448, 448, 448, 3328] + - [81, 9105.0] + - - [704, 1024, 1, 256, 704, 704, 704, 256] + - [64, 7690.0] + - - [256, 3584, 1, 256, 256, 256, 256, 256] + - [87, 8461.0] + - - [256, 2944, 1, 3328, 256, 256, 256, 3328] + - [64, 10258.0] + - - [448, 2368, 1, 128, 448, 448, 448, 128] + - [97, 7412.0] + - - [1408, 704, 1, 256, 1408, 1408, 1408, 256] + - [82, 8579.0] + - - [448, 2944, 1, 3328, 448, 448, 448, 3328] + - [79, 9372.0] + - - [64, 5888, 1, 256, 64, 64, 64, 256] + - [63, 5365.0] + - - [512, 1500, 1, 2048, 512, 512, 512, 2048] + - [64, 9985.0] + - - [6784, 128, 1, 3328, 6784, 6784, 6784, 3328] + - [54, 10160.0] + - - [704, 704, 1, 256, 704, 704, 704, 256] + - [64, 6903.0] + - - [448, 704, 1, 1280, 448, 448, 448, 1280] + - [51, 8159.0] + - - [1024, 448, 1, 3328, 1024, 1024, 1024, 3328] + - [53, 9522.0] + - - [2944, 128, 1, 256, 2944, 2944, 2944, 256] + - [78, 5708.0] + - - [1024, 1024, 1, 1280, 1024, 1024, 1024, 1280] + - [78, 9981.0] + - - [448, 1024, 1, 128, 448, 448, 448, 128] + - [85, 5178.0] + - - [448, 2368, 1, 3328, 448, 448, 448, 3328] + - [54, 9345.0] + - - [5056, 64, 1, 128, 5056, 5056, 5056, 128] + - [86, 5769.0] + - - [1024, 700, 1, 512, 1024, 1024, 1024, 512] + - [99, 8943.0] + - - [128, 6784, 1, 1280, 128, 128, 128, 1280] + - [72, 9699.0] + - - [1856, 256, 1, 256, 1856, 1856, 1856, 256] + - [85, 7847.0] + - - [128, 5888, 1, 1280, 128, 128, 128, 1280] + - [87, 9684.0] + - - [256, 4288, 1, 1280, 256, 256, 256, 1280] + - [109, 10313.0] + - - [256, 1856, 1, 128, 256, 256, 256, 128] + - [64, 5307.0] + - - [7680, 64, 1, 2560, 7680, 7680, 7680, 2560] + - [57, 10125.0] + - - [448, 1408, 1, 128, 448, 448, 448, 128] + - [58, 6136.0] + - - [6784, 128, 1, 256, 6784, 6784, 6784, 256] + - [99, 8842.0] + - - [704, 448, 1, 256, 704, 704, 704, 256] + - [85, 5670.0] + - - [704, 448, 1, 128, 704, 704, 704, 128] + - [105, 4350.0] + - - [704, 1408, 1, 128, 704, 704, 704, 128] + - [97, 7463.0] + - - [4288, 128, 1, 1280, 4288, 4288, 4288, 1280] + - [79, 8644.0] + - - [128, 2944, 1, 128, 128, 128, 128, 128] + - [80, 4902.0] + - - [128, 4288, 1, 256, 128, 128, 128, 256] + - [87, 6685.0] + - - [704, 448, 1, 3328, 704, 704, 704, 3328] + - [51, 8509.0] + - - [448, 2368, 1, 1280, 448, 448, 448, 1280] + - [65, 8980.0] + - - [64, 6784, 1, 3328, 64, 64, 64, 3328] + - [67, 7856.0] + - - [2944, 256, 1, 1280, 2944, 2944, 2944, 1280] + - [64, 9925.0] + - - [256, 2368, 1, 128, 256, 256, 256, 128] + - [106, 6006.0] + - - [1856, 704, 1, 256, 1856, 1856, 1856, 256] + - [106, 8711.0] + - - [1856, 448, 1, 1280, 1856, 1856, 1856, 1280] + - [54, 9367.0] + - - [128, 5888, 1, 128, 128, 128, 128, 128] + - [85, 6737.0] + - - [1024, 1024, 1, 256, 1024, 1024, 1024, 256] + - [64, 8842.0] + - - [704, 1856, 1, 256, 704, 704, 704, 256] + - [78, 8697.0] + - - [256, 2368, 1, 1280, 256, 256, 256, 1280] + - [88, 9456.0] + - - [2944, 448, 1, 256, 2944, 2944, 2944, 256] + - [78, 9111.0] + - - [1856, 448, 1, 128, 1856, 1856, 1856, 128] + - [76, 6719.0] + - - [2368, 128, 1, 1280, 2368, 2368, 2368, 1280] + - [53, 7729.0] + - - [64, 6784, 1, 256, 64, 64, 64, 256] + - [89, 5580.0] + - - [64, 5056, 1, 1280, 64, 64, 64, 1280] + - [67, 7725.0] + - - [3025, 64, 64, 64, 3025, 3025, 3025, 64] + - [78, 7480.0] + - - [2368, 256, 1, 1280, 2368, 2368, 2368, 1280] + - [54, 9887.0] + - - [2368, 448, 1, 1280, 2368, 2368, 2368, 1280] + - [109, 10084.0] + - - [128, 3584, 1, 256, 128, 128, 128, 256] + - [109, 6742.0] + - - [704, 448, 1, 1280, 704, 704, 704, 1280] + - [106, 7959.0] + - - [4288, 256, 1, 1280, 4288, 4288, 4288, 1280] + - [78, 10303.0] + - - [4288, 128, 1, 3328, 4288, 4288, 4288, 3328] + - [83, 8779.0] + - - [7680, 128, 1, 2560, 7680, 7680, 7680, 2560] + - [54, 11220.0] + - - [1408, 256, 1, 128, 1408, 1408, 1408, 128] + - [63, 4982.0] + - - [256, 1408, 1, 1280, 256, 256, 256, 1280] + - [53, 7257.0] + - - [6784, 64, 1, 3328, 6784, 6784, 6784, 3328] + - [91, 9068.0] + - - [128, 2944, 1, 3328, 128, 128, 128, 3328] + - [64, 7912.0] + - - [2944, 448, 1, 3328, 2944, 2944, 2944, 3328] + - [53, 10228.0] + - - [5888, 128, 1, 256, 5888, 5888, 5888, 256] + - [85, 8389.0] + - - [5056, 64, 1, 256, 5056, 5056, 5056, 256] + - [90, 6228.0] + - - [512, 1500, 1, 1536, 512, 512, 512, 1536] + - [64, 9822.0] + - - [128, 3584, 1, 1280, 128, 128, 128, 1280] + - [87, 9003.0] + - - [1024, 704, 1, 128, 1024, 1024, 1024, 128] + - [64, 6426.0] + - - [128, 5056, 1, 3328, 128, 128, 128, 3328] + - [72, 10501.0] + - - [1024, 1024, 1, 128, 1024, 1024, 1024, 128] + - [64, 7600.0] + - - [4288, 128, 1, 256, 4288, 4288, 4288, 256] + - [76, 6622.0] + - - [1408, 448, 1, 128, 1408, 1408, 1408, 128] + - [78, 6348.0] + - - [3584, 256, 1, 256, 3584, 3584, 3584, 256] + - [88, 8784.0] + - - [128, 2944, 1, 256, 128, 128, 128, 256] + - [64, 5602.0] + - - [128, 6784, 1, 128, 128, 128, 128, 128] + - [64, 6688.0] + - - [448, 1856, 1, 256, 448, 448, 448, 256] + - [99, 7701.0] + - - [3584, 128, 1, 3328, 3584, 3584, 3584, 3328] + - [53, 9729.0] + - - [5888, 128, 1, 3328, 5888, 5888, 5888, 3328] + - [64, 10132.0] + - - [1408, 704, 1, 1280, 1408, 1408, 1408, 1280] + - [87, 9688.0] + - - [448, 2944, 1, 256, 448, 448, 448, 256] + - [88, 8035.0] + - - [448, 2368, 1, 256, 448, 448, 448, 256] + - [87, 7863.0] + - - [64, 6784, 1, 1280, 64, 64, 64, 1280] + - [89, 7914.0] + - - [128, 2368, 1, 3328, 128, 128, 128, 3328] + - [91, 8650.0] + - - [5056, 64, 1, 3328, 5056, 5056, 5056, 3328] + - [57, 8872.0] + - - [64, 5888, 1, 128, 64, 64, 64, 128] + - [63, 4314.0] + - - [5056, 128, 1, 3328, 5056, 5056, 5056, 3328] + - [79, 10786.0] + - - [448, 704, 1, 256, 448, 448, 448, 256] + - [85, 5734.0] + - - [2944, 128, 1, 3328, 2944, 2944, 2944, 3328] + - [99, 7891.0] + - - [128, 5056, 1, 1280, 128, 128, 128, 1280] + - [64, 10172.0] + - - [704, 704, 1, 128, 704, 704, 704, 128] + - [62, 5003.0] + - - [2368, 128, 1, 128, 2368, 2368, 2368, 128] + - [63, 4154.0] + - - [5056, 128, 1, 128, 5056, 5056, 5056, 128] + - [64, 6145.0] + - - [448, 1024, 1, 3328, 448, 448, 448, 3328] + - [55, 8490.0] + - - [2368, 256, 1, 256, 2368, 2368, 2368, 256] + - [53, 7425.0] + - - [256, 2368, 1, 3328, 256, 256, 256, 3328] + - [88, 10002.0] + - - [256, 3584, 1, 128, 256, 256, 256, 128] + - [53, 6868.0] + - - [4288, 256, 1, 128, 4288, 4288, 4288, 128] + - [53, 7712.0] + - - [448, 1856, 1, 3328, 448, 448, 448, 3328] + - [54, 9568.0] + - - [2368, 256, 1, 128, 2368, 2368, 2368, 128] + - [51, 5996.0] + - - [256, 1856, 1, 256, 256, 256, 256, 256] + - [64, 6935.0] + - - [256, 2944, 1, 128, 256, 256, 256, 128] + - [78, 6448.0] + - - [1408, 256, 1, 3328, 1408, 1408, 1408, 3328] + - [64, 7554.0] + - - [2368, 448, 1, 256, 2368, 2368, 2368, 256] + - [53, 8835.0] + - - [4288, 256, 1, 3328, 4288, 4288, 4288, 3328] + - [53, 10861.0] + - - [1856, 704, 1, 128, 1856, 1856, 1856, 128] + - [51, 8198.0] + - - [4288, 128, 1, 128, 4288, 4288, 4288, 128] + - [57, 5884.0] + - - [6784, 64, 1, 1280, 6784, 6784, 6784, 1280] + - [99, 8485.0] + - - [3584, 128, 1, 128, 3584, 3584, 3584, 128] + - [62, 5097.0] + - - [256, 2368, 1, 256, 256, 256, 256, 256] + - [53, 7313.0] + - - [2944, 448, 1, 1280, 2944, 2944, 2944, 1280] + - [78, 9941.0] + - - [448, 1408, 1, 1280, 448, 448, 448, 1280] + - [106, 8659.0] + - - [448, 1856, 1, 1280, 448, 448, 448, 1280] + - [65, 9148.0] + - - [1856, 256, 1, 128, 1856, 1856, 1856, 128] + - [86, 6168.0] + - - [128, 2368, 1, 256, 128, 128, 128, 256] + - [87, 5623.0] + - - [5888, 64, 1, 1280, 5888, 5888, 5888, 1280] + - [109, 7532.0] + - - [1024, 448, 1, 1280, 1024, 1024, 1024, 1280] + - [53, 9045.0] + - - [128, 5056, 1, 256, 128, 128, 128, 256] + - [91, 7600.0] + - - [1856, 704, 1, 1280, 1856, 1856, 1856, 1280] + - [78, 9715.0] + - - [448, 2944, 1, 128, 448, 448, 448, 128] + - [85, 7896.0] + - - [1408, 256, 1, 256, 1408, 1408, 1408, 256] + - [78, 5586.0] + - - [2368, 448, 1, 3328, 2368, 2368, 2368, 3328] + - [109, 10355.0] + - - [128, 5888, 1, 3328, 128, 128, 128, 3328] + - [64, 10030.0] + - - [64, 5056, 1, 128, 64, 64, 64, 128] + - [108, 4226.0] + - - [64, 6784, 1, 128, 64, 64, 64, 128] + - [85, 4616.0] + - - [448, 704, 1, 128, 448, 448, 448, 128] + - [77, 4286.0] + - - [1408, 448, 1, 256, 1408, 1408, 1408, 256] + - [53, 7955.0] + - - [1408, 704, 1, 128, 1408, 1408, 1408, 128] + - [78, 7359.0] + - - [2368, 256, 1, 3328, 2368, 2368, 2368, 3328] + - [54, 10221.0] + - - [5888, 128, 1, 1280, 5888, 5888, 5888, 1280] + - [53, 9589.0] + - - [256, 3584, 1, 1280, 256, 256, 256, 1280] + - [88, 10076.0] + - - [256, 1408, 1, 128, 256, 256, 256, 128] + - [64, 4288.0] + - - [256, 4288, 1, 128, 256, 256, 256, 128] + - [109, 7466.0] + - - [5888, 128, 1, 128, 5888, 5888, 5888, 128] + - [99, 6431.0] + - - [1408, 448, 1, 3328, 1408, 1408, 1408, 3328] + - [53, 10217.0] + - - [704, 1024, 1, 1280, 704, 704, 704, 1280] + - [78, 9304.0] + - - [1856, 256, 1, 3328, 1856, 1856, 1856, 3328] + - [109, 9813.0] + - - [64, 5888, 1, 1280, 64, 64, 64, 1280] + - [111, 7171.0] + - - [6784, 64, 1, 128, 6784, 6784, 6784, 128] + - [99, 4944.0] + - - [704, 704, 1, 1280, 704, 704, 704, 1280] + - [99, 7836.0] + - - [128, 2368, 1, 1280, 128, 128, 128, 1280] + - [91, 7866.0] + - - [3584, 256, 1, 1280, 3584, 3584, 3584, 1280] + - [110, 10291.0] + - - [128, 4288, 1, 3328, 128, 128, 128, 3328] + - [59, 9106.0] + - - [3584, 128, 1, 1280, 3584, 3584, 3584, 1280] + - [109, 9383.0] + - - [5056, 128, 1, 1280, 5056, 5056, 5056, 1280] + - [54, 10172.0] + - - [256, 4288, 1, 256, 256, 256, 256, 256] + - [64, 8804.0] + - - [1024, 448, 1, 256, 1024, 1024, 1024, 256] + - [53, 6788.0] + - - [2944, 128, 1, 1280, 2944, 2944, 2944, 1280] + - [87, 7813.0] + - - [128, 2368, 1, 128, 128, 128, 128, 128] + - [87, 5359.0] + - - [256, 2944, 1, 1280, 256, 256, 256, 1280] + - [64, 9612.0] + - - [2560, 128, 1, 2560, 2560, 2560, 2560, 2560] + - [116, 9398.0] + - - [704, 1024, 1, 3328, 704, 704, 704, 3328] + - [53, 9528.0] + - - [128, 6784, 1, 256, 128, 128, 128, 256] + - [64, 8119.0] + - - [256, 1856, 1, 3328, 256, 256, 256, 3328] + - [87, 9832.0] + - - [6784, 128, 1, 128, 6784, 6784, 6784, 128] + - [87, 8066.0] + - - [128, 3584, 1, 128, 128, 128, 128, 128] + - [53, 6510.0] + - - [704, 1408, 1, 256, 704, 704, 704, 256] + - [64, 8929.0] + - - [4096, 128, 1, 4096, 4096, 4096, 4096, 4096] + - [65, 8926.0] + - - [5888, 64, 1, 128, 5888, 5888, 5888, 128] + - [53, 4441.0] + - - [5056, 128, 1, 256, 5056, 5056, 5056, 256] + - [62, 7742.0] + - - [6784, 128, 1, 1280, 6784, 6784, 6784, 1280] + - [54, 9786.0] + - - [1856, 448, 1, 256, 1856, 1856, 1856, 256] + - [53, 8026.0] + - - [1024, 704, 1, 3328, 1024, 1024, 1024, 3328] + - [99, 9665.0] + - - [128, 4288, 1, 1280, 128, 128, 128, 1280] + - [109, 8762.0] + - - [448, 704, 1, 3328, 448, 448, 448, 3328] + - [51, 8348.0] + - - [1856, 704, 1, 3328, 1856, 1856, 1856, 3328] + - [53, 10078.0] + - - [512, 1500, 1, 2560, 512, 512, 512, 2560] + - [64, 10102.0] + - - [3136, 64, 128, 64, 3136, 3136, 3136, 64] + - [75, 5085.0] + - - [3136, 64, 128, 256, 3136, 3136, 3136, 256] + - [57, 8791.0] + - - [3136, 64, 256, 64, 3136, 3136, 3136, 64] + - [49, 5209.0] + - - [3136, 64, 256, 256, 3136, 3136, 3136, 256] + - [57, 8741.0] + - - [1024, 512, 1, 2048, 1024, 1024, 1024, 2048] + - [54, 8808.0] + - - [4096, 256, 1, 2048, 4096, 4096, 4096, 2048] + - [53, 10324.0] + - - [2048, 256, 1, 4096, 2048, 2048, 2048, 4096] + - [83, 8873.0] + - - [512, 768, 1, 2048, 512, 512, 512, 2048] + - [87, 8008.0] + - - [2048, 256, 1, 1024, 2048, 2048, 2048, 1024] + - [88, 8179.0] + - - [2048, 200, 1, 512, 2048, 2048, 2048, 512] + - [99, 5914.0] + - - [4096, 200, 1, 1024, 4096, 4096, 4096, 1024] + - [99, 7793.0] + - - [2048, 200, 1, 4096, 2048, 2048, 2048, 4096] + - [100, 6932.0] + - - [2048, 512, 1, 1024, 2048, 2048, 2048, 1024] + - [78, 10054.0] + - - [1024, 1024, 1, 512, 1024, 1024, 1024, 512] + - [78, 9577.0] + - - [2048, 512, 1, 4096, 2048, 2048, 2048, 4096] + - [87, 10312.0] + - - [1024, 1024, 1, 4096, 1024, 1024, 1024, 4096] + - [64, 10295.0] + - - [4096, 200, 1, 2048, 4096, 4096, 4096, 2048] + - [51, 8014.0] + - - [2048, 200, 1, 1024, 2048, 2048, 2048, 1024] + - [99, 6402.0] + - - [1024, 768, 1, 512, 1024, 1024, 1024, 512] + - [53, 9395.0] + - - [2048, 200, 1, 2048, 2048, 2048, 2048, 2048] + - [79, 6738.0] + - - [2048, 256, 1, 2048, 2048, 2048, 2048, 2048] + - [79, 8578.0] + - - [512, 768, 1, 512, 512, 512, 512, 512] + - [64, 7079.0] + - - [4096, 256, 1, 4096, 4096, 4096, 4096, 4096] + - [53, 10370.0] + - - [1024, 512, 1, 512, 1024, 1024, 1024, 512] + - [65, 7951.0] + - - [1024, 1024, 1, 2048, 1024, 1024, 1024, 2048] + - [64, 10071.0] + - - [4096, 256, 1, 1024, 4096, 4096, 4096, 1024] + - [78, 9873.0] + - - [512, 768, 1, 1024, 512, 512, 512, 1024] + - [64, 7623.0] + - - [1024, 512, 1, 4096, 1024, 1024, 1024, 4096] + - [93, 8762.0] + - - [4096, 200, 1, 4096, 4096, 4096, 4096, 4096] + - [99, 7869.0] + - - [2048, 256, 1, 512, 2048, 2048, 2048, 512] + - [100, 7714.0] + - - [1024, 1024, 1, 1024, 1024, 1024, 1024, 1024] + - [87, 9836.0] + - - [4096, 192, 1, 2048, 4096, 4096, 4096, 2048] + - [78, 10247.0] + - - [5329, 64, 64, 160, 5329, 5329, 5329, 160] + - [52, 7488.0] + - - [1225, 64, 64, 384, 1225, 1225, 1225, 384] + - [103, 9104.0] + - - [4096, 320, 1, 1280, 4096, 4096, 4096, 1280] + - [53, 11255.0] + - - [4096, 192, 1, 1280, 4096, 4096, 4096, 1280] + - [99, 10060.0] + - - [1225, 96, 64, 384, 1225, 1225, 1225, 384] + - [83, 8047.0] + - - [4096, 320, 1, 2048, 4096, 4096, 4096, 2048] + - [53, 11368.0] + - - [4096, 256, 1, 1536, 4096, 4096, 4096, 1536] + - [78, 10085.0] + - - [64, 147, 432, 148, 64, 64, 64, 148] + - [76, 7599.0] + - - [64, 123, 528, 123, 64, 64, 64, 123] + - [101, 7590.0] + - - [64, 111, 576, 112, 64, 64, 64, 112] + - [51, 7643.0] + - - [64, 77, 816, 77, 64, 64, 64, 77] + - [106, 5948.0] + - - [64, 92, 688, 92, 64, 64, 64, 92] + - [97, 7037.0] + - - [64, 159, 400, 159, 64, 64, 64, 159] + - [101, 7290.0] + - - [64, 85, 752, 84, 64, 64, 64, 84] + - [76, 6727.0] + - - [64, 122, 528, 123, 64, 64, 64, 123] + - [55, 7524.0] + - - [64, 93, 688, 92, 64, 64, 64, 92] + - [97, 6832.0] + - - [64, 102, 624, 99, 64, 64, 64, 99] + - [76, 7370.0] + - - [64, 133, 480, 133, 64, 64, 64, 133] + - [51, 7306.0] + - - [64, 232, 272, 232, 64, 64, 64, 232] + - [55, 8048.0] + - - [64, 162, 400, 159, 64, 64, 64, 159] + - [81, 7269.0] + - - [64, 78, 816, 78, 64, 64, 64, 78] + - [97, 6050.0] + - - [64, 99, 624, 99, 64, 64, 64, 99] + - [85, 7433.0] + - - [64, 101, 624, 102, 64, 64, 64, 102] + - [97, 7461.0] + - - [64, 111, 576, 111, 64, 64, 64, 111] + - [76, 7847.0] + - - [64, 134, 480, 134, 64, 64, 64, 134] + - [51, 7320.0] + - - [64, 135, 480, 132, 64, 64, 64, 132] + - [97, 7483.0] + - - [64, 134, 480, 132, 64, 64, 64, 132] + - [51, 7125.0] + - - [64, 134, 480, 135, 64, 64, 64, 135] + - [51, 7378.0] + - - [64, 162, 400, 162, 64, 64, 64, 162] + - [55, 7422.0] + - - [64, 102, 624, 102, 64, 64, 64, 102] + - [62, 7449.0] + - - [64, 135, 480, 133, 64, 64, 64, 133] + - [97, 7250.0] + - - [64, 148, 432, 143, 64, 64, 64, 143] + - [97, 7435.0] + - - [64, 100, 624, 100, 64, 64, 64, 100] + - [76, 7515.0] + - - [64, 65, 992, 65, 64, 64, 64, 65] + - [106, 5053.0] + - - [64, 122, 528, 122, 64, 64, 64, 122] + - [51, 8447.0] + - - [64, 228, 272, 228, 64, 64, 64, 228] + - [55, 7972.0] + - - [64, 112, 576, 111, 64, 64, 64, 111] + - [55, 7385.0] + - - [64, 143, 432, 143, 64, 64, 64, 143] + - [76, 7528.0] + - - [64, 135, 480, 135, 64, 64, 64, 135] + - [76, 7279.0] + - - [64, 232, 272, 228, 64, 64, 64, 228] + - [81, 8074.0] + - - [64, 193, 320, 193, 64, 64, 64, 193] + - [97, 7404.0] + - - [64, 71, 896, 71, 64, 64, 64, 71] + - [97, 5633.0] + - - [64, 84, 752, 84, 64, 64, 64, 84] + - [51, 6282.0] + - - [64, 132, 480, 132, 64, 64, 64, 132] + - [76, 7229.0] + - - [64, 85, 752, 85, 64, 64, 64, 85] + - [76, 6563.0] + - - [64, 102, 624, 100, 64, 64, 64, 100] + - [97, 7485.0] + - - [64, 78, 816, 77, 64, 64, 64, 77] + - [51, 6090.0] + - - [64, 112, 576, 112, 64, 64, 64, 112] + - [97, 8173.0] + - - [64, 148, 432, 148, 64, 64, 64, 148] + - [97, 7088.0] + - - [64, 159, 400, 160, 64, 64, 64, 160] + - [97, 8080.0] + - - [64, 102, 624, 101, 64, 64, 64, 101] + - [76, 7400.0] + - - [64, 101, 624, 101, 64, 64, 64, 101] + - [85, 7391.0] + - - [64, 160, 400, 160, 64, 64, 64, 160] + - [97, 8117.0] + - - [64, 93, 688, 93, 64, 64, 64, 93] + - [76, 6859.0] + - - [64, 147, 432, 147, 64, 64, 64, 147] + - [51, 6883.0] + - - [64, 100, 624, 102, 64, 64, 64, 102] + - [76, 7428.0] + - - [64, 177, 352, 177, 64, 64, 64, 177] + - [55, 7895.0] + - - [500, 1024, 1, 512, 500, 500, 500, 512] + - [53, 7858.0] + - - [512, 1024, 1, 512, 512, 512, 512, 512] + - [87, 7674.0] + - - [200, 2048, 1, 512, 200, 200, 200, 512] + - [109, 5894.0] + - - [512, 2000, 1, 1024, 512, 512, 512, 1024] + - [64, 9520.0] + - - [512, 2048, 1, 512, 512, 512, 512, 512] + - [109, 9512.0] + - - [200, 2000, 1, 100, 200, 200, 200, 100] + - [50, 3636.0] + - - [200, 2000, 1, 1024, 200, 200, 200, 1024] + - [109, 6200.0] + - - [500, 1024, 1, 2048, 500, 500, 500, 2048] + - [67, 8159.0] + - - [512, 2048, 1, 100, 512, 512, 512, 100] + - [54, 7114.0] + - - [512, 2048, 1, 2000, 512, 512, 512, 2000] + - [99, 10205.0] + - - [200, 2000, 1, 10, 200, 200, 200, 10] + - [70, 1149.0] + - - [500, 2048, 1, 1024, 500, 500, 500, 1024] + - [64, 9488.0] + - - [500, 2000, 1, 10, 500, 500, 500, 10] + - [114, 1374.0] + - - [500, 2048, 1, 100, 500, 500, 500, 100] + - [76, 6975.0] + - - [512, 1024, 1, 500, 512, 512, 512, 500] + - [65, 8096.0] + - - [200, 2000, 1, 2000, 200, 200, 200, 2000] + - [110, 6736.0] + - - [500, 2048, 1, 2000, 500, 500, 500, 2000] + - [53, 9936.0] + - - [512, 2048, 1, 1024, 512, 512, 512, 1024] + - [64, 9744.0] + - - [512, 1024, 1, 100, 512, 512, 512, 100] + - [53, 4690.0] + - - [256, 2000, 1, 10, 256, 256, 256, 10] + - [49, 1103.0] + - - [512, 2000, 1, 100, 512, 512, 512, 100] + - [64, 6267.0] + - - [512, 2000, 1, 2048, 512, 512, 512, 2048] + - [87, 9814.0] + - - [500, 1024, 1, 500, 500, 500, 500, 500] + - [76, 7331.0] + - - [256, 2000, 1, 100, 256, 256, 256, 100] + - [78, 4369.0] + - - [512, 1024, 1, 2048, 512, 512, 512, 2048] + - [116, 8495.0] + - - [500, 2048, 1, 2048, 500, 500, 500, 2048] + - [87, 9862.0] + - - [200, 2048, 1, 10, 200, 200, 200, 10] + - [49, 716.0] + - - [500, 2000, 1, 512, 500, 500, 500, 512] + - [87, 8957.0] + - - [500, 1024, 1, 1024, 500, 500, 500, 1024] + - [109, 7772.0] + - - [200, 2000, 1, 500, 200, 200, 200, 500] + - [87, 6105.0] + - - [256, 2048, 1, 100, 256, 256, 256, 100] + - [97, 5484.0] + - - [500, 2000, 1, 1024, 500, 500, 500, 1024] + - [87, 9352.0] + - - [256, 2048, 1, 1024, 256, 256, 256, 1024] + - [109, 8054.0] + - - [200, 2048, 1, 1024, 200, 200, 200, 1024] + - [109, 6367.0] + - - [512, 2048, 1, 500, 512, 512, 512, 500] + - [78, 9623.0] + - - [512, 2000, 1, 10, 512, 512, 512, 10] + - [57, 1620.0] + - - [500, 1024, 1, 2000, 500, 500, 500, 2000] + - [54, 8488.0] + - - [512, 2000, 1, 512, 512, 512, 512, 512] + - [87, 9240.0] + - - [500, 2000, 1, 2000, 500, 500, 500, 2000] + - [78, 9694.0] + - - [500, 1024, 1, 10, 500, 500, 500, 10] + - [104, 842.0] + - - [256, 2048, 1, 10, 256, 256, 256, 10] + - [84, 851.0] + - - [256, 2048, 1, 500, 256, 256, 256, 500] + - [109, 7620.0] + - - [256, 2048, 1, 2048, 256, 256, 256, 2048] + - [116, 8464.0] + - - [256, 2000, 1, 512, 256, 256, 256, 512] + - [109, 7314.0] + - - [512, 1024, 1, 2000, 512, 512, 512, 2000] + - [100, 8810.0] + - - [256, 2000, 1, 2000, 256, 256, 256, 2000] + - [54, 8515.0] + - - [256, 2048, 1, 2000, 256, 256, 256, 2000] + - [54, 8818.0] + - - [200, 2048, 1, 100, 200, 200, 200, 100] + - [89, 3814.0] + - - [200, 2000, 1, 2048, 200, 200, 200, 2048] + - [87, 6448.0] + - - [500, 2048, 1, 512, 500, 500, 500, 512] + - [87, 9188.0] + - - [500, 2000, 1, 500, 500, 500, 500, 500] + - [78, 9019.0] + - - [200, 2048, 1, 2048, 200, 200, 200, 2048] + - [87, 6595.0] + - - [200, 2048, 1, 500, 200, 200, 200, 500] + - [103, 6034.0] + - - [512, 2000, 1, 500, 512, 512, 512, 500] + - [78, 9255.0] + - - [200, 2048, 1, 2000, 200, 200, 200, 2000] + - [54, 6798.0] + - - [500, 1024, 1, 100, 500, 500, 500, 100] + - [77, 4324.0] + - - [512, 1024, 1, 10, 512, 512, 512, 10] + - [55, 892.0] + - - [512, 1024, 1, 1024, 512, 512, 512, 1024] + - [72, 8137.0] + - - [500, 2048, 1, 10, 500, 500, 500, 10] + - [102, 1290.0] + - - [200, 2000, 1, 512, 200, 200, 200, 512] + - [109, 5756.0] + - - [256, 2000, 1, 500, 256, 256, 256, 500] + - [87, 7348.0] + - - [256, 2048, 1, 512, 256, 256, 256, 512] + - [87, 7639.0] + - - [256, 2000, 1, 2048, 256, 256, 256, 2048] + - [88, 8294.0] + - - [500, 2048, 1, 500, 500, 500, 500, 500] + - [97, 9306.0] + - - [256, 2000, 1, 1024, 256, 256, 256, 1024] + - [87, 7802.0] + - - [500, 2000, 1, 2048, 500, 500, 500, 2048] + - [64, 9591.0] + - - [512, 2000, 1, 2000, 512, 512, 512, 2000] + - [99, 9891.0] + - - [512, 2048, 1, 2048, 512, 512, 512, 2048] + - [64, 10072.0] + - - [512, 2048, 1, 10, 512, 512, 512, 10] + - [51, 1324.0] + - - [500, 2000, 1, 100, 500, 500, 500, 100] + - [85, 5988.0] + - - [1024, 1131, 1, 1024, 1024, 1024, 1024, 1024] + - [65, 9645.0] + - - [1024, 1102, 1, 1024, 1024, 1024, 1024, 1024] + - [65, 9439.0] + - - [1024, 774, 1, 1024, 1024, 1024, 1024, 1024] + - [109, 8651.0] + - - [4096, 128, 1, 2048, 4096, 4096, 4096, 2048] + - [65, 8740.0] + - - [4096, 128, 1, 3072, 4096, 4096, 4096, 3072] + - [54, 8896.0] + - - [1024, 1120, 1, 1024, 1024, 1024, 1024, 1024] + - [87, 9519.0] + - - [1024, 1015, 1, 1024, 1024, 1024, 1024, 1024] + - [109, 9706.0] + - - [1024, 992, 1, 1024, 1024, 1024, 1024, 1024] + - [109, 9536.0] + - - [1024, 950, 1, 1024, 1024, 1024, 1024, 1024] + - [109, 10475.0] + - - [1024, 1088, 1, 1024, 1024, 1024, 1024, 1024] + - [87, 10362.0] + - - [64, 128, 96, 128, 64, 64, 64, 128] + - [58, 5707.0] + - - [768, 1024, 1, 3072, 768, 768, 768, 3072] + - [64, 10305.0] + - - [768, 512, 1, 3072, 768, 768, 768, 3072] + - [109, 8259.0] + - - [64, 256, 192, 256, 64, 64, 64, 256] + - [111, 8061.0] + - - [64, 128, 384, 128, 64, 64, 64, 128] + - [101, 8295.0] + - - [64, 256, 96, 256, 64, 64, 64, 256] + - [67, 8480.0] + - - [6272, 112, 1, 512, 6272, 6272, 6272, 512] + - [99, 8137.0] + - - [2048, 320, 1, 1280, 2048, 2048, 2048, 1280] + - [109, 10102.0] + - - [5329, 64, 1, 448, 5329, 5329, 5329, 448] + - [109, 5403.0] + - - [784, 64, 32, 192, 784, 784, 784, 192] + - [87, 8024.0] + - - [6272, 64, 1, 480, 6272, 6272, 6272, 480] + - [64, 6818.0] + - - [6272, 64, 1, 512, 6272, 6272, 6272, 512] + - [106, 6962.0] + - - [6272, 160, 1, 528, 6272, 6272, 6272, 528] + - [78, 8119.0] + - - [289, 160, 32, 768, 289, 289, 289, 768] + - [53, 6759.0] + - - [5329, 64, 32, 160, 5329, 5329, 5329, 160] + - [52, 7533.0] + - - [5329, 96, 1, 576, 5329, 5329, 5329, 576] + - [53, 6476.0] + - - [1225, 64, 32, 288, 1225, 1225, 1225, 288] + - [99, 10626.0] + - - [289, 192, 32, 768, 289, 289, 289, 768] + - [64, 8285.0] + - - [2048, 448, 1, 1280, 2048, 2048, 2048, 1280] + - [53, 10327.0] + - - [3136, 64, 32, 64, 3136, 3136, 3136, 64] + - [99, 10333.0] + - - [6272, 128, 1, 528, 6272, 6272, 6272, 528] + - [53, 10314.0] + - - [6272, 96, 1, 480, 6272, 6272, 6272, 480] + - [53, 7495.0] + - - [2048, 448, 1, 2048, 2048, 2048, 2048, 2048] + - [109, 10168.0] + - - [784, 96, 32, 192, 784, 784, 784, 192] + - [106, 7454.0] + - - [1001, 512, 1, 4096, 1001, 1001, 1001, 4096] + - [59, 8631.0] + - - [2048, 192, 1, 1280, 2048, 2048, 2048, 1280] + - [78, 7784.0] + - - [1225, 64, 32, 256, 1225, 1225, 1225, 256] + - [99, 10342.0] + - - [2048, 256, 1, 1536, 2048, 2048, 2048, 1536] + - [65, 8717.0] + - - [6272, 128, 1, 512, 6272, 6272, 6272, 512] + - [87, 9458.0] + - - [1568, 384, 1, 832, 1568, 1568, 1568, 832] + - [79, 9192.0] + - - [1568, 256, 1, 832, 1568, 1568, 1568, 832] + - [53, 7583.0] + - - [1568, 192, 1, 832, 1568, 1568, 1568, 832] + - [51, 7495.0] + - - [289, 192, 32, 1024, 289, 289, 289, 1024] + - [109, 8312.0] + - - [1225, 64, 32, 384, 1225, 1225, 1225, 384] + - [53, 9844.0] + - - [2048, 320, 1, 2048, 2048, 2048, 2048, 2048] + - [53, 10644.0] + - - [2048, 384, 1, 1536, 2048, 2048, 2048, 1536] + - [99, 10087.0] + - - [5041, 96, 1, 576, 5041, 5041, 5041, 576] + - [53, 6697.0] + - - [6272, 192, 1, 480, 6272, 6272, 6272, 480] + - [99, 9650.0] + - - [5041, 192, 1, 720, 5041, 5041, 5041, 720] + - [99, 9935.0] + - - [289, 128, 32, 768, 289, 289, 289, 768] + - [110, 7790.0] + - - [12544, 64, 1, 147, 12544, 12544, 12544, 147] + - [64, 6637.0] + - - [6272, 160, 1, 512, 6272, 6272, 6272, 512] + - [99, 8066.0] + - - [1225, 64, 32, 192, 1225, 1225, 1225, 192] + - [97, 10107.0] + - - [784, 64, 32, 256, 784, 784, 784, 256] + - [87, 9014.0] + - - [6272, 144, 1, 512, 6272, 6272, 6272, 512] + - [78, 7643.0] + - - [8192, 192, 1, 1280, 8192, 8192, 8192, 1280] + - [53, 10670.0] + - - [8192, 192, 1, 2048, 8192, 8192, 8192, 2048] + - [53, 10904.0] + - - [65, 6400, 1, 1024, 65, 65, 65, 1024] + - [109, 5121.0] + - - [512, 1290, 1, 2048, 512, 512, 512, 2048] + - [109, 8749.0] + - - [512, 2205, 1, 2048, 512, 512, 512, 2048] + - [87, 10711.0] + - - [64, 512, 16, 512, 64, 64, 64, 512] + - [111, 6765.0] + - - [512, 600, 1, 2048, 512, 512, 512, 2048] + - [93, 8509.0] + - - [512, 644, 1, 512, 512, 512, 512, 512] + - [78, 6366.0] + - - [512, 644, 1, 2048, 512, 512, 512, 2048] + - [64, 6691.0] + - - [512, 668, 1, 2048, 512, 512, 512, 2048] + - [87, 6946.0] + - - [512, 714, 1, 512, 512, 512, 512, 512] + - [64, 6694.0] + - - [512, 714, 1, 2048, 512, 512, 512, 2048] + - [87, 7401.0] + - - [512, 720, 1, 512, 512, 512, 512, 512] + - [87, 6567.0] + - - [512, 720, 1, 2048, 512, 512, 512, 2048] + - [113, 7572.0] + - - [512, 722, 1, 2048, 512, 512, 512, 2048] + - [53, 7616.0] + - - [512, 781, 1, 512, 512, 512, 512, 512] + - [62, 7271.0] + - - [512, 781, 1, 2048, 512, 512, 512, 2048] + - [53, 8088.0] + - - [512, 848, 1, 2048, 512, 512, 512, 2048] + - [64, 8748.0] + - - [512, 872, 1, 2048, 512, 512, 512, 2048] + - [64, 8956.0] + - - [512, 936, 1, 512, 512, 512, 512, 512] + - [109, 8250.0] + - - [512, 936, 1, 2048, 512, 512, 512, 2048] + - [64, 9581.0] + - - [512, 980, 1, 512, 512, 512, 512, 512] + - [88, 7551.0] + - - [512, 980, 1, 2048, 512, 512, 512, 2048] + - [65, 8356.0] + - - [512, 1139, 1, 2048, 512, 512, 512, 2048] + - [110, 9426.0] + - - [512, 1184, 1, 2048, 512, 512, 512, 2048] + - [88, 9733.0] + - - [512, 1186, 1, 2048, 512, 512, 512, 2048] + - [88, 9887.0] + - - [512, 1232, 1, 512, 512, 512, 512, 512] + - [87, 9201.0] + - - [512, 1232, 1, 2048, 512, 512, 512, 2048] + - [65, 10299.0] + - - [512, 1279, 1, 2048, 512, 512, 512, 2048] + - [65, 10408.0] + - - [512, 1290, 1, 512, 512, 512, 512, 512] + - [109, 7778.0] + - - [512, 1327, 1, 2048, 512, 512, 512, 2048] + - [64, 8844.0] + - - [512, 1331, 1, 2048, 512, 512, 512, 2048] + - [64, 8828.0] + - - [512, 1341, 1, 2048, 512, 512, 512, 2048] + - [109, 8878.0] + - - [512, 1350, 1, 512, 512, 512, 512, 512] + - [109, 8265.0] + - - [512, 1350, 1, 2048, 512, 512, 512, 2048] + - [64, 9021.0] + - - [512, 1359, 1, 2048, 512, 512, 512, 2048] + - [64, 9093.0] + - - [512, 1391, 1, 2048, 512, 512, 512, 2048] + - [64, 9267.0] + - - [512, 1424, 1, 512, 512, 512, 512, 512] + - [64, 8930.0] + - - [512, 1424, 1, 2048, 512, 512, 512, 2048] + - [87, 9494.0] + - - [512, 1458, 1, 512, 512, 512, 512, 512] + - [109, 9170.0] + - - [512, 1458, 1, 2048, 512, 512, 512, 2048] + - [53, 9761.0] + - - [512, 1462, 1, 512, 512, 512, 512, 512] + - [109, 8703.0] + - - [512, 1462, 1, 2048, 512, 512, 512, 2048] + - [87, 9807.0] + - - [512, 1467, 1, 2048, 512, 512, 512, 2048] + - [87, 9643.0] + - - [512, 1472, 1, 2048, 512, 512, 512, 2048] + - [87, 9790.0] + - - [512, 1520, 1, 512, 512, 512, 512, 512] + - [109, 9064.0] + - - [512, 1520, 1, 2048, 512, 512, 512, 2048] + - [87, 10070.0] + - - [512, 1596, 1, 512, 512, 512, 512, 512] + - [64, 9415.0] + - - [512, 1596, 1, 2048, 512, 512, 512, 2048] + - [64, 10461.0] + - - [512, 1599, 1, 512, 512, 512, 512, 512] + - [64, 9315.0] + - - [512, 1599, 1, 2048, 512, 512, 512, 2048] + - [64, 10515.0] + - - [512, 1615, 1, 512, 512, 512, 512, 512] + - [87, 8518.0] + - - [512, 1615, 1, 2048, 512, 512, 512, 2048] + - [65, 9285.0] + - - [512, 1680, 1, 512, 512, 512, 512, 512] + - [88, 9179.0] + - - [512, 1680, 1, 2048, 512, 512, 512, 2048] + - [110, 9671.0] + - - [512, 1709, 1, 2048, 512, 512, 512, 2048] + - [65, 9809.0] + - - [512, 1890, 1, 512, 512, 512, 512, 512] + - [64, 9580.0] + - - [512, 1902, 1, 2048, 512, 512, 512, 2048] + - [65, 10850.0] + - - [512, 1917, 1, 512, 512, 512, 512, 512] + - [109, 9671.0] + - - [512, 1917, 1, 2048, 512, 512, 512, 2048] + - [88, 10860.0] + - - [512, 2076, 1, 2048, 512, 512, 512, 2048] + - [64, 10156.0] + - - [512, 2195, 1, 2048, 512, 512, 512, 2048] + - [64, 10693.0] + - - [512, 2205, 1, 512, 512, 512, 512, 512] + - [87, 9830.0] + - - [2048, 198, 1, 512, 2048, 2048, 2048, 512] + - [99, 5875.0] + - - [2048, 207, 1, 512, 2048, 2048, 2048, 512] + - [99, 6205.0] + - - [2048, 208, 1, 512, 2048, 2048, 2048, 512] + - [64, 6480.0] + - - [2048, 245, 1, 512, 2048, 2048, 2048, 512] + - [99, 7148.0] + - - [2048, 246, 1, 512, 2048, 2048, 2048, 512] + - [99, 7201.0] + - - [2048, 264, 1, 512, 2048, 2048, 2048, 512] + - [99, 7707.0] + - - [2048, 401, 1, 512, 2048, 2048, 2048, 512] + - [99, 8396.0] + - - [2048, 439, 1, 512, 2048, 2048, 2048, 512] + - [99, 9112.0] + - - [2048, 443, 1, 512, 2048, 2048, 2048, 512] + - [78, 9206.0] + - - [2048, 446, 1, 512, 2048, 2048, 2048, 512] + - [78, 9221.0] + - - [2048, 465, 1, 512, 2048, 2048, 2048, 512] + - [99, 8618.0] + - - [2048, 468, 1, 512, 2048, 2048, 2048, 512] + - [78, 8658.0] + - - [2048, 493, 1, 512, 2048, 2048, 2048, 512] + - [109, 9258.0] + - - [2048, 495, 1, 512, 2048, 2048, 2048, 512] + - [99, 9135.0] + - - [2048, 511, 1, 512, 2048, 2048, 2048, 512] + - [109, 9494.0] + - - [2048, 512, 1, 512, 2048, 2048, 2048, 512] + - [87, 9737.0] + - - [2048, 540, 1, 512, 2048, 2048, 2048, 512] + - [53, 9118.0] + - - [2048, 550, 1, 512, 2048, 2048, 2048, 512] + - [64, 9335.0] + - - [2048, 560, 1, 512, 2048, 2048, 2048, 512] + - [78, 9434.0] + - - [2048, 600, 1, 512, 2048, 2048, 2048, 512] + - [109, 9833.0] + - - [64, 64, 496, 64, 64, 64, 64, 64] + - [106, 5805.0] + - - [64, 65, 496, 64, 64, 64, 64, 64] + - [85, 4063.0] + - - [64, 65, 496, 65, 64, 64, 64, 65] + - [76, 3812.0] + - - [64, 70, 216, 70, 64, 64, 64, 70] + - [50, 3311.0] + - - [64, 71, 216, 71, 64, 64, 64, 71] + - [106, 3477.0] + - - [64, 78, 248, 77, 64, 64, 64, 77] + - [62, 4373.0] + - - [64, 80, 152, 80, 64, 64, 64, 80] + - [106, 3706.0] + - - [64, 93, 344, 93, 64, 64, 64, 93] + - [76, 5097.0] + - - [64, 102, 312, 102, 64, 64, 64, 102] + - [67, 5447.0] + - - [64, 122, 264, 122, 64, 64, 64, 122] + - [89, 6089.0] + - - [64, 122, 264, 123, 64, 64, 64, 123] + - [67, 6386.0] + - - [64, 123, 264, 123, 64, 64, 64, 123] + - [101, 6340.0] + - - [64, 512, 96, 512, 64, 64, 64, 512] + - [68, 7371.0] + - - [64, 512, 128, 512, 64, 64, 64, 512] + - [68, 7109.0] + - - [64, 128, 512, 128, 64, 64, 64, 128] + - [101, 8337.0] + - - [64, 512, 64, 512, 64, 64, 64, 512] + - [89, 8374.0] + - - [2048, 512, 1, 2048, 2048, 2048, 2048, 2048] + - [87, 10050.0] + - - [512, 1600, 1, 32, 512, 512, 512, 32] + - [86, 2972.0] + - - [512, 1600, 1, 512, 512, 512, 512, 512] + - [87, 9745.0] + - - [560, 1600, 1, 1024, 560, 560, 560, 1024] + - [64, 8430.0] + - - [1024, 512, 1, 3072, 1024, 1024, 1024, 3072] + - [65, 8710.0] + - - [64, 192, 64, 1280, 64, 64, 64, 1280] + - [95, 4875.0] + - - [64, 320, 64, 1280, 64, 64, 64, 1280] + - [89, 7607.0] + - - [64, 384, 64, 1280, 64, 64, 64, 1280] + - [94, 7260.0] + - - [64, 448, 64, 1280, 64, 64, 64, 1280] + - [94, 7304.0] + - - [64, 192, 64, 2048, 64, 64, 64, 2048] + - [73, 6586.0] + - - [64, 320, 64, 2048, 64, 64, 64, 2048] + - [95, 6675.0] + - - [64, 384, 64, 2048, 64, 64, 64, 2048] + - [118, 6492.0] + - - [64, 448, 64, 2048, 64, 64, 64, 2048] + - [94, 6545.0] + - - [1225, 64, 64, 192, 1225, 1225, 1225, 192] + - [78, 10545.0] + - - [1225, 64, 64, 256, 1225, 1225, 1225, 256] + - [103, 10040.0] + - - [1225, 64, 64, 288, 1225, 1225, 1225, 288] + - [99, 10633.0] + - - [5329, 80, 64, 64, 5329, 5329, 5329, 64] + - [76, 5506.0] + - - [3136, 64, 64, 64, 3136, 3136, 3136, 64] + - [99, 10050.0] + - - [3136, 64, 64, 256, 3136, 3136, 3136, 256] + - [103, 8923.0] + - - [64, 192, 32, 1280, 64, 64, 64, 1280] + - [89, 7654.0] + - - [64, 320, 32, 1280, 64, 64, 64, 1280] + - [67, 9130.0] + - - [64, 384, 32, 1280, 64, 64, 64, 1280] + - [60, 6778.0] + - - [64, 448, 32, 1280, 64, 64, 64, 1280] + - [89, 7626.0] + - - [64, 192, 32, 2048, 64, 64, 64, 2048] + - [112, 5322.0] + - - [64, 320, 32, 2048, 64, 64, 64, 2048] + - [111, 7172.0] + - - [64, 384, 32, 2048, 64, 64, 64, 2048] + - [111, 7300.0] + - - [64, 448, 32, 2048, 64, 64, 64, 2048] + - [67, 6817.0] + - - [5329, 80, 32, 64, 5329, 5329, 5329, 64] + - [78, 6213.0] + - - [3136, 64, 32, 256, 3136, 3136, 3136, 256] + - [82, 9391.0] + - - [196, 256, 32, 1024, 196, 196, 196, 1024] + - [64, 8475.0] + - - [256, 4096, 1, 4, 256, 256, 256, 4] + - [61, 510.0] + - - [960, 1024, 1, 1024, 960, 960, 960, 1024] + - [64, 9232.0] + - - [768, 768, 1, 768, 768, 768, 768, 768] + - [57, 9078.0] + - - [768, 768, 1, 384, 768, 768, 768, 384] + - [78, 8248.0] + - - [100, 128, 120, 512, 100, 100, 100, 512] + - [87, 8433.0] + - - [100, 128, 139, 512, 100, 100, 100, 512] + - [64, 8495.0] + - - [100, 128, 160, 512, 100, 100, 100, 512] + - [109, 8520.0] + - - [22500, 64, 1, 147, 22500, 22500, 22500, 147] + - [99, 8563.0] + - - [1024, 960, 1, 1024, 1024, 1024, 1024, 1024] + - [64, 10836.0] + - - [1024, 616, 1, 1024, 1024, 1024, 1024, 1024] + - [88, 9524.0] + - - [64, 128, 128, 128, 64, 64, 64, 128] + - [112, 5542.0] + - - [64, 128, 160, 128, 64, 64, 64, 128] + - [71, 5962.0] + - - [1024, 1024, 1, 2, 1024, 1024, 1024, 2] + - [77, 277.0] + - - [64, 128, 624, 128, 64, 64, 64, 128] + - [92, 7084.0] + - - [1024, 780, 1, 1024, 1024, 1024, 1024, 1024] + - [65, 9062.0] + - - [64, 128, 640, 128, 64, 64, 64, 128] + - [111, 7963.0] + - - [1024, 800, 1, 1024, 1024, 1024, 1024, 1024] + - [65, 9239.0] + - - [64, 128, 656, 128, 64, 64, 64, 128] + - [115, 6891.0] + - - [1024, 820, 1, 1024, 1024, 1024, 1024, 1024] + - [65, 9470.0] + - - [64, 512, 80, 512, 64, 64, 64, 512] + - [89, 7392.0] + - - [1024, 385, 1, 1024, 1024, 1024, 1024, 1024] + - [109, 7728.0] + - - [1024, 462, 1, 1024, 1024, 1024, 1024, 1024] + - [53, 7656.0] + - - [64, 128, 144, 128, 64, 64, 64, 128] + - [56, 5954.0] + - - [1024, 960, 1, 64, 1024, 1024, 1024, 64] + - [78, 5252.0] + - - [64, 512, 256, 512, 64, 64, 64, 512] + - [117, 7372.0] + - - [64, 512, 40, 512, 64, 64, 64, 512] + - [89, 7626.0] + - - [96, 1024, 64, 1024, 96, 96, 96, 1024] + - [93, 8192.0] + - - [96, 1024, 128, 1024, 96, 96, 96, 1024] + - [69, 8270.0] + - - [64, 1024, 256, 1024, 64, 64, 64, 1024] + - [94, 7320.0] + - - [64, 1024, 32, 1024, 64, 64, 64, 1024] + - [117, 6963.0] + - - [64, 1024, 64, 1024, 64, 64, 64, 1024] + - [73, 7185.0] + - - [64, 1024, 128, 1024, 64, 64, 64, 1024] + - [94, 7233.0] + - - [64, 128, 1024, 128, 64, 64, 64, 128] + - [74, 5456.0] + - - [1024, 864, 1, 1024, 1024, 1024, 1024, 1024] + - [65, 9671.0] + - - [1024, 864, 1, 512, 1024, 1024, 1024, 512] + - [99, 8995.0] + - - [256, 3456, 1, 128, 256, 256, 256, 128] + - [109, 6806.0] + - - [256, 4096, 1, 128, 256, 256, 256, 128] + - [53, 7318.0] + - - [480, 864, 1, 1024, 480, 480, 480, 1024] + - [87, 7888.0] + - - [512, 864, 1, 256, 512, 512, 512, 256] + - [109, 6284.0] + - - [64, 128, 1280, 128, 64, 64, 64, 128] + - [96, 5318.0] + - - [64, 128, 1312, 128, 64, 64, 64, 128] + - [119, 5329.0] + - - [64, 512, 192, 512, 64, 64, 64, 512] + - [73, 7247.0] + - - [256, 4096, 1, 1, 256, 256, 256, 1] + - [77, 202.0] + - - [64, 128, 2048, 128, 64, 64, 64, 128] + - [96, 5370.0] + - - [64, 128, 1536, 128, 64, 64, 64, 128] + - [96, 5352.0] + - - [64, 128, 192, 128, 64, 64, 64, 128] + - [92, 6811.0] + - - [64, 384, 144, 384, 64, 64, 64, 384] + - [67, 7926.0] + - - [64, 512, 48, 512, 64, 64, 64, 512] + - [89, 8302.0] + - - [64, 128, 256, 128, 64, 64, 64, 128] + - [81, 6310.0] + - - [64, 384, 192, 384, 64, 64, 64, 384] + - [115, 7277.0] + - - [950, 512, 2, 2048, 950, 950, 950, 2048] + - [64, 9457.0] + - - [3400, 256, 1, 1024, 3400, 3400, 3400, 1024] + - [110, 9528.0] + - - [3800, 256, 1, 1024, 3800, 3800, 3800, 1024] + - [87, 10457.0] + - - [850, 512, 2, 2048, 850, 850, 850, 2048] + - [65, 9618.0] + - - [805, 512, 2, 2048, 805, 805, 805, 2048] + - [88, 9118.0] + - - [864, 512, 2, 2048, 864, 864, 864, 2048] + - [65, 9833.0] + - - [950, 256, 2, 2048, 950, 950, 950, 2048] + - [89, 8775.0] + - - [888, 512, 2, 2048, 888, 888, 888, 2048] + - [65, 10031.0] + - - [51520, 64, 2, 256, 51520, 51520, 51520, 256] + - [69, 9955.0] + - - [46464, 64, 2, 256, 46464, 46464, 46464, 256] + - [57, 10604.0] + - - [49152, 64, 2, 256, 49152, 49152, 49152, 256] + - [69, 8324.0] + - - [1900, 512, 1, 1024, 1900, 1900, 1900, 1024] + - [53, 10783.0] + - - [1700, 512, 1, 1024, 1700, 1700, 1700, 1024] + - [109, 9410.0] + - - [1610, 512, 1, 1024, 1610, 1610, 1610, 1024] + - [88, 9022.0] + - - [1536, 512, 1, 1024, 1536, 1536, 1536, 1024] + - [87, 9945.0] + - - [1728, 512, 1, 1024, 1728, 1728, 1728, 1024] + - [88, 9521.0] + - - [1024, 1024, 1, 320, 1024, 1024, 1024, 320] + - [53, 9198.0] + - - [51520, 64, 2, 64, 51520, 51520, 51520, 64] + - [76, 10642.0] + - - [55296, 64, 2, 64, 55296, 55296, 55296, 64] + - [87, 10495.0] + - - [49152, 64, 2, 64, 49152, 49152, 49152, 64] + - [87, 9769.0] + - - [54400, 64, 2, 64, 54400, 54400, 54400, 64] + - [87, 10811.0] + - - [42240, 64, 2, 256, 42240, 42240, 42240, 256] + - [91, 10750.0] + - - [672, 512, 2, 2048, 672, 672, 672, 2048] + - [87, 9115.0] + - - [54400, 64, 2, 256, 54400, 54400, 54400, 256] + - [69, 9742.0] + - - [56832, 64, 2, 256, 56832, 56832, 56832, 256] + - [69, 9369.0] + - - [55296, 64, 2, 256, 55296, 55296, 55296, 256] + - [57, 9446.0] + - - [60800, 64, 2, 64, 60800, 60800, 60800, 64] + - [99, 10847.0] + - - [660, 512, 2, 2048, 660, 660, 660, 2048] + - [64, 9005.0] + - - [768, 512, 2, 2048, 768, 768, 768, 2048] + - [64, 10163.0] + - - [43008, 64, 2, 256, 43008, 43008, 43008, 256] + - [57, 10662.0] + - - [864, 256, 2, 2048, 864, 864, 864, 2048] + - [64, 9127.0] + - - [726, 512, 2, 2048, 726, 726, 726, 2048] + - [64, 9415.0] + - - [768, 256, 2, 2048, 768, 768, 768, 2048] + - [64, 7937.0] + - - [45632, 64, 2, 256, 45632, 45632, 45632, 256] + - [91, 10512.0] + - - [713, 512, 2, 2048, 713, 713, 713, 2048] + - [64, 9550.0] + - - [805, 256, 2, 2048, 805, 805, 805, 2048] + - [64, 8298.0] + - - [60800, 64, 2, 256, 60800, 60800, 60800, 256] + - [69, 9453.0] + - - [850, 256, 2, 2048, 850, 850, 850, 2048] + - [64, 8951.0] + - - [1024, 1024, 1, 81, 1024, 1024, 1024, 81] + - [51, 5716.0] + - - [96, 1024, 160, 1024, 96, 96, 96, 1024] + - [91, 8289.0] + - - [96, 1024, 40, 1024, 96, 96, 96, 1024] + - [72, 8117.0] + - - [96, 1024, 80, 1024, 96, 96, 96, 1024] + - [116, 8199.0] + - - [96, 1024, 96, 1024, 96, 96, 96, 1024] + - [113, 8218.0] + - - [96, 1024, 24, 1024, 96, 96, 96, 1024] + - [93, 8162.0] + - - [96, 1024, 48, 1024, 96, 96, 96, 1024] + - [69, 8045.0] + - - [96, 1024, 16, 1024, 96, 96, 96, 1024] + - [64, 8288.0] + - - [96, 1024, 32, 1024, 96, 96, 96, 1024] + - [93, 8155.0] + - - [64, 512, 320, 512, 64, 64, 64, 512] + - [94, 7377.0] + - - [64, 1024, 512, 1024, 64, 64, 64, 1024] + - [117, 7364.0] + - - [1024, 80, 1, 30522, 1024, 1024, 1024, 30522] + - [125, 5615.0] + - - [1024, 120, 1, 30522, 1024, 1024, 1024, 30522] + - [126, 8302.0] + - - [1024, 77, 1, 30522, 1024, 1024, 1024, 30522] + - [121, 5406.0] + - - [1024, 200, 1, 30522, 1024, 1024, 1024, 30522] + - [120, 8008.0] + - - [1024, 160, 1, 30522, 1024, 1024, 1024, 30522] + - [120, 8703.0] + - - [1024, 180, 1, 30522, 1024, 1024, 1024, 30522] + - [120, 9746.0] + - - [1024, 160, 1, 30528, 1024, 1024, 1024, 30528] + - [122, 8702.0] + - - [1024, 240, 1, 30528, 1024, 1024, 1024, 30528] + - [120, 9570.0] + - - [2560, 109, 1, 29000, 2560, 2560, 2560, 29000] + - [121, 9783.0] + - - [2560, 121, 1, 29000, 2560, 2560, 2560, 29000] + - [123, 10771.0] + - - [2560, 65, 1, 29000, 2560, 2560, 2560, 29000] + - [121, 5928.0] + - - [2560, 66, 1, 29000, 2560, 2560, 2560, 29000] + - [124, 6014.0] + - - [2560, 67, 1, 29000, 2560, 2560, 2560, 29000] + - [126, 6112.0] + - - [2560, 69, 1, 29000, 2560, 2560, 2560, 29000] + - [126, 6285.0] + - - [2560, 70, 1, 29000, 2560, 2560, 2560, 29000] + - [123, 6361.0] + - - [2560, 71, 1, 29000, 2560, 2560, 2560, 29000] + - [124, 6463.0] + - - [2560, 73, 1, 29000, 2560, 2560, 2560, 29000] + - [126, 6641.0] + - - [2560, 74, 1, 29000, 2560, 2560, 2560, 29000] + - [124, 6727.0] + - - [2560, 75, 1, 29000, 2560, 2560, 2560, 29000] + - [126, 6820.0] + - - [2560, 77, 1, 29000, 2560, 2560, 2560, 29000] + - [121, 7001.0] + - - [2560, 78, 1, 29000, 2560, 2560, 2560, 29000] + - [124, 7084.0] + - - [2560, 80, 1, 29000, 2560, 2560, 2560, 29000] + - [124, 7263.0] + - - [2560, 81, 1, 29000, 2560, 2560, 2560, 29000] + - [126, 7355.0] + - - [2560, 82, 1, 29000, 2560, 2560, 2560, 29000] + - [124, 7445.0] + - - [2560, 83, 1, 29000, 2560, 2560, 2560, 29000] + - [124, 7520.0] + - - [2560, 84, 1, 29000, 2560, 2560, 2560, 29000] + - [124, 7602.0] + - - [2560, 88, 1, 29000, 2560, 2560, 2560, 29000] + - [126, 7978.0] + - - [2560, 89, 1, 29000, 2560, 2560, 2560, 29000] + - [125, 8036.0] + - - [2560, 90, 1, 29000, 2560, 2560, 2560, 29000] + - [124, 8150.0] + - - [2560, 92, 1, 29000, 2560, 2560, 2560, 29000] + - [123, 8296.0] + - - [2560, 95, 1, 29000, 2560, 2560, 2560, 29000] + - [124, 8592.0] + - - [2560, 98, 1, 29000, 2560, 2560, 2560, 29000] + - [126, 8844.0] + - - [512, 200, 1, 32, 512, 512, 512, 32] + - [178, 964.0] + - - [1024, 200, 1, 1, 1024, 1024, 1024, 1] + - [131, 103.0] + - - [512, 200, 1, 1, 512, 512, 512, 1] + - [157, 59.0] + - - [768, 320, 1, 768, 768, 768, 768, 768] + - [132, 4781.0] + - - [768, 160, 1, 768, 768, 768, 768, 768] + - [181, 4251.0] + - - [1024, 120, 1, 1024, 1024, 1024, 1024, 1024] + - [132, 3603.0] + - - [1024, 160, 1, 1024, 1024, 1024, 1024, 1024] + - [130, 4761.0] + - - [2368, 64, 1, 3328, 2368, 2368, 2368, 3328] + - [146, 4859.0] + - - [64, 3584, 1, 1280, 64, 64, 64, 1280] + - [172, 4893.0] + - - [1408, 64, 1, 128, 1408, 1408, 1408, 128] + - [170, 1634.0] + - - [1408, 64, 1, 1280, 1408, 1408, 1408, 1280] + - [181, 3421.0] + - - [4096, 32, 1, 4096, 4096, 4096, 4096, 4096] + - [130, 4235.0] + - - [3072, 64, 1, 1024, 3072, 3072, 3072, 1024] + - [181, 4466.0] + - - [2944, 64, 1, 256, 2944, 2944, 2944, 256] + - [130, 3589.0] + - - [448, 448, 1, 3328, 448, 448, 448, 3328] + - [130, 4912.0] + - - [1024, 256, 1, 3328, 1024, 1024, 1024, 3328] + - [156, 4723.0] + - - [6144, 32, 1, 2560, 6144, 6144, 6144, 2560] + - [156, 4713.0] + - - [1856, 64, 1, 1280, 1856, 1856, 1856, 1280] + - [193, 4574.0] + - - [704, 128, 1, 1280, 704, 704, 704, 1280] + - [130, 3441.0] + - - [4288, 64, 1, 3328, 4288, 4288, 4288, 3328] + - [181, 4917.0] + - - [64, 3584, 1, 3328, 64, 64, 64, 3328] + - [146, 4835.0] + - - [1760, 128, 1, 1760, 1760, 1760, 1760, 1760] + - [195, 4591.0] + - - [704, 256, 1, 128, 704, 704, 704, 128] + - [156, 2580.0] + - - [128, 1408, 1, 128, 128, 128, 128, 128] + - [130, 2639.0] + - - [1024, 256, 1, 256, 1024, 1024, 1024, 256] + - [156, 3866.0] + - - [448, 448, 1, 256, 448, 448, 448, 256] + - [130, 3712.0] + - - [7680, 32, 1, 2560, 7680, 7680, 7680, 2560] + - [130, 5015.0] + - - [128, 1024, 1, 3328, 128, 128, 128, 3328] + - [144, 4146.0] + - - [64, 1856, 1, 1280, 64, 64, 64, 1280] + - [144, 4456.0] + - - [256, 1024, 1, 256, 256, 256, 256, 256] + - [156, 3875.0] + - - [1024, 128, 1, 1280, 1024, 1024, 1024, 1280] + - [132, 3953.0] + - - [3072, 32, 1, 1024, 3072, 3072, 3072, 1024] + - [181, 3550.0] + - - [448, 256, 1, 3328, 448, 448, 448, 3328] + - [156, 4635.0] + - - [128, 1024, 1, 128, 128, 128, 128, 128] + - [144, 2151.0] + - - [128, 704, 1, 1280, 128, 128, 128, 1280] + - [168, 3639.0] + - - [1856, 128, 1, 3328, 1856, 1856, 1856, 3328] + - [132, 4958.0] + - - [35, 8457, 1, 1760, 35, 35, 35, 1760] + - [156, 2855.0] + - - [64, 2944, 1, 128, 64, 64, 64, 128] + - [181, 2650.0] + - - [8448, 32, 1, 2816, 8448, 8448, 8448, 2816] + - [156, 4744.0] + - - [1408, 128, 1, 1280, 1408, 1408, 1408, 1280] + - [170, 4471.0] + - - [128, 1856, 1, 1280, 128, 128, 128, 1280] + - [146, 5109.0] + - - [256, 448, 1, 256, 256, 256, 256, 256] + - [130, 2884.0] + - - [2048, 128, 1, 2048, 2048, 2048, 2048, 2048] + - [156, 4673.0] + - - [128, 1856, 1, 128, 128, 128, 128, 128] + - [144, 3221.0] + - - [64, 1408, 1, 3328, 64, 64, 64, 3328] + - [144, 3691.0] + - - [128, 1408, 1, 256, 128, 128, 128, 256] + - [144, 3358.0] + - - [35, 8457, 1, 2560, 35, 35, 35, 2560] + - [144, 2812.0] + - - [4288, 64, 1, 128, 4288, 4288, 4288, 128] + - [170, 3205.0] + - - [256, 448, 1, 3328, 256, 256, 256, 3328] + - [181, 4699.0] + - - [64, 2368, 1, 1280, 64, 64, 64, 1280] + - [193, 4478.0] + - - [2368, 64, 1, 256, 2368, 2368, 2368, 256] + - [146, 4016.0] + - - [1024, 128, 1, 128, 1024, 1024, 1024, 128] + - [193, 2113.0] + - - [704, 128, 1, 3328, 704, 704, 704, 3328] + - [130, 3697.0] + - - [4288, 64, 1, 1280, 4288, 4288, 4288, 1280] + - [156, 4756.0] + - - [2560, 64, 1, 2560, 2560, 2560, 2560, 2560] + - [158, 4985.0] + - - [1408, 128, 1, 128, 1408, 1408, 1408, 128] + - [130, 2610.0] + - - [128, 1024, 1, 1280, 128, 128, 128, 1280] + - [130, 3924.0] + - - [2944, 64, 1, 128, 2944, 2944, 2944, 128] + - [195, 3250.0] + - - [1024, 128, 1, 3328, 1024, 1024, 1024, 3328] + - [182, 4108.0] + - - [704, 128, 1, 256, 704, 704, 704, 256] + - [193, 2293.0] + - - [448, 256, 1, 1280, 448, 448, 448, 1280] + - [156, 4325.0] + - - [64, 4288, 1, 3328, 64, 64, 64, 3328] + - [144, 5113.0] + - - [2944, 64, 1, 3328, 2944, 2944, 2944, 3328] + - [144, 4674.0] + - - [1856, 128, 1, 1280, 1856, 1856, 1856, 1280] + - [158, 4789.0] + - - [64, 3584, 1, 256, 64, 64, 64, 256] + - [146, 3793.0] + - - [3584, 64, 1, 128, 3584, 3584, 3584, 128] + - [130, 3014.0] + - - [256, 1024, 1, 1280, 256, 256, 256, 1280] + - [156, 4685.0] + - - [64, 4288, 1, 128, 64, 64, 64, 128] + - [144, 3289.0] + - - [3584, 64, 1, 1280, 3584, 3584, 3584, 1280] + - [132, 4582.0] + - - [1408, 128, 1, 3328, 1408, 1408, 1408, 3328] + - [193, 4428.0] + - - [64, 2944, 1, 3328, 64, 64, 64, 3328] + - [144, 4760.0] + - - [64, 1856, 1, 256, 64, 64, 64, 256] + - [144, 2941.0] + - - [128, 1500, 1, 1280, 128, 128, 128, 1280] + - [144, 4459.0] + - - [35, 8457, 1, 4096, 35, 35, 35, 4096] + - [170, 2815.0] + - - [256, 704, 1, 256, 256, 256, 256, 256] + - [130, 3910.0] + - - [2368, 64, 1, 128, 2368, 2368, 2368, 128] + - [146, 3674.0] + - - [256, 1024, 1, 128, 256, 256, 256, 128] + - [130, 4132.0] + - - [64, 1408, 1, 128, 64, 64, 64, 128] + - [170, 2413.0] + - - [704, 256, 1, 3328, 704, 704, 704, 3328] + - [130, 4465.0] + - - [35, 8457, 1, 2048, 35, 35, 35, 2048] + - [144, 2856.0] + - - [64, 2944, 1, 256, 64, 64, 64, 256] + - [193, 4116.0] + - - [448, 256, 1, 128, 448, 448, 448, 128] + - [170, 2996.0] + - - [64, 1408, 1, 1280, 64, 64, 64, 1280] + - [193, 3657.0] + - - [1408, 128, 1, 256, 1408, 1408, 1408, 256] + - [185, 3490.0] + - - [64, 2944, 1, 1280, 64, 64, 64, 1280] + - [193, 4714.0] + - - [128, 704, 1, 128, 128, 128, 128, 128] + - [129, 2168.0] + - - [256, 448, 1, 1280, 256, 256, 256, 1280] + - [156, 4330.0] + - - [704, 256, 1, 1280, 704, 704, 704, 1280] + - [156, 4323.0] + - - [64, 2368, 1, 3328, 64, 64, 64, 3328] + - [170, 4727.0] + - - [1856, 64, 1, 128, 1856, 1856, 1856, 128] + - [158, 2686.0] + - - [4096, 64, 1, 4096, 4096, 4096, 4096, 4096] + - [144, 4739.0] + - - [704, 128, 1, 128, 704, 704, 704, 128] + - [155, 1716.0] + - - [256, 704, 1, 3328, 256, 256, 256, 3328] + - [130, 4514.0] + - - [256, 448, 1, 128, 256, 256, 256, 128] + - [156, 1957.0] + - - [64, 3584, 1, 128, 64, 64, 64, 128] + - [195, 3027.0] + - - [1024, 128, 1, 256, 1024, 1024, 1024, 256] + - [132, 3510.0] + - - [2944, 64, 1, 1280, 2944, 2944, 2944, 1280] + - [181, 4368.0] + - - [128, 1408, 1, 3328, 128, 128, 128, 3328] + - [193, 4528.0] + - - [1408, 64, 1, 256, 1408, 1408, 1408, 256] + - [156, 2266.0] + - - [64, 1856, 1, 128, 64, 64, 64, 128] + - [154, 2577.0] + - - [64, 2368, 1, 256, 64, 64, 64, 256] + - [144, 4033.0] + - - [1856, 128, 1, 128, 1856, 1856, 1856, 128] + - [132, 4200.0] + - - [2368, 64, 1, 1280, 2368, 2368, 2368, 1280] + - [170, 4447.0] + - - [4288, 64, 1, 256, 4288, 4288, 4288, 256] + - [181, 3831.0] + - - [64, 4288, 1, 1280, 64, 64, 64, 1280] + - [144, 4876.0] + - - [1408, 64, 1, 3328, 1408, 1408, 1408, 3328] + - [181, 3670.0] + - - [1024, 256, 1, 128, 1024, 1024, 1024, 128] + - [130, 3171.0] + - - [256, 704, 1, 128, 256, 256, 256, 128] + - [130, 2580.0] + - - [448, 448, 1, 1280, 448, 448, 448, 1280] + - [130, 4825.0] + - - [1024, 256, 1, 1280, 1024, 1024, 1024, 1280] + - [193, 4721.0] + - - [128, 1024, 1, 256, 128, 128, 128, 256] + - [146, 3517.0] + - - [3584, 64, 1, 3328, 3584, 3584, 3584, 3328] + - [172, 4831.0] + - - [256, 1024, 1, 3328, 256, 256, 256, 3328] + - [144, 4849.0] + - - [1856, 64, 1, 3328, 1856, 1856, 1856, 3328] + - [181, 4785.0] + - - [448, 256, 1, 256, 448, 448, 448, 256] + - [193, 2878.0] + - - [4608, 32, 1, 1536, 4608, 4608, 4608, 1536] + - [144, 4396.0] + - - [128, 704, 1, 256, 128, 128, 128, 256] + - [168, 2980.0] + - - [3584, 64, 1, 256, 3584, 3584, 3584, 256] + - [144, 4415.0] + - - [64, 1856, 1, 3328, 64, 64, 64, 3328] + - [144, 4937.0] + - - [128, 704, 1, 3328, 128, 128, 128, 3328] + - [170, 3818.0] + - - [128, 1856, 1, 256, 128, 128, 128, 256] + - [146, 4580.0] + - - [64, 4288, 1, 256, 64, 64, 64, 256] + - [193, 3835.0] + - - [1856, 64, 1, 256, 1856, 1856, 1856, 256] + - [170, 3792.0] + - - [2560, 32, 1, 2560, 2560, 2560, 2560, 2560] + - [175, 3788.0] + - - [256, 704, 1, 1280, 256, 256, 256, 1280] + - [181, 4318.0] + - - [64, 2368, 1, 128, 64, 64, 64, 128] + - [146, 3380.0] + - - [176, 1500, 1, 1408, 176, 176, 176, 1408] + - [156, 4232.0] + - - [1856, 128, 1, 256, 1856, 1856, 1856, 256] + - [195, 3975.0] + - - [2048, 64, 1, 2048, 2048, 2048, 2048, 2048] + - [132, 4004.0] + - - [64, 1408, 1, 256, 64, 64, 64, 256] + - [174, 2298.0] + - - [128, 1408, 1, 1280, 128, 128, 128, 1280] + - [170, 4323.0] + - - [128, 1856, 1, 3328, 128, 128, 128, 3328] + - [146, 4989.0] + - - [1760, 64, 1, 1760, 1760, 1760, 1760, 1760] + - [130, 4439.0] + - - [448, 448, 1, 128, 448, 448, 448, 128] + - [130, 2946.0] + - - [704, 256, 1, 256, 704, 704, 704, 256] + - [156, 3353.0] + - - [1024, 256, 1, 1024, 1024, 1024, 1024, 1024] + - [144, 4586.0] + - - [512, 200, 1, 512, 512, 512, 512, 512] + - [170, 3189.0] + - - [1024, 200, 1, 1024, 1024, 1024, 1024, 1024] + - [130, 4039.0] + - - [512, 256, 1, 1024, 512, 512, 512, 1024] + - [156, 3839.0] + - - [1024, 256, 1, 2048, 1024, 1024, 1024, 2048] + - [156, 4661.0] + - - [1024, 200, 1, 4096, 1024, 1024, 1024, 4096] + - [193, 4196.0] + - - [1024, 200, 1, 512, 1024, 1024, 1024, 512] + - [156, 4188.0] + - - [512, 200, 1, 1024, 512, 512, 512, 1024] + - [130, 4024.0] + - - [512, 256, 1, 512, 512, 512, 512, 512] + - [156, 3434.0] + - - [1024, 256, 1, 4096, 1024, 1024, 1024, 4096] + - [156, 4714.0] + - - [1024, 200, 1, 2048, 1024, 1024, 1024, 2048] + - [130, 4215.0] + - - [1024, 256, 1, 512, 1024, 1024, 1024, 512] + - [156, 4369.0] + - - [512, 200, 1, 2048, 512, 512, 512, 2048] + - [130, 4033.0] + - - [64, 32, 1984, 32, 64, 64, 64, 32] + - [170, 4058.0] + - - [64, 38, 1680, 38, 64, 64, 64, 38] + - [172, 3203.0] + - - [64, 59, 1088, 59, 64, 64, 64, 59] + - [181, 4927.0] + - - [64, 54, 1184, 54, 64, 64, 64, 54] + - [158, 4564.0] + - - [64, 49, 1296, 49, 64, 64, 64, 49] + - [130, 4041.0] + - - [64, 45, 1424, 45, 64, 64, 64, 45] + - [144, 3785.0] + - - [64, 35, 1808, 35, 64, 64, 64, 35] + - [156, 2889.0] + - - [64, 41, 1552, 41, 64, 64, 64, 41] + - [154, 3589.0] + - - [512, 512, 1, 1024, 512, 512, 512, 1024] + - [156, 4852.0] + - - [512, 512, 1, 2000, 512, 512, 512, 2000] + - [130, 4690.0] + - - [100, 1024, 1, 2048, 100, 100, 100, 2048] + - [193, 3156.0] + - - [100, 2000, 1, 1024, 100, 100, 100, 1024] + - [193, 3509.0] + - - [128, 2000, 1, 100, 128, 128, 128, 100] + - [181, 2753.0] + - - [64, 2000, 1, 1024, 64, 64, 64, 1024] + - [172, 3678.0] + - - [100, 1024, 1, 1024, 100, 100, 100, 1024] + - [191, 2979.0] + - - [128, 1024, 1, 512, 128, 128, 128, 512] + - [170, 3441.0] + - - [512, 500, 1, 2000, 512, 512, 512, 2000] + - [130, 4663.0] + - - [500, 512, 1, 100, 500, 500, 500, 100] + - [181, 2729.0] + - - [100, 1024, 1, 500, 100, 100, 100, 500] + - [193, 2991.0] + - - [128, 2000, 1, 512, 128, 128, 128, 512] + - [144, 4446.0] + - - [256, 1024, 1, 100, 256, 256, 256, 100] + - [130, 3799.0] + - - [200, 500, 1, 1024, 200, 200, 200, 1024] + - [191, 3214.0] + - - [100, 2000, 1, 512, 100, 100, 100, 512] + - [193, 3259.0] + - - [200, 512, 1, 100, 200, 200, 200, 100] + - [172, 1442.0] + - - [64, 2048, 1, 10, 64, 64, 64, 10] + - [129, 301.0] + - - [64, 2048, 1, 500, 64, 64, 64, 500] + - [132, 3901.0] + - - [512, 512, 1, 512, 512, 512, 512, 512] + - [144, 4361.0] + - - [500, 500, 1, 2000, 500, 500, 500, 2000] + - [144, 4463.0] + - - [256, 500, 1, 10, 256, 256, 256, 10] + - [174, 294.0] + - - [512, 500, 1, 512, 512, 512, 512, 512] + - [181, 4225.0] + - - [128, 1024, 1, 2000, 128, 128, 128, 2000] + - [158, 4099.0] + - - [100, 2000, 1, 2048, 100, 100, 100, 2048] + - [144, 3621.0] + - - [256, 512, 1, 10, 256, 256, 256, 10] + - [174, 293.0] + - - [64, 2000, 1, 2048, 64, 64, 64, 2048] + - [191, 3985.0] + - - [64, 2048, 1, 512, 64, 64, 64, 512] + - [146, 3389.0] + - - [64, 2000, 1, 10, 64, 64, 64, 10] + - [165, 424.0] + - - [128, 1024, 1, 500, 128, 128, 128, 500] + - [132, 3896.0] + - - [200, 512, 1, 1024, 200, 200, 200, 1024] + - [191, 3236.0] + - - [128, 2048, 1, 10, 128, 128, 128, 10] + - [181, 1016.0] + - - [64, 2048, 1, 100, 64, 64, 64, 100] + - [181, 2708.0] + - - [64, 2000, 1, 100, 64, 64, 64, 100] + - [182, 2602.0] + - - [200, 500, 1, 100, 200, 200, 200, 100] + - [154, 2058.0] + - - [500, 500, 1, 500, 500, 500, 500, 500] + - [181, 4467.0] + - - [128, 2048, 1, 512, 128, 128, 128, 512] + - [193, 4673.0] + - - [100, 2048, 1, 500, 100, 100, 100, 500] + - [181, 3657.0] + - - [500, 500, 1, 2048, 500, 500, 500, 2048] + - [144, 4574.0] + - - [128, 2000, 1, 2000, 128, 128, 128, 2000] + - [181, 4644.0] + - - [256, 500, 1, 1024, 256, 256, 256, 1024] + - [195, 3719.0] + - - [64, 2048, 1, 2000, 64, 64, 64, 2000] + - [182, 4101.0] + - - [100, 2048, 1, 1024, 100, 100, 100, 1024] + - [193, 3601.0] + - - [128, 1024, 1, 100, 128, 128, 128, 100] + - [182, 1820.0] + - - [256, 1024, 1, 2048, 256, 256, 256, 2048] + - [144, 4771.0] + - - [500, 512, 1, 512, 500, 500, 500, 512] + - [130, 4225.0] + - - [256, 500, 1, 2000, 256, 256, 256, 2000] + - [158, 4003.0] + - - [256, 512, 1, 100, 256, 256, 256, 100] + - [195, 1791.0] + - - [128, 2000, 1, 500, 128, 128, 128, 500] + - [181, 4200.0] + - - [200, 512, 1, 2048, 200, 200, 200, 2048] + - [142, 3219.0] + - - [64, 2048, 1, 2048, 64, 64, 64, 2048] + - [142, 4107.0] + - - [200, 1024, 1, 2048, 200, 200, 200, 2048] + - [144, 3722.0] + - - [512, 512, 1, 10, 512, 512, 512, 10] + - [181, 520.0] + - - [512, 500, 1, 10, 512, 512, 512, 10] + - [181, 540.0] + - - [200, 512, 1, 10, 200, 200, 200, 10] + - [129, 236.0] + - - [500, 500, 1, 1024, 500, 500, 500, 1024] + - [156, 4372.0] + - - [256, 1024, 1, 512, 256, 256, 256, 512] + - [193, 4332.0] + - - [256, 500, 1, 512, 256, 256, 256, 512] + - [156, 3327.0] + - - [200, 500, 1, 2048, 200, 200, 200, 2048] + - [168, 3140.0] + - - [100, 2000, 1, 10, 100, 100, 100, 10] + - [158, 402.0] + - - [100, 2048, 1, 2048, 100, 100, 100, 2048] + - [144, 3718.0] + - - [128, 1024, 1, 2048, 128, 128, 128, 2048] + - [170, 4068.0] + - - [100, 2000, 1, 500, 100, 100, 100, 500] + - [193, 3279.0] + - - [100, 2048, 1, 100, 100, 100, 100, 100] + - [156, 2188.0] + - - [100, 1024, 1, 10, 100, 100, 100, 10] + - [127, 449.0] + - - [100, 1024, 1, 2000, 100, 100, 100, 2000] + - [128, 3211.0] + - - [256, 512, 1, 500, 256, 256, 256, 500] + - [144, 3392.0] + - - [100, 2000, 1, 100, 100, 100, 100, 100] + - [181, 2128.0] + - - [128, 1024, 1, 10, 128, 128, 128, 10] + - [127, 493.0] + - - [100, 2048, 1, 10, 100, 100, 100, 10] + - [153, 402.0] + - - [512, 500, 1, 100, 512, 512, 512, 100] + - [170, 2747.0] + - - [128, 2000, 1, 1024, 128, 128, 128, 1024] + - [193, 4503.0] + - - [200, 1024, 1, 500, 200, 200, 200, 500] + - [156, 3371.0] + - - [256, 512, 1, 2000, 256, 256, 256, 2000] + - [179, 4123.0] + - - [256, 1024, 1, 2000, 256, 256, 256, 2000] + - [156, 4772.0] + - - [200, 512, 1, 500, 200, 200, 200, 500] + - [158, 3062.0] + - - [64, 2000, 1, 512, 64, 64, 64, 512] + - [172, 3257.0] + - - [200, 1024, 1, 100, 200, 200, 200, 100] + - [130, 2231.0] + - - [200, 1024, 1, 1024, 200, 200, 200, 1024] + - [193, 3608.0] + - - [500, 512, 1, 2000, 500, 500, 500, 2000] + - [144, 4586.0] + - - [200, 500, 1, 512, 200, 200, 200, 512] + - [128, 2610.0] + - - [256, 512, 1, 512, 256, 256, 256, 512] + - [146, 3434.0] + - - [512, 512, 1, 500, 512, 512, 512, 500] + - [130, 4323.0] + - - [100, 1024, 1, 512, 100, 100, 100, 512] + - [170, 2664.0] + - - [128, 1024, 1, 1024, 128, 128, 128, 1024] + - [146, 3833.0] + - - [200, 512, 1, 2000, 200, 200, 200, 2000] + - [142, 3368.0] + - - [256, 1024, 1, 500, 256, 256, 256, 500] + - [181, 4332.0] + - - [200, 1024, 1, 512, 200, 200, 200, 512] + - [156, 3387.0] + - - [256, 500, 1, 500, 256, 256, 256, 500] + - [158, 3309.0] + - - [256, 500, 1, 2048, 256, 256, 256, 2048] + - [191, 4029.0] + - - [512, 500, 1, 1024, 512, 512, 512, 1024] + - [156, 4510.0] + - - [256, 512, 1, 1024, 256, 256, 256, 1024] + - [172, 3839.0] + - - [128, 2048, 1, 1024, 128, 128, 128, 1024] + - [170, 4787.0] + - - [500, 512, 1, 500, 500, 500, 500, 500] + - [181, 4205.0] + - - [200, 500, 1, 500, 200, 200, 200, 500] + - [179, 2607.0] + - - [64, 2000, 1, 2000, 64, 64, 64, 2000] + - [128, 4029.0] + - - [128, 2000, 1, 2048, 128, 128, 128, 2048] + - [144, 4635.0] + - - [256, 1024, 1, 10, 256, 256, 256, 10] + - [198, 522.0] + - - [256, 1024, 1, 1024, 256, 256, 256, 1024] + - [193, 4655.0] + - - [500, 500, 1, 10, 500, 500, 500, 10] + - [184, 812.0] + - - [256, 500, 1, 100, 256, 256, 256, 100] + - [156, 1749.0] + - - [256, 512, 1, 2048, 256, 256, 256, 2048] + - [181, 4153.0] + - - [200, 1024, 1, 2000, 200, 200, 200, 2000] + - [130, 3830.0] + - - [100, 2048, 1, 512, 100, 100, 100, 512] + - [193, 3352.0] + - - [512, 500, 1, 2048, 512, 512, 512, 2048] + - [130, 4610.0] + - - [128, 2048, 1, 2000, 128, 128, 128, 2000] + - [181, 4779.0] + - - [500, 512, 1, 2048, 500, 500, 500, 2048] + - [130, 4749.0] + - - [200, 500, 1, 2000, 200, 200, 200, 2000] + - [179, 3160.0] + - - [500, 512, 1, 1024, 500, 500, 500, 1024] + - [130, 4539.0] + - - [100, 1024, 1, 100, 100, 100, 100, 100] + - [153, 1376.0] + - - [64, 2000, 1, 500, 64, 64, 64, 500] + - [195, 3340.0] + - - [128, 2048, 1, 2048, 128, 128, 128, 2048] + - [170, 4908.0] + - - [128, 2000, 1, 10, 128, 128, 128, 10] + - [184, 908.0] + - - [500, 512, 1, 10, 500, 500, 500, 10] + - [129, 901.0] + - - [200, 512, 1, 512, 200, 200, 200, 512] + - [146, 3013.0] + - - [512, 500, 1, 500, 512, 512, 512, 500] + - [179, 4919.0] + - - [512, 512, 1, 100, 512, 512, 512, 100] + - [170, 2819.0] + - - [500, 500, 1, 512, 500, 500, 500, 512] + - [156, 4150.0] + - - [128, 2048, 1, 500, 128, 128, 128, 500] + - [154, 4858.0] + - - [200, 500, 1, 10, 200, 200, 200, 10] + - [145, 224.0] + - - [100, 2048, 1, 2000, 100, 100, 100, 2000] + - [181, 3746.0] + - - [200, 1024, 1, 10, 200, 200, 200, 10] + - [129, 427.0] + - - [64, 2048, 1, 1024, 64, 64, 64, 1024] + - [193, 3837.0] + - - [100, 2000, 1, 2000, 100, 100, 100, 2000] + - [181, 3664.0] + - - [500, 500, 1, 100, 500, 500, 500, 100] + - [181, 2654.0] + - - [128, 2048, 1, 100, 128, 128, 128, 100] + - [130, 2856.0] + - - [4096, 64, 1, 2048, 4096, 4096, 4096, 2048] + - [144, 4771.0] + - - [4096, 91, 1, 2048, 4096, 4096, 4096, 2048] + - [181, 4798.0] + - - [4096, 86, 1, 3072, 4096, 4096, 4096, 3072] + - [130, 4634.0] + - - [4096, 49, 1, 2048, 4096, 4096, 4096, 2048] + - [170, 3555.0] + - - [4096, 91, 1, 3072, 4096, 4096, 4096, 3072] + - [130, 4856.0] + - - [4096, 64, 1, 3072, 4096, 4096, 4096, 3072] + - [156, 4699.0] + - - [4096, 63, 1, 3072, 4096, 4096, 4096, 3072] + - [156, 4623.0] + - - [4096, 96, 1, 2048, 4096, 4096, 4096, 2048] + - [181, 5078.0] + - - [4096, 32, 1, 2048, 4096, 4096, 4096, 2048] + - [181, 4022.0] + - - [4096, 49, 1, 3072, 4096, 4096, 4096, 3072] + - [181, 3635.0] + - - [1024, 96, 1, 1024, 1024, 1024, 1024, 1024] + - [130, 3631.0] + - - [4096, 86, 1, 2048, 4096, 4096, 4096, 2048] + - [181, 4542.0] + - - [4096, 96, 1, 3072, 4096, 4096, 4096, 3072] + - [156, 5118.0] + - - [4096, 35, 1, 3072, 4096, 4096, 4096, 3072] + - [144, 2581.0] + - - [4096, 50, 1, 2048, 4096, 4096, 4096, 2048] + - [181, 3635.0] + - - [36548, 32, 1, 1024, 36548, 36548, 36548, 1024] + - [156, 4978.0] + - - [4096, 32, 1, 3072, 4096, 4096, 4096, 3072] + - [128, 4284.0] + - - [1024, 243, 1, 1024, 1024, 1024, 1024, 1024] + - [130, 4593.0] + - - [4096, 50, 1, 3072, 4096, 4096, 4096, 3072] + - [130, 3672.0] + - - [1024, 128, 1, 1024, 1024, 1024, 1024, 1024] + - [132, 3888.0] + - - [1024, 216, 1, 1024, 1024, 1024, 1024, 1024] + - [130, 4376.0] + - - [4096, 35, 1, 2048, 4096, 4096, 4096, 2048] + - [156, 2558.0] + - - [4096, 63, 1, 2048, 4096, 4096, 4096, 2048] + - [181, 4568.0] + - - [289, 256, 1, 1568, 289, 289, 289, 1568] + - [144, 3242.0] + - - [3025, 64, 1, 363, 3025, 3025, 3025, 363] + - [130, 3774.0] + - - [784, 32, 32, 192, 784, 784, 784, 192] + - [156, 4571.0] + - - [289, 256, 1, 2016, 289, 289, 289, 2016] + - [161, 3323.0] + - - [21609, 32, 1, 288, 21609, 21609, 21609, 288] + - [193, 5141.0] + - - [1225, 192, 1, 1728, 1225, 1225, 1225, 1728] + - [158, 4789.0] + - - [784, 96, 1, 800, 784, 784, 784, 800] + - [144, 2909.0] + - - [1225, 64, 1, 1200, 1225, 1225, 1225, 1200] + - [135, 3273.0] + - - [729, 192, 1, 1600, 729, 729, 729, 1600] + - [132, 4350.0] + - - [6272, 32, 1, 528, 6272, 6272, 6272, 528] + - [144, 4156.0] + - - [1568, 160, 1, 832, 1568, 1568, 1568, 832] + - [130, 4291.0] + - - [289, 256, 1, 1792, 289, 289, 289, 1792] + - [135, 3277.0] + - - [784, 32, 32, 256, 784, 784, 784, 256] + - [130, 4699.0] + - - [6272, 32, 1, 512, 6272, 6272, 6272, 512] + - [156, 4137.0] + - - [289, 384, 1, 3456, 289, 289, 289, 3456] + - [181, 4598.0] + - - [289, 384, 1, 2592, 289, 289, 289, 2592] + - [181, 4516.0] + - - [1225, 32, 32, 192, 1225, 1225, 1225, 192] + - [193, 4917.0] + - - [1568, 128, 1, 832, 1568, 1568, 1568, 832] + - [130, 4494.0] + - - [1225, 48, 32, 288, 1225, 1225, 1225, 288] + - [182, 4077.0] + - - [1001, 128, 1, 2048, 1001, 1001, 1001, 2048] + - [132, 4198.0] + - - [2048, 174, 1, 512, 2048, 2048, 2048, 512] + - [158, 4313.0] + - - [2048, 189, 1, 512, 2048, 2048, 2048, 512] + - [158, 4683.0] + - - [64, 35, 904, 35, 64, 64, 64, 35] + - [172, 2421.0] + - - [64, 103, 16, 103, 64, 64, 64, 103] + - [127, 1676.0] + - - [64, 104, 16, 103, 64, 64, 64, 103] + - [127, 1693.0] + - - [64, 123, 16, 112, 64, 64, 64, 112] + - [130, 1856.0] + - - [64, 123, 16, 123, 64, 64, 64, 123] + - [132, 1956.0] + - - [512, 540, 1, 512, 512, 512, 512, 512] + - [144, 4514.0] + - - [512, 540, 1, 2048, 512, 512, 512, 2048] + - [130, 5016.0] + - - [512, 550, 1, 512, 512, 512, 512, 512] + - [181, 4253.0] + - - [512, 550, 1, 2048, 512, 512, 512, 2048] + - [156, 4534.0] + - - [512, 560, 1, 512, 512, 512, 512, 512] + - [156, 4287.0] + - - [512, 560, 1, 2048, 512, 512, 512, 2048] + - [156, 4609.0] + - - [2048, 160, 1, 512, 2048, 2048, 2048, 512] + - [130, 4796.0] + - - [2048, 184, 1, 512, 2048, 2048, 2048, 512] + - [193, 4809.0] + - - [512, 160, 1, 2048, 512, 512, 512, 2048] + - [161, 3725.0] + - - [512, 174, 1, 2048, 512, 512, 512, 2048] + - [130, 3562.0] + - - [512, 182, 1, 512, 512, 512, 512, 512] + - [193, 3001.0] + - - [512, 184, 1, 512, 512, 512, 512, 512] + - [193, 2959.0] + - - [512, 184, 1, 2048, 512, 512, 512, 2048] + - [142, 3976.0] + - - [512, 189, 1, 512, 512, 512, 512, 512] + - [191, 3140.0] + - - [512, 189, 1, 2048, 512, 512, 512, 2048] + - [181, 3848.0] + - - [512, 198, 1, 2048, 512, 512, 512, 2048] + - [130, 4002.0] + - - [512, 206, 1, 512, 512, 512, 512, 512] + - [193, 3371.0] + - - [512, 207, 1, 2048, 512, 512, 512, 2048] + - [154, 4211.0] + - - [512, 208, 1, 512, 512, 512, 512, 512] + - [170, 3321.0] + - - [512, 208, 1, 2048, 512, 512, 512, 2048] + - [130, 4180.0] + - - [512, 224, 1, 512, 512, 512, 512, 512] + - [130, 3629.0] + - - [512, 245, 1, 2048, 512, 512, 512, 2048] + - [181, 3997.0] + - - [512, 246, 1, 512, 512, 512, 512, 512] + - [172, 3784.0] + - - [512, 246, 1, 2048, 512, 512, 512, 2048] + - [182, 3961.0] + - - [512, 264, 1, 512, 512, 512, 512, 512] + - [195, 3517.0] + - - [512, 264, 1, 2048, 512, 512, 512, 2048] + - [130, 4145.0] + - - [512, 401, 1, 2048, 512, 512, 512, 2048] + - [181, 4346.0] + - - [512, 439, 1, 2048, 512, 512, 512, 2048] + - [132, 4658.0] + - - [512, 443, 1, 2048, 512, 512, 512, 2048] + - [132, 4684.0] + - - [512, 446, 1, 2048, 512, 512, 512, 2048] + - [132, 4691.0] + - - [512, 455, 1, 512, 512, 512, 512, 512] + - [170, 4700.0] + - - [512, 465, 1, 512, 512, 512, 512, 512] + - [170, 4762.0] + - - [512, 465, 1, 2048, 512, 512, 512, 2048] + - [130, 4906.0] + - - [512, 468, 1, 512, 512, 512, 512, 512] + - [181, 4497.0] + - - [512, 468, 1, 2048, 512, 512, 512, 2048] + - [130, 4901.0] + - - [512, 476, 1, 512, 512, 512, 512, 512] + - [156, 4537.0] + - - [512, 493, 1, 512, 512, 512, 512, 512] + - [156, 4251.0] + - - [512, 493, 1, 2048, 512, 512, 512, 2048] + - [144, 4562.0] + - - [512, 495, 1, 2048, 512, 512, 512, 2048] + - [179, 5392.0] + - - [512, 511, 1, 2048, 512, 512, 512, 2048] + - [130, 4708.0] + - - [512, 512, 1, 2048, 512, 512, 512, 2048] + - [130, 4713.0] + - - [64, 59, 512, 59, 64, 64, 64, 59] + - [172, 3626.0] + - - [64, 59, 544, 59, 64, 64, 64, 59] + - [172, 3771.0] + - - [256, 1024, 1, 1, 256, 256, 256, 1] + - [131, 56.0] + - - [257, 1024, 1, 4096, 257, 257, 257, 4096] + - [170, 4344.0] + - - [512, 215, 1, 2048, 512, 512, 512, 2048] + - [130, 4322.0] + - - [512, 256, 1, 2048, 512, 512, 512, 2048] + - [130, 4197.0] + - - [560, 200, 1, 1024, 560, 560, 560, 1024] + - [130, 3305.0] + - - [768, 215, 1, 2048, 768, 768, 768, 2048] + - [193, 4074.0] + - - [768, 256, 1, 2048, 768, 768, 768, 2048] + - [130, 4653.0] + - - [32, 33, 1600, 33, 32, 32, 32, 33] + - [168, 1987.0] + - - [512, 512, 1, 64, 512, 512, 512, 64] + - [156, 2261.0] + - - [1225, 32, 64, 192, 1225, 1225, 1225, 192] + - [181, 5358.0] + - - [1225, 48, 64, 192, 1225, 1225, 1225, 192] + - [154, 4223.0] + - - [1225, 48, 64, 256, 1225, 1225, 1225, 256] + - [172, 4133.0] + - - [1225, 48, 64, 288, 1225, 1225, 1225, 288] + - [132, 4142.0] + - - [49, 2048, 64, 512, 49, 49, 49, 512] + - [195, 4414.0] + - - [49, 512, 64, 2048, 49, 49, 49, 2048] + - [193, 4280.0] + - - [1225, 48, 32, 192, 1225, 1225, 1225, 192] + - [132, 4049.0] + - - [1225, 48, 32, 256, 1225, 1225, 1225, 256] + - [128, 4908.0] + - - [49, 2048, 32, 512, 49, 49, 49, 512] + - [170, 4349.0] + - - [49, 512, 32, 2048, 49, 49, 49, 2048] + - [170, 4159.0] + - - [384, 384, 1, 384, 384, 384, 384, 384] + - [172, 4139.0] + - - [100, 128, 18, 512, 100, 100, 100, 512] + - [130, 3682.0] + - - [100, 128, 19, 512, 100, 100, 100, 512] + - [172, 3796.0] + - - [1444, 128, 1, 576, 1444, 1444, 1444, 576] + - [130, 4082.0] + - - [361, 512, 1, 2304, 361, 361, 361, 2304] + - [156, 4540.0] + - - [480, 512, 1, 512, 480, 480, 480, 512] + - [144, 4107.0] + - - [512, 480, 1, 512, 512, 512, 512, 512] + - [156, 5041.0] + - - [1024, 308, 1, 1024, 1024, 1024, 1024, 1024] + - [130, 4917.0] + - - [1024, 180, 1, 1024, 1024, 1024, 1024, 1024] + - [130, 4255.0] + - - [64, 32, 4608, 32, 64, 64, 64, 32] + - [161, 4043.0] + - - [64, 34, 4736, 34, 64, 64, 64, 34] + - [195, 2865.0] + - - [64, 35, 4608, 32, 64, 64, 64, 32] + - [132, 3082.0] + - - [64, 35, 4608, 35, 64, 64, 64, 35] + - [182, 2971.0] + - - [256, 864, 1, 128, 256, 256, 256, 128] + - [181, 3976.0] + - - [49, 2048, 64, 1024, 49, 49, 49, 1024] + - [193, 4424.0] + - - [49, 1024, 64, 2048, 49, 49, 49, 2048] + - [170, 4383.0] + - - [49, 2048, 32, 1024, 49, 49, 49, 1024] + - [144, 4379.0] + - - [49, 1024, 32, 2048, 49, 49, 49, 2048] + - [170, 4295.0] + - - [3136, 64, 1, 576, 3136, 3136, 3136, 576] + - [156, 4613.0] + - - [784, 128, 1, 1152, 784, 784, 784, 1152] + - [130, 4114.0] + - - [49, 2048, 128, 512, 49, 49, 49, 512] + - [195, 4453.0] + - - [49, 2048, 256, 512, 49, 49, 49, 512] + - [146, 4474.0] + - - [49, 512, 128, 2048, 49, 49, 49, 2048] + - [144, 4363.0] + - - [49, 512, 256, 2048, 49, 49, 49, 2048] + - [144, 4409.0] + - - [1024, 128, 1, 2, 1024, 1024, 1024, 2] + - [131, 134.0] + - - [1024, 96, 1, 2, 1024, 1024, 1024, 2] + - [131, 114.0] + - - [1909283, 40, 1, 40, 1909283, 1909283, 1909283, 40] + - [146, 3014.0] + - - [3818566, 40, 1, 40, 3818566, 3818566, 3818566, 40] + - [158, 2871.0] + - - [2560, 35, 1, 29000, 2560, 2560, 2560, 29000] + - [130, 2925.0] + - - [2560, 36, 1, 29000, 2560, 2560, 2560, 29000] + - [130, 3024.0] + - - [2560, 39, 1, 29000, 2560, 2560, 2560, 29000] + - [130, 3275.0] + - - [2560, 40, 1, 29000, 2560, 2560, 2560, 29000] + - [130, 3342.0] + - - [2560, 42, 1, 29000, 2560, 2560, 2560, 29000] + - [130, 3518.0] + - - [2560, 43, 1, 29000, 2560, 2560, 2560, 29000] + - [130, 3614.0] + - - [2560, 44, 1, 29000, 2560, 2560, 2560, 29000] + - [130, 3682.0] + - - [2560, 46, 1, 29000, 2560, 2560, 2560, 29000] + - [130, 3850.0] + - - [2560, 48, 1, 29000, 2560, 2560, 2560, 29000] + - [130, 4025.0] + - - [2560, 49, 1, 29000, 2560, 2560, 2560, 29000] + - [130, 4103.0] + - - [2560, 50, 1, 29000, 2560, 2560, 2560, 29000] + - [130, 4179.0] + - - [2560, 51, 1, 29000, 2560, 2560, 2560, 29000] + - [130, 4262.0] + - - [2560, 53, 1, 29000, 2560, 2560, 2560, 29000] + - [130, 4418.0] + - - [2560, 54, 1, 29000, 2560, 2560, 2560, 29000] + - [130, 4496.0] + - - [2560, 55, 1, 29000, 2560, 2560, 2560, 29000] + - [130, 4592.0] + - - [2560, 56, 1, 29000, 2560, 2560, 2560, 29000] + - [130, 4664.0] + - - [2560, 57, 1, 29000, 2560, 2560, 2560, 29000] + - [130, 4739.0] + - - [2560, 58, 1, 29000, 2560, 2560, 2560, 29000] + - [130, 4851.0] + - - [2560, 59, 1, 29000, 2560, 2560, 2560, 29000] + - [130, 4903.0] + - - [2560, 61, 1, 29000, 2560, 2560, 2560, 29000] + - [130, 5081.0] + - - [2560, 63, 1, 29000, 2560, 2560, 2560, 29000] + - [130, 5249.0] + - - [1760, 32, 1, 1760, 1760, 1760, 1760, 1760] + - [210, 3208.0] + - - [3584, 4, 1, 1280, 3584, 3584, 3584, 1280] + - [252, 1058.0] + - - [2560, 16, 1, 2560, 2560, 2560, 2560, 2560] + - [238, 2987.0] + - - [2944, 4, 1, 256, 2944, 2944, 2944, 256] + - [258, 395.0] + - - [5056, 4, 1, 3328, 5056, 5056, 5056, 3328] + - [214, 1742.0] + - - [1760, 16, 1, 1760, 1760, 1760, 1760, 1760] + - [148, 2228.0] + - - [2368, 4, 1, 1280, 2368, 2368, 2368, 1280] + - [251, 708.0] + - - [6784, 4, 1, 1280, 6784, 6784, 6784, 1280] + - [218, 1497.0] + - - [8448, 4, 1, 2816, 8448, 8448, 8448, 2816] + - [212, 1184.0] + - - [1856, 4, 1, 1280, 1856, 1856, 1856, 1280] + - [223, 640.0] + - - [4608, 1, 1, 1536, 4608, 4608, 4608, 1536] + - [214, 340.0] + - - [7680, 4, 1, 2560, 7680, 7680, 7680, 2560] + - [246, 948.0] + - - [8448, 16, 1, 2816, 8448, 8448, 8448, 2816] + - [225, 3488.0] + - - [3072, 2, 1, 1024, 3072, 3072, 3072, 1024] + - [232, 436.0] + - - [2368, 4, 1, 256, 2368, 2368, 2368, 256] + - [218, 475.0] + - - [7680, 1, 1, 2560, 7680, 7680, 7680, 2560] + - [245, 249.0] + - - [4608, 2, 1, 1536, 4608, 4608, 4608, 1536] + - [260, 590.0] + - - [4608, 4, 1, 1536, 4608, 4608, 4608, 1536] + - [250, 1251.0] + - - [3072, 1, 1, 128, 3072, 3072, 3072, 128] + - [221, 64.0] + - - [2048, 32, 1, 2048, 2048, 2048, 2048, 2048] + - [210, 3071.0] + - - [4288, 4, 1, 256, 4288, 4288, 4288, 256] + - [258, 546.0] + - - [3584, 4, 1, 3328, 3584, 3584, 3584, 3328] + - [257, 1225.0] + - - [5888, 4, 1, 1280, 5888, 5888, 5888, 1280] + - [207, 1299.0] + - - [2048, 16, 1, 2048, 2048, 2048, 2048, 2048] + - [229, 2139.0] + - - [5888, 4, 1, 128, 5888, 5888, 5888, 128] + - [243, 467.0] + - - [8448, 1, 1, 2816, 8448, 8448, 8448, 2816] + - [257, 280.0] + - - [1408, 4, 1, 256, 1408, 1408, 1408, 256] + - [148, 275.0] + - - [6144, 4, 1, 2560, 6144, 6144, 6144, 2560] + - [207, 1712.0] + - - [3072, 1, 1, 1024, 3072, 3072, 3072, 1024] + - [252, 194.0] + - - [5056, 4, 1, 1280, 5056, 5056, 5056, 1280] + - [214, 1269.0] + - - [3072, 16, 1, 1024, 3072, 3072, 3072, 1024] + - [210, 2427.0] + - - [1408, 4, 1, 3328, 1408, 1408, 1408, 3328] + - [223, 526.0] + - - [6144, 1, 1, 2560, 6144, 6144, 6144, 2560] + - [257, 423.0] + - - [6144, 16, 1, 2560, 6144, 6144, 6144, 2560] + - [244, 3622.0] + - - [4096, 16, 1, 4096, 4096, 4096, 4096, 4096] + - [259, 3027.0] + - - [1408, 4, 1, 128, 1408, 1408, 1408, 128] + - [248, 193.0] + - - [1856, 4, 1, 256, 1856, 1856, 1856, 256] + - [226, 359.0] + - - [6784, 4, 1, 128, 6784, 6784, 6784, 128] + - [206, 755.0] + - - [2944, 4, 1, 128, 2944, 2944, 2944, 128] + - [217, 250.0] + - - [5888, 4, 1, 3328, 5888, 5888, 5888, 3328] + - [250, 890.0] + - - [5056, 4, 1, 128, 5056, 5056, 5056, 128] + - [243, 406.0] + - - [3072, 4, 1, 1024, 3072, 3072, 3072, 1024] + - [213, 812.0] + - - [2944, 4, 1, 3328, 2944, 2944, 2944, 3328] + - [250, 1058.0] + - - [2368, 4, 1, 128, 2368, 2368, 2368, 128] + - [164, 277.0] + - - [1856, 4, 1, 128, 1856, 1856, 1856, 128] + - [248, 162.0] + - - [7680, 2, 1, 2560, 7680, 7680, 7680, 2560] + - [246, 475.0] + - - [7680, 16, 1, 2560, 7680, 7680, 7680, 2560] + - [244, 2102.0] + - - [4224, 1, 1, 128, 4224, 4224, 4224, 128] + - [234, 97.0] + - - [8448, 2, 1, 2816, 8448, 8448, 8448, 2816] + - [218, 574.0] + - - [1408, 4, 1, 1280, 1408, 1408, 1408, 1280] + - [223, 485.0] + - - [6784, 4, 1, 256, 6784, 6784, 6784, 256] + - [207, 1075.0] + - - [4288, 4, 1, 128, 4288, 4288, 4288, 128] + - [218, 352.0] + - - [1856, 4, 1, 3328, 1856, 1856, 1856, 3328] + - [251, 681.0] + - - [3584, 4, 1, 256, 3584, 3584, 3584, 256] + - [258, 461.0] + - - [2368, 4, 1, 3328, 2368, 2368, 2368, 3328] + - [215, 937.0] + - - [6784, 4, 1, 3328, 6784, 6784, 6784, 3328] + - [219, 609.0] + - - [4288, 4, 1, 1280, 4288, 4288, 4288, 1280] + - [250, 1026.0] + - - [3584, 4, 1, 128, 3584, 3584, 3584, 128] + - [247, 292.0] + - - [5056, 4, 1, 256, 5056, 5056, 5056, 256] + - [217, 616.0] + - - [4288, 4, 1, 3328, 4288, 4288, 4288, 3328] + - [233, 1500.0] + - - [4608, 16, 1, 1536, 4608, 4608, 4608, 1536] + - [240, 3307.0] + - - [6144, 2, 1, 2560, 6144, 6144, 6144, 2560] + - [245, 837.0] + - - [2944, 4, 1, 1280, 2944, 2944, 2944, 1280] + - [242, 793.0] + - - [5888, 4, 1, 256, 5888, 5888, 5888, 256] + - [256, 743.0] + - - [4096, 29, 1, 2048, 4096, 4096, 4096, 2048] + - [240, 3016.0] + - - [4096, 25, 1, 2048, 4096, 4096, 4096, 2048] + - [259, 2680.0] + - - [4096, 29, 1, 3072, 4096, 4096, 4096, 3072] + - [249, 3080.0] + - - [4096, 24, 1, 2048, 4096, 4096, 4096, 2048] + - [255, 4061.0] + - - [36548, 1, 1, 1024, 36548, 36548, 36548, 1024] + - [243, 204.0] + - - [4096, 27, 1, 2048, 4096, 4096, 4096, 2048] + - [230, 2897.0] + - - [4096, 1, 1, 2048, 4096, 4096, 4096, 2048] + - [253, 353.0] + - - [4096, 24, 1, 3072, 4096, 4096, 4096, 3072] + - [249, 3563.0] + - - [4096, 27, 1, 3072, 4096, 4096, 4096, 3072] + - [240, 2898.0] + - - [36548, 25, 1, 1024, 36548, 36548, 36548, 1024] + - [244, 3259.0] + - - [4096, 1, 1, 3072, 4096, 4096, 4096, 3072] + - [214, 351.0] + - - [4096, 25, 1, 3072, 4096, 4096, 4096, 3072] + - [230, 2720.0] + - - [36548, 24, 1, 1024, 36548, 36548, 36548, 1024] + - [249, 3249.0] + - - [6272, 16, 1, 480, 6272, 6272, 6272, 480] + - [225, 3354.0] + - - [1568, 32, 1, 832, 1568, 1568, 1568, 832] + - [211, 2787.0] + - - [1568, 48, 1, 832, 1568, 1568, 1568, 832] + - [205, 3282.0] + - - [6272, 24, 1, 512, 6272, 6272, 6272, 512] + - [212, 3416.0] + - - [2048, 1, 1, 512, 2048, 2048, 2048, 512] + - [209, 130.0] + - - [2048, 2, 1, 2, 2048, 2048, 2048, 2] + - [203, 5.0] + - - [2048, 2, 1, 2048, 2048, 2048, 2048, 2048] + - [251, 349.0] + - - [2560, 4, 1, 2, 2560, 2560, 2560, 2] + - [243, 13.0] + - - [2560, 4, 1, 2560, 2560, 2560, 2560, 2560] + - [231, 890.0] + - - [12288, 12, 2, 256, 12288, 12288, 12288, 256] + - [236, 2768.0] + - - [12288, 3, 2, 256, 12288, 12288, 12288, 256] + - [222, 866.0] + - - [51520, 12, 2, 256, 51520, 51520, 51520, 256] + - [236, 2972.0] + - - [51520, 3, 2, 256, 51520, 51520, 51520, 256] + - [235, 1001.0] + - - [15200, 12, 2, 256, 15200, 15200, 15200, 256] + - [236, 2976.0] + - - [15200, 3, 2, 256, 15200, 15200, 15200, 256] + - [245, 1286.0] + - - [3456, 3, 2, 256, 3456, 3456, 3456, 256] + - [218, 795.0] + - - [13600, 12, 2, 256, 13600, 13600, 13600, 256] + - [228, 2756.0] + - - [12880, 3, 2, 256, 12880, 12880, 12880, 256] + - [237, 981.0] + - - [3400, 3, 2, 256, 3400, 3400, 3400, 256] + - [208, 715.0] + - - [12880, 12, 2, 256, 12880, 12880, 12880, 256] + - [220, 2723.0] + - - [13824, 12, 2, 256, 13824, 13824, 13824, 256] + - [220, 2805.0] + - - [13824, 3, 2, 256, 13824, 13824, 13824, 256] + - [239, 1042.0] + - - [13600, 3, 2, 256, 13600, 13600, 13600, 256] + - [227, 959.0] + - - [3456, 12, 2, 256, 3456, 3456, 3456, 256] + - [203, 1746.0] + - - [3800, 3, 2, 256, 3800, 3800, 3800, 256] + - [207, 600.0] + - - [3400, 12, 2, 256, 3400, 3400, 3400, 256] + - [205, 1741.0] + - - [3800, 12, 2, 256, 3800, 3800, 3800, 256] + - [204, 2041.0] + - - [55296, 3, 2, 256, 55296, 55296, 55296, 256] + - [216, 739.0] + - - [3220, 3, 2, 256, 3220, 3220, 3220, 256] + - [218, 788.0] + - - [3072, 3, 2, 256, 3072, 3072, 3072, 256] + - [207, 814.0] + - - [3220, 12, 2, 256, 3220, 3220, 3220, 256] + - [204, 2069.0] + - - [3072, 12, 2, 256, 3072, 3072, 3072, 256] + - [244, 2307.0] + - - [54400, 3, 2, 256, 54400, 54400, 54400, 256] + - [255, 768.0] + - - [60800, 12, 2, 256, 60800, 60800, 60800, 256] + - [220, 2589.0] + - - [60800, 3, 2, 256, 60800, 60800, 60800, 256] + - [216, 612.0] + - - [1909283, 11, 1, 11, 1909283, 1909283, 1909283, 11] + - [254, 1147.0] + - - [3818566, 11, 1, 11, 3818566, 3818566, 3818566, 11] + - [254, 966.0] + - - [2048, 8, 1, 2, 2048, 2048, 2048, 2] + - [224, 21.0] + - - [2048, 8, 1, 2048, 2048, 2048, 2048, 2048] + - [215, 1498.0] + - - [2560, 2, 1, 2, 2560, 2560, 2560, 2] + - [134, 7.0] + - - [2560, 2, 1, 2560, 2560, 2560, 2560, 2560] + - [215, 483.0] + - - [2560, 27, 1, 29000, 2560, 2560, 2560, 29000] + - [241, 2934.0] + - - [4, 1856, 1, 3328, 4, 4, 4, 3328] + - [173, 488.0] + - - [35, 1500, 1, 2560, 35, 35, 35, 2560] + - [266, 1687.0] + - - [4, 2368, 1, 1280, 4, 4, 4, 1280] + - [264, 487.0] + - - [4, 3584, 1, 128, 4, 4, 4, 128] + - [275, 417.0] + - - [4, 1408, 1, 3328, 4, 4, 4, 3328] + - [159, 395.0] + - - [4, 6784, 1, 3328, 4, 4, 4, 3328] + - [147, 424.0] + - - [4, 4288, 1, 128, 4, 4, 4, 128] + - [264, 416.0] + - - [4, 6784, 1, 1280, 4, 4, 4, 1280] + - [264, 580.0] + - - [4, 5056, 1, 256, 4, 4, 4, 256] + - [275, 525.0] + - - [4, 2944, 1, 3328, 4, 4, 4, 3328] + - [167, 538.0] + - - [4, 5056, 1, 1280, 4, 4, 4, 1280] + - [275, 630.0] + - - [35, 1500, 1, 2048, 35, 35, 35, 2048] + - [265, 2031.0] + - - [4, 2368, 1, 3328, 4, 4, 4, 3328] + - [190, 548.0] + - - [4, 1856, 1, 256, 4, 4, 4, 256] + - [190, 246.0] + - - [4, 2944, 1, 256, 4, 4, 4, 256] + - [270, 320.0] + - - [4, 6784, 1, 128, 4, 4, 4, 128] + - [270, 499.0] + - - [4, 3584, 1, 1280, 4, 4, 4, 1280] + - [275, 547.0] + - - [4, 5888, 1, 256, 4, 4, 4, 256] + - [275, 428.0] + - - [4, 5888, 1, 3328, 4, 4, 4, 3328] + - [264, 472.0] + - - [4, 6784, 1, 256, 4, 4, 4, 256] + - [275, 416.0] + - - [4, 1408, 1, 1280, 4, 4, 4, 1280] + - [190, 364.0] + - - [4, 3584, 1, 256, 4, 4, 4, 256] + - [270, 368.0] + - - [4, 2944, 1, 1280, 4, 4, 4, 1280] + - [167, 499.0] + - - [4, 1408, 1, 256, 4, 4, 4, 256] + - [147, 192.0] + - - [4, 4288, 1, 3328, 4, 4, 4, 3328] + - [270, 617.0] + - - [4, 2368, 1, 128, 4, 4, 4, 128] + - [264, 201.0] + - - [4, 5888, 1, 1280, 4, 4, 4, 1280] + - [264, 586.0] + - - [4, 1856, 1, 1280, 4, 4, 4, 1280] + - [173, 419.0] + - - [4, 1856, 1, 128, 4, 4, 4, 128] + - [264, 166.0] + - - [4, 2944, 1, 128, 4, 4, 4, 128] + - [264, 238.0] + - - [4, 4288, 1, 1280, 4, 4, 4, 1280] + - [264, 560.0] + - - [4, 5056, 1, 3328, 4, 4, 4, 3328] + - [275, 657.0] + - - [4, 5056, 1, 128, 4, 4, 4, 128] + - [275, 353.0] + - - [4, 4288, 1, 256, 4, 4, 4, 256] + - [270, 387.0] + - - [4, 3584, 1, 3328, 4, 4, 4, 3328] + - [270, 607.0] + - - [4, 2368, 1, 256, 4, 4, 4, 256] + - [264, 289.0] + - - [4, 5888, 1, 128, 4, 4, 4, 128] + - [264, 366.0] + - - [4, 1408, 1, 128, 4, 4, 4, 128] + - [167, 123.0] + - - [16, 2000, 1, 2048, 16, 16, 16, 2048] + - [173, 1628.0] + - - [2, 2048, 1, 2000, 2, 2, 2, 2000] + - [173, 244.0] + - - [32, 2000, 1, 2048, 32, 32, 32, 2048] + - [143, 2456.0] + - - [10, 2000, 1, 1024, 10, 10, 10, 1024] + - [167, 961.0] + - - [2, 2000, 1, 100, 2, 2, 2, 100] + - [141, 70.0] + - - [10, 2000, 1, 512, 10, 10, 10, 512] + - [147, 868.0] + - - [32, 2000, 1, 500, 32, 32, 32, 500] + - [267, 2049.0] + - - [32, 2000, 1, 1024, 32, 32, 32, 1024] + - [169, 2433.0] + - - [4, 2048, 1, 500, 4, 4, 4, 500] + - [183, 446.0] + - - [16, 2000, 1, 500, 16, 16, 16, 500] + - [173, 1361.0] + - - [4, 2048, 1, 100, 4, 4, 4, 100] + - [196, 142.0] + - - [16, 2000, 1, 100, 16, 16, 16, 100] + - [173, 559.0] + - - [4, 2000, 1, 10, 4, 4, 4, 10] + - [263, 20.0] + - - [10, 2000, 1, 10, 10, 10, 10, 10] + - [153, 51.0] + - - [2, 2048, 1, 512, 2, 2, 2, 512] + - [275, 169.0] + - - [10, 2048, 1, 100, 10, 10, 10, 100] + - [167, 354.0] + - - [8, 2048, 1, 100, 8, 8, 8, 100] + - [153, 297.0] + - - [2, 2048, 1, 1024, 2, 2, 2, 1024] + - [270, 201.0] + - - [16, 2000, 1, 1024, 16, 16, 16, 1024] + - [147, 1698.0] + - - [10, 2000, 1, 2000, 10, 10, 10, 2000] + - [183, 1186.0] + - - [8, 2000, 1, 500, 8, 8, 8, 500] + - [133, 693.0] + - - [16, 2000, 1, 2000, 16, 16, 16, 2000] + - [147, 1893.0] + - - [10, 2048, 1, 2048, 10, 10, 10, 2048] + - [173, 1226.0] + - - [8, 2000, 1, 512, 8, 8, 8, 512] + - [173, 699.0] + - - [2, 2000, 1, 2048, 2, 2, 2, 2048] + - [147, 239.0] + - - [16, 2048, 1, 500, 16, 16, 16, 500] + - [183, 1422.0] + - - [8, 2048, 1, 1024, 8, 8, 8, 1024] + - [147, 860.0] + - - [2, 2000, 1, 500, 2, 2, 2, 500] + - [141, 163.0] + - - [32, 2048, 1, 100, 32, 32, 32, 100] + - [274, 1622.0] + - - [10, 2048, 1, 500, 10, 10, 10, 500] + - [167, 813.0] + - - [4, 2000, 1, 2048, 4, 4, 4, 2048] + - [272, 418.0] + - - [8, 2000, 1, 1024, 8, 8, 8, 1024] + - [264, 832.0] + - - [32, 2048, 1, 512, 32, 32, 32, 512] + - [266, 2489.0] + - - [32, 2048, 1, 1024, 32, 32, 32, 1024] + - [277, 2390.0] + - - [32, 2048, 1, 500, 32, 32, 32, 500] + - [155, 2347.0] + - - [10, 2048, 1, 1024, 10, 10, 10, 1024] + - [276, 1084.0] + - - [8, 2048, 1, 2048, 8, 8, 8, 2048] + - [141, 868.0] + - - [16, 2048, 1, 2048, 16, 16, 16, 2048] + - [147, 1974.0] + - - [8, 2000, 1, 10, 8, 8, 8, 10] + - [196, 42.0] + - - [4, 2000, 1, 2000, 4, 4, 4, 2000] + - [269, 425.0] + - - [8, 2048, 1, 512, 8, 8, 8, 512] + - [264, 719.0] + - - [8, 2000, 1, 2048, 8, 8, 8, 2048] + - [268, 842.0] + - - [32, 2048, 1, 2000, 32, 32, 32, 2000] + - [276, 2959.0] + - - [16, 2000, 1, 10, 16, 16, 16, 10] + - [153, 82.0] + - - [8, 2048, 1, 2000, 8, 8, 8, 2000] + - [269, 879.0] + - - [4, 2048, 1, 2048, 4, 4, 4, 2048] + - [278, 427.0] + - - [10, 2048, 1, 2000, 10, 10, 10, 2000] + - [178, 1064.0] + - - [8, 2000, 1, 100, 8, 8, 8, 100] + - [169, 412.0] + - - [2, 2000, 1, 2000, 2, 2, 2, 2000] + - [273, 224.0] + - - [16, 2048, 1, 1024, 16, 16, 16, 1024] + - [167, 1567.0] + - - [32, 2000, 1, 2000, 32, 32, 32, 2000] + - [262, 2546.0] + - - [32, 2048, 1, 2048, 32, 32, 32, 2048] + - [277, 2538.0] + - - [2, 2048, 1, 10, 2, 2, 2, 10] + - [261, 10.0] + - - [4, 2048, 1, 512, 4, 4, 4, 512] + - [167, 354.0] + - - [4, 2048, 1, 10, 4, 4, 4, 10] + - [173, 22.0] + - - [16, 2048, 1, 100, 16, 16, 16, 100] + - [167, 579.0] + - - [4, 2000, 1, 500, 4, 4, 4, 500] + - [190, 321.0] + - - [10, 2000, 1, 500, 10, 10, 10, 500] + - [183, 858.0] + - - [32, 2000, 1, 512, 32, 32, 32, 512] + - [143, 2043.0] + - - [2, 2000, 1, 1024, 2, 2, 2, 1024] + - [264, 210.0] + - - [2, 2000, 1, 512, 2, 2, 2, 512] + - [270, 172.0] + - - [4, 2048, 1, 1024, 4, 4, 4, 1024] + - [196, 435.0] + - - [8, 2048, 1, 500, 8, 8, 8, 500] + - [183, 692.0] + - - [4, 2048, 1, 2000, 4, 4, 4, 2000] + - [173, 488.0] + - - [8, 2000, 1, 2000, 8, 8, 8, 2000] + - [159, 950.0] + - - [4, 2000, 1, 1024, 4, 4, 4, 1024] + - [190, 435.0] + - - [32, 2000, 1, 100, 32, 32, 32, 100] + - [131, 1509.0] + - - [2, 2048, 1, 100, 2, 2, 2, 100] + - [141, 72.0] + - - [8, 2048, 1, 10, 8, 8, 8, 10] + - [153, 40.0] + - - [2, 2048, 1, 2048, 2, 2, 2, 2048] + - [275, 219.0] + - - [10, 2000, 1, 2048, 10, 10, 10, 2048] + - [173, 1192.0] + - - [16, 2048, 1, 2000, 16, 16, 16, 2000] + - [127, 1772.0] + - - [10, 2048, 1, 512, 10, 10, 10, 512] + - [141, 869.0] + - - [16, 2048, 1, 512, 16, 16, 16, 512] + - [167, 1387.0] + - - [2, 2000, 1, 10, 2, 2, 2, 10] + - [129, 10.0] + - - [4, 2000, 1, 100, 4, 4, 4, 100] + - [271, 140.0] + - - [16, 2000, 1, 512, 16, 16, 16, 512] + - [147, 1412.0] + - - [32, 2048, 1, 10, 32, 32, 32, 10] + - [155, 155.0] + - - [10, 2048, 1, 10, 10, 10, 10, 10] + - [133, 52.0] + - - [4, 2000, 1, 512, 4, 4, 4, 512] + - [275, 335.0] + - - [16, 2048, 1, 10, 16, 16, 16, 10] + - [127, 81.0] + - - [32, 2000, 1, 10, 32, 32, 32, 10] + - [145, 327.0] + - - [10, 2000, 1, 100, 10, 10, 10, 100] + - [167, 345.0] + - - [2, 2048, 1, 500, 2, 2, 2, 500] + - [261, 189.0] + - - [1024, 1, 1, 500000, 1024, 1024, 1024, 500000] + - [279, 152.0] + - - [1024, 16, 1, 500000, 1024, 1024, 1024, 500000] + - [284, 2383.0] + - - [1024, 2, 1, 500000, 1024, 1024, 1024, 500000] + - [279, 303.0] + - - [512, 1, 1, 500000, 512, 512, 512, 500000] + - [283, 122.0] + - - [1024, 8, 1, 500000, 1024, 1024, 1024, 500000] + - [284, 1201.0] + - - [1024, 4, 1, 500000, 1024, 1024, 1024, 500000] + - [281, 609.0] + - - [512, 16, 1, 500000, 512, 512, 512, 500000] + - [285, 1910.0] + - - [512, 2, 1, 500000, 512, 512, 512, 500000] + - [286, 244.0] + - - [512, 8, 1, 500000, 512, 512, 512, 500000] + - [283, 960.0] + - - [512, 4, 1, 500000, 512, 512, 512, 500000] + - [286, 484.0] + - - [1024, 20, 1, 30522, 1024, 1024, 1024, 30522] + - [280, 2608.0] + - - [49, 512, 1, 4608, 49, 49, 49, 4608] + - [282, 2706.0] + - - [64, 512, 1, 1, 64, 64, 64, 1] + - [134, 18.0] + - - [1024, 32, 1, 2, 1024, 1024, 1024, 2] + - [134, 44.0] + - - [1024, 32, 1, 1024, 1024, 1024, 1024, 1024] + - [148, 1997.0] + - - [768, 32, 1, 768, 768, 768, 768, 768] + - [174, 1430.0] + - - [768, 32, 1, 2, 768, 768, 768, 2] + - [131, 13.0] + - - [768, 64, 1, 768, 768, 768, 768, 768] + - [169, 2195.0] + - - [768, 64, 1, 2, 768, 768, 768, 2] + - [174, 26.0] + - - [1024, 20, 1, 1024, 1024, 1024, 1024, 1024] + - [134, 1148.0] + - - [1024, 80, 1, 1024, 1024, 1024, 1024, 1024] + - [156, 3009.0] + - - [32, 200, 1, 1, 32, 32, 32, 1] + - [127, 2.0] + - - [1024, 4, 1, 1024, 1024, 1024, 1024, 1024] + - [188, 277.0] + - - [1024, 4, 1, 2, 1024, 1024, 1024, 2] + - [127, 2.0] + - - [768, 16, 1, 768, 768, 768, 768, 768] + - [152, 766.0] + - - [768, 16, 1, 2, 768, 768, 768, 2] + - [127, 6.0] + - - [768, 8, 1, 768, 768, 768, 768, 768] + - [164, 376.0] + - - [1024, 6, 1, 1024, 1024, 1024, 1024, 1024] + - [188, 414.0] + - - [1024, 6, 1, 2, 1024, 1024, 1024, 2] + - [127, 3.0] + - - [1024, 8, 1, 1024, 1024, 1024, 1024, 1024] + - [188, 553.0] + - - [4, 704, 1, 1280, 4, 4, 4, 1280] + - [173, 195.0] + - - [512, 4, 1, 512, 512, 512, 512, 512] + - [184, 110.0] + - - [64, 4, 1, 256, 64, 64, 64, 256] + - [133, 9.0] + - - [64, 704, 1, 128, 64, 64, 64, 128] + - [201, 1109.0] + - - [448, 64, 1, 1280, 448, 448, 448, 1280] + - [148, 1936.0] + - - [128, 4, 1, 1280, 128, 128, 128, 1280] + - [148, 39.0] + - - [128, 256, 1, 256, 128, 128, 128, 256] + - [192, 1116.0] + - - [64, 1024, 1, 1280, 64, 64, 64, 1280] + - [144, 2981.0] + - - [64, 704, 1, 1280, 64, 64, 64, 1280] + - [192, 2225.0] + - - [64, 64, 1, 1280, 64, 64, 64, 1280] + - [139, 314.0] + - - [1024, 64, 1, 128, 1024, 1024, 1024, 128] + - [180, 1237.0] + - - [64, 1024, 1, 3328, 64, 64, 64, 3328] + - [135, 3211.0] + - - [128, 1, 1, 1408, 128, 128, 128, 1408] + - [134, 10.0] + - - [1024, 64, 1, 1280, 1024, 1024, 1024, 1280] + - [135, 2789.0] + - - [704, 4, 1, 1280, 704, 704, 704, 1280] + - [188, 200.0] + - - [64, 256, 1, 128, 64, 64, 64, 128] + - [143, 376.0] + - - [256, 256, 1, 3328, 256, 256, 256, 3328] + - [175, 3094.0] + - - [64, 1024, 1, 128, 64, 64, 64, 128] + - [190, 1601.0] + - - [128, 256, 1, 3328, 128, 128, 128, 3328] + - [160, 2222.0] + - - [64, 448, 1, 1280, 64, 64, 64, 1280] + - [184, 2227.0] + - - [448, 4, 1, 256, 448, 448, 448, 256] + - [147, 96.0] + - - [256, 4, 1, 1280, 256, 256, 256, 1280] + - [152, 89.0] + - - [512, 32, 1, 512, 512, 512, 512, 512] + - [159, 1116.0] + - - [64, 64, 1, 3328, 64, 64, 64, 3328] + - [188, 412.0] + - - [512, 1, 1, 512, 512, 512, 512, 512] + - [147, 36.0] + - - [704, 64, 1, 3328, 704, 704, 704, 3328] + - [155, 2460.0] + - - [256, 4, 1, 256, 256, 256, 256, 256] + - [134, 56.0] + - - [256, 64, 1, 1280, 256, 256, 256, 1280] + - [197, 1400.0] + - - [1024, 4, 1, 256, 1024, 1024, 1024, 256] + - [160, 221.0] + - - [4, 704, 1, 256, 4, 4, 4, 256] + - [197, 151.0] + - - [704, 64, 1, 1280, 704, 704, 704, 1280] + - [134, 2344.0] + - - [128, 448, 1, 256, 128, 128, 128, 256] + - [144, 2258.0] + - - [128, 256, 1, 1280, 128, 128, 128, 1280] + - [160, 2116.0] + - - [448, 64, 1, 3328, 448, 448, 448, 3328] + - [160, 2328.0] + - - [256, 128, 1, 128, 256, 256, 256, 128] + - [136, 1146.0] + - - [4, 448, 1, 128, 4, 4, 4, 128] + - [134, 64.0] + - - [64, 128, 1, 3328, 64, 64, 64, 3328] + - [139, 819.0] + - - [128, 128, 1, 3328, 128, 128, 128, 3328] + - [134, 1507.0] + - - [256, 128, 1, 256, 256, 256, 256, 256] + - [184, 1601.0] + - - [64, 1, 1, 1216, 64, 64, 64, 1216] + - [164, 6.0] + - - [1024, 4, 1, 3328, 1024, 1024, 1024, 3328] + - [139, 368.0] + - - [4, 4, 1, 256, 4, 4, 4, 256] + - [130, 1.0] + - - [256, 64, 1, 256, 256, 256, 256, 256] + - [148, 945.0] + - - [256, 128, 1, 1280, 256, 256, 256, 1280] + - [184, 2110.0] + - - [128, 64, 1, 1280, 128, 128, 128, 1280] + - [164, 726.0] + - - [4, 448, 1, 3328, 4, 4, 4, 3328] + - [184, 166.0] + - - [64, 1024, 1, 256, 64, 64, 64, 256] + - [169, 2249.0] + - - [64, 704, 1, 256, 64, 64, 64, 256] + - [192, 1393.0] + - - [704, 64, 1, 128, 704, 704, 704, 128] + - [169, 971.0] + - - [448, 4, 1, 1280, 448, 448, 448, 1280] + - [188, 135.0] + - - [1024, 2, 1, 512, 1024, 1024, 1024, 512] + - [134, 105.0] + - - [256, 64, 1, 3328, 256, 256, 256, 3328] + - [184, 1418.0] + - - [448, 128, 1, 256, 448, 448, 448, 256] + - [129, 1727.0] + - - [448, 64, 1, 128, 448, 448, 448, 128] + - [192, 658.0] + - - [4, 448, 1, 256, 4, 4, 4, 256] + - [167, 65.0] + - - [64, 704, 1, 3328, 64, 64, 64, 3328] + - [143, 2446.0] + - - [256, 256, 1, 256, 256, 256, 256, 256] + - [144, 1812.0] + - - [4, 1024, 1, 3328, 4, 4, 4, 3328] + - [147, 322.0] + - - [4, 704, 1, 128, 4, 4, 4, 128] + - [192, 64.0] + - - [64, 128, 1, 128, 64, 64, 64, 128] + - [174, 191.0] + - - [704, 4, 1, 128, 704, 704, 704, 128] + - [147, 63.0] + - - [64, 448, 1, 3328, 64, 64, 64, 3328] + - [134, 2260.0] + - - [448, 4, 1, 3328, 448, 448, 448, 3328] + - [177, 177.0] + - - [256, 4, 1, 3328, 256, 256, 256, 3328] + - [163, 95.0] + - - [4, 256, 1, 256, 4, 4, 4, 256] + - [148, 56.0] + - - [4, 64, 1, 1280, 4, 4, 4, 1280] + - [138, 19.0] + - - [4, 4, 1, 128, 4, 4, 4, 128] + - [199, 0.36] + - - [4, 128, 1, 256, 4, 4, 4, 256] + - [166, 21.0] + - - [448, 128, 1, 3328, 448, 448, 448, 3328] + - [149, 2724.0] + - - [64, 448, 1, 256, 64, 64, 64, 256] + - [148, 1028.0] + - - [64, 256, 1, 1280, 64, 64, 64, 1280] + - [148, 1216.0] + - - [1024, 32, 1, 512, 1024, 1024, 1024, 512] + - [184, 1522.0] + - - [64, 4, 1, 128, 64, 64, 64, 128] + - [160, 6.0] + - - [256, 64, 1, 128, 256, 256, 256, 128] + - [174, 394.0] + - - [64, 64, 1, 256, 64, 64, 64, 256] + - [180, 189.0] + - - [4, 704, 1, 3328, 4, 4, 4, 3328] + - [200, 226.0] + - - [4, 4, 1, 1280, 4, 4, 4, 1280] + - [127, 1.0] + - - [128, 128, 1, 128, 128, 128, 128, 128] + - [173, 383.0] + - - [1024, 4, 1, 128, 1024, 1024, 1024, 128] + - [134, 91.0] + - - [4, 64, 1, 128, 4, 4, 4, 128] + - [141, 6.0] + - - [64, 128, 1, 1280, 64, 64, 64, 1280] + - [139, 629.0] + - - [128, 128, 1, 1280, 128, 128, 128, 1280] + - [174, 1231.0] + - - [512, 2, 1, 512, 512, 512, 512, 512] + - [133, 54.0] + - - [64, 128, 1, 256, 64, 64, 64, 256] + - [148, 319.0] + - - [1024, 4, 1, 1280, 1024, 1024, 1024, 1280] + - [139, 295.0] + - - [35, 700, 1, 2048, 35, 35, 35, 2048] + - [143, 1254.0] + - - [704, 64, 1, 256, 704, 704, 704, 256] + - [127, 1775.0] + - - [128, 448, 1, 1280, 128, 128, 128, 1280] + - [191, 2723.0] + - - [128, 64, 1, 3328, 128, 128, 128, 3328] + - [188, 766.0] + - - [448, 64, 1, 256, 448, 448, 448, 256] + - [192, 1055.0] + - - [1024, 16, 1, 512, 1024, 1024, 1024, 512] + - [139, 870.0] + - - [4, 256, 1, 128, 4, 4, 4, 128] + - [167, 24.0] + - - [512, 16, 1, 512, 512, 512, 512, 512] + - [148, 438.0] + - - [1024, 64, 1, 256, 1024, 1024, 1024, 256] + - [129, 1712.0] + - - [4, 4, 1, 3328, 4, 4, 4, 3328] + - [127, 1.0] + - - [4, 1024, 1, 1280, 4, 4, 4, 1280] + - [173, 277.0] + - - [704, 4, 1, 256, 704, 704, 704, 256] + - [184, 101.0] + - - [128, 64, 1, 256, 128, 128, 128, 256] + - [173, 314.0] + - - [128, 4, 1, 3328, 128, 128, 128, 3328] + - [139, 48.0] + - - [128, 4, 1, 128, 128, 128, 128, 128] + - [131, 11.0] + - - [128, 1, 1, 1024, 128, 128, 128, 1024] + - [133, 9.0] + - - [4, 128, 1, 3328, 4, 4, 4, 3328] + - [176, 48.0] + - - [256, 256, 1, 128, 256, 256, 256, 128] + - [169, 1252.0] + - - [704, 4, 1, 3328, 704, 704, 704, 3328] + - [139, 241.0] + - - [448, 128, 1, 1280, 448, 448, 448, 1280] + - [193, 2581.0] + - - [1024, 64, 1, 3328, 1024, 1024, 1024, 3328] + - [175, 3096.0] + - - [256, 4, 1, 128, 256, 256, 256, 128] + - [134, 23.0] + - - [4, 1024, 1, 128, 4, 4, 4, 128] + - [190, 93.0] + - - [64, 256, 1, 3328, 64, 64, 64, 3328] + - [160, 1418.0] + - - [448, 128, 1, 128, 448, 448, 448, 128] + - [192, 1195.0] + - - [128, 256, 1, 128, 128, 128, 128, 128] + - [184, 746.0] + - - [128, 4, 1, 256, 128, 128, 128, 256] + - [160, 19.0] + - - [256, 256, 1, 1280, 256, 256, 256, 1280] + - [193, 3089.0] + - - [256, 128, 1, 3328, 256, 256, 256, 3328] + - [197, 2122.0] + - - [4, 448, 1, 1280, 4, 4, 4, 1280] + - [148, 133.0] + - - [448, 4, 1, 128, 448, 448, 448, 128] + - [148, 40.0] + - - [4, 256, 1, 3328, 4, 4, 4, 3328] + - [164, 95.0] + - - [4, 128, 1, 128, 4, 4, 4, 128] + - [145, 12.0] + - - [4, 256, 1, 1280, 4, 4, 4, 1280] + - [148, 78.0] + - - [64, 4, 1, 3328, 64, 64, 64, 3328] + - [138, 24.0] + - - [4, 64, 1, 3328, 4, 4, 4, 3328] + - [138, 24.0] + - - [35, 700, 1, 2560, 35, 35, 35, 2560] + - [133, 1511.0] + - - [4, 1024, 1, 256, 4, 4, 4, 256] + - [167, 143.0] + - - [64, 256, 1, 256, 64, 64, 64, 256] + - [148, 613.0] + - - [1024, 4, 1, 512, 1024, 1024, 1024, 512] + - [174, 216.0] + - - [4, 64, 1, 256, 4, 4, 4, 256] + - [127, 11.0] + - - [128, 448, 1, 128, 128, 128, 128, 128] + - [155, 1161.0] + - - [64, 448, 1, 128, 64, 64, 64, 128] + - [134, 672.0] + - - [128, 448, 1, 3328, 128, 128, 128, 3328] + - [130, 2726.0] + - - [4, 128, 1, 1280, 4, 4, 4, 1280] + - [177, 44.0] + - - [128, 64, 1, 128, 128, 128, 128, 128] + - [174, 190.0] + - - [64, 64, 1, 128, 64, 64, 64, 128] + - [194, 99.0] + - - [64, 4, 1, 1280, 64, 64, 64, 1280] + - [133, 19.0] + - - [1024, 1, 1, 512, 1024, 1024, 1024, 512] + - [160, 53.0] + - - [128, 128, 1, 256, 128, 128, 128, 256] + - [148, 622.0] + - - [64, 12, 5040, 12, 64, 64, 64, 12] + - [193, 1713.0] + - - [64, 17, 3632, 17, 64, 64, 64, 17] + - [130, 2313.0] + - - [64, 19, 3264, 19, 64, 64, 64, 19] + - [144, 2808.0] + - - [64, 9, 6544, 9, 64, 64, 64, 9] + - [130, 1067.0] + - - [64, 7, 8192, 7, 64, 64, 64, 7] + - [130, 766.0] + - - [64, 16, 3840, 16, 64, 64, 64, 16] + - [130, 2372.0] + - - [64, 8, 7280, 8, 64, 64, 64, 8] + - [181, 969.0] + - - [64, 27, 2336, 27, 64, 64, 64, 27] + - [193, 3986.0] + - - [64, 11, 5456, 11, 64, 64, 64, 11] + - [181, 1425.0] + - - [64, 21, 2976, 21, 64, 64, 64, 21] + - [130, 2966.0] + - - [64, 10, 5952, 10, 64, 64, 64, 10] + - [170, 1184.0] + - - [64, 14, 4368, 14, 64, 64, 64, 14] + - [156, 2013.0] + - - [64, 25, 2512, 25, 64, 64, 64, 25] + - [193, 3758.0] + - - [64, 13, 4672, 13, 64, 64, 64, 13] + - [144, 1867.0] + - - [64, 15, 4096, 15, 64, 64, 64, 15] + - [170, 2251.0] + - - [64, 29, 2176, 29, 64, 64, 64, 29] + - [130, 4341.0] + - - [64, 18, 3440, 18, 64, 64, 64, 18] + - [130, 2291.0] + - - [64, 23, 2720, 23, 64, 64, 64, 23] + - [156, 3431.0] + - - [8, 500, 1, 512, 8, 8, 8, 512] + - [174, 292.0] + - - [32, 512, 1, 512, 32, 32, 32, 512] + - [148, 1237.0] + - - [8, 512, 1, 500, 8, 8, 8, 500] + - [184, 218.0] + - - [8, 500, 1, 1024, 8, 8, 8, 1024] + - [197, 285.0] + - - [64, 1024, 1, 100, 64, 64, 64, 100] + - [129, 1122.0] + - - [64, 1024, 1, 500, 64, 64, 64, 500] + - [170, 2344.0] + - - [64, 1024, 1, 1024, 64, 64, 64, 1024] + - [144, 3020.0] + - - [2, 500, 1, 2048, 2, 2, 2, 2048] + - [174, 91.0] + - - [16, 512, 1, 10, 16, 16, 16, 10] + - [133, 22.0] + - - [8, 512, 1, 10, 8, 8, 8, 10] + - [160, 11.0] + - - [16, 500, 1, 2048, 16, 16, 16, 2048] + - [147, 696.0] + - - [10, 100, 1, 500, 10, 10, 10, 500] + - [183, 66.0] + - - [16, 100, 1, 10, 16, 16, 16, 10] + - [127, 4.0] + - - [2, 100, 1, 2000, 2, 2, 2, 2000] + - [138, 17.0] + - - [256, 100, 1, 2048, 256, 256, 256, 2048] + - [134, 1889.0] + - - [2, 512, 1, 512, 2, 2, 2, 512] + - [174, 55.0] + - - [2, 100, 1, 10, 2, 2, 2, 10] + - [151, 1.0] + - - [200, 100, 1, 100, 200, 200, 200, 100] + - [167, 368.0] + - - [500, 100, 1, 100, 500, 500, 500, 100] + - [129, 842.0] + - - [4, 100, 1, 10, 4, 4, 4, 10] + - [127, 1.0] + - - [32, 100, 1, 512, 32, 32, 32, 512] + - [147, 171.0] + - - [16, 1024, 1, 512, 16, 16, 16, 512] + - [147, 824.0] + - - [4, 1024, 1, 1024, 4, 4, 4, 1024] + - [196, 256.0] + - - [4, 512, 1, 10, 4, 4, 4, 10] + - [127, 5.0] + - - [128, 100, 1, 10, 128, 128, 128, 10] + - [131, 31.0] + - - [4, 512, 1, 2048, 4, 4, 4, 2048] + - [148, 165.0] + - - [10, 1024, 1, 2000, 10, 10, 10, 2000] + - [133, 709.0] + - - [256, 100, 1, 100, 256, 256, 256, 100] + - [184, 766.0] + - - [64, 1024, 1, 2048, 64, 64, 64, 2048] + - [149, 2972.0] + - - [16, 1024, 1, 100, 16, 16, 16, 100] + - [150, 295.0] + - - [32, 1024, 1, 1024, 32, 32, 32, 1024] + - [148, 1796.0] + - - [8, 100, 1, 500, 8, 8, 8, 500] + - [174, 42.0] + - - [10, 512, 1, 512, 10, 10, 10, 512] + - [148, 281.0] + - - [8, 500, 1, 10, 8, 8, 8, 10] + - [129, 10.0] + - - [16, 1024, 1, 10, 16, 16, 16, 10] + - [176, 83.0] + - - [16, 512, 1, 2048, 16, 16, 16, 2048] + - [196, 711.0] + - - [128, 512, 1, 2048, 128, 128, 128, 2048] + - [149, 3126.0] + - - [128, 512, 1, 100, 128, 128, 128, 100] + - [155, 1689.0] + - - [64, 500, 1, 2048, 64, 64, 64, 2048] + - [197, 2114.0] + - - [500, 100, 1, 10, 500, 500, 500, 10] + - [129, 275.0] + - - [64, 100, 1, 2048, 64, 64, 64, 2048] + - [188, 602.0] + - - [64, 100, 1, 10, 64, 64, 64, 10] + - [184, 39.0] + - - [16, 512, 1, 500, 16, 16, 16, 500] + - [134, 557.0] + - - [200, 100, 1, 2000, 200, 200, 200, 2000] + - [160, 1641.0] + - - [2, 100, 1, 512, 2, 2, 2, 512] + - [186, 13.0] + - - [32, 512, 1, 100, 32, 32, 32, 100] + - [139, 443.0] + - - [16, 512, 1, 1024, 16, 16, 16, 1024] + - [196, 659.0] + - - [4, 1024, 1, 512, 4, 4, 4, 512] + - [173, 267.0] + - - [2, 500, 1, 500, 2, 2, 2, 500] + - [197, 68.0] + - - [32, 100, 1, 100, 32, 32, 32, 100] + - [139, 84.0] + - - [100, 500, 1, 2000, 100, 100, 100, 2000] + - [181, 2349.0] + - - [10, 512, 1, 10, 10, 10, 10, 10] + - [141, 28.0] + - - [100, 500, 1, 2048, 100, 100, 100, 2048] + - [149, 2373.0] + - - [2, 100, 1, 1024, 2, 2, 2, 1024] + - [148, 17.0] + - - [32, 512, 1, 1024, 32, 32, 32, 1024] + - [196, 1289.0] + - - [256, 100, 1, 1024, 256, 256, 256, 1024] + - [143, 1734.0] + - - [128, 100, 1, 100, 128, 128, 128, 100] + - [167, 427.0] + - - [32, 512, 1, 10, 32, 32, 32, 10] + - [160, 98.0] + - - [128, 100, 1, 1024, 128, 128, 128, 1024] + - [134, 1008.0] + - - [16, 500, 1, 2000, 16, 16, 16, 2000] + - [164, 681.0] + - - [64, 500, 1, 500, 64, 64, 64, 500] + - [160, 1802.0] + - - [128, 512, 1, 1024, 128, 128, 128, 1024] + - [144, 3372.0] + - - [128, 512, 1, 2000, 128, 128, 128, 2000] + - [156, 3195.0] + - - [2, 512, 1, 10, 2, 2, 2, 10] + - [136, 6.0] + - - [10, 512, 1, 500, 10, 10, 10, 500] + - [134, 346.0] + - - [4, 1024, 1, 2000, 4, 4, 4, 2000] + - [196, 309.0] + - - [256, 100, 1, 2000, 256, 256, 256, 2000] + - [134, 2063.0] + - - [100, 100, 1, 10, 100, 100, 100, 10] + - [127, 24.0] + - - [128, 512, 1, 10, 128, 128, 128, 10] + - [174, 352.0] + - - [256, 100, 1, 500, 256, 256, 256, 500] + - [160, 1654.0] + - - [64, 100, 1, 512, 64, 64, 64, 512] + - [148, 472.0] + - - [64, 512, 1, 500, 64, 64, 64, 500] + - [134, 1883.0] + - - [8, 100, 1, 512, 8, 8, 8, 512] + - [148, 57.0] + - - [32, 100, 1, 500, 32, 32, 32, 500] + - [197, 217.0] + - - [32, 500, 1, 2048, 32, 32, 32, 2048] + - [152, 1347.0] + - - [128, 500, 1, 2000, 128, 128, 128, 2000] + - [130, 3083.0] + - - [8, 1024, 1, 10, 8, 8, 8, 10] + - [134, 20.0] + - - [2, 500, 1, 100, 2, 2, 2, 100] + - [141, 32.0] + - - [10, 500, 1, 512, 10, 10, 10, 512] + - [174, 360.0] + - - [32, 500, 1, 500, 32, 32, 32, 500] + - [134, 1087.0] + - - [100, 500, 1, 100, 100, 100, 100, 100] + - [167, 1225.0] + - - [10, 1024, 1, 512, 10, 10, 10, 512] + - [147, 669.0] + - - [512, 100, 1, 512, 512, 512, 512, 512] + - [193, 2237.0] + - - [4, 500, 1, 500, 4, 4, 4, 500] + - [134, 136.0] + - - [64, 100, 1, 1024, 64, 64, 64, 1024] + - [196, 529.0] + - - [2, 500, 1, 2000, 2, 2, 2, 2000] + - [139, 86.0] + - - [32, 512, 1, 2048, 32, 32, 32, 2048] + - [148, 1432.0] + - - [10, 100, 1, 2000, 10, 10, 10, 2000] + - [138, 93.0] + - - [4, 100, 1, 512, 4, 4, 4, 512] + - [197, 29.0] + - - [2, 512, 1, 2048, 2, 2, 2, 2048] + - [148, 93.0] + - - [100, 100, 1, 2000, 100, 100, 100, 2000] + - [174, 880.0] + - - [10, 500, 1, 500, 10, 10, 10, 500] + - [134, 338.0] + - - [2, 100, 1, 2048, 2, 2, 2, 2048] + - [138, 19.0] + - - [32, 100, 1, 2048, 32, 32, 32, 2048] + - [139, 302.0] + - - [16, 100, 1, 1024, 16, 16, 16, 1024] + - [197, 137.0] + - - [2, 500, 1, 10, 2, 2, 2, 10] + - [133, 3.0] + - - [500, 100, 1, 2048, 500, 500, 500, 2048] + - [135, 2378.0] + - - [16, 1024, 1, 2000, 16, 16, 16, 2000] + - [133, 1141.0] + - - [10, 1024, 1, 1024, 10, 10, 10, 1024] + - [173, 636.0] + - - [500, 100, 1, 512, 500, 500, 500, 512] + - [192, 1847.0] + - - [32, 512, 1, 500, 32, 32, 32, 500] + - [134, 843.0] + - - [100, 500, 1, 512, 100, 100, 100, 512] + - [193, 1888.0] + - - [8, 500, 1, 2000, 8, 8, 8, 2000] + - [139, 310.0] + - - [4, 100, 1, 1024, 4, 4, 4, 1024] + - [148, 29.0] + - - [2, 500, 1, 1024, 2, 2, 2, 1024] + - [148, 70.0] + - - [100, 500, 1, 1024, 100, 100, 100, 1024] + - [144, 2282.0] + - - [32, 100, 1, 1024, 32, 32, 32, 1024] + - [164, 224.0] + - - [64, 100, 1, 2000, 64, 64, 64, 2000] + - [139, 536.0] + - - [64, 500, 1, 10, 64, 64, 64, 10] + - [155, 168.0] + - - [64, 500, 1, 512, 64, 64, 64, 512] + - [192, 1500.0] + - - [10, 100, 1, 1024, 10, 10, 10, 1024] + - [174, 72.0] + - - [16, 512, 1, 100, 16, 16, 16, 100] + - [171, 153.0] + - - [4, 100, 1, 2000, 4, 4, 4, 2000] + - [138, 34.0] + - - [2, 512, 1, 1024, 2, 2, 2, 1024] + - [174, 72.0] + - - [64, 512, 1, 1024, 64, 64, 64, 1024] + - [192, 1902.0] + - - [512, 100, 1, 2048, 512, 512, 512, 2048] + - [155, 2364.0] + - - [32, 100, 1, 2000, 32, 32, 32, 2000] + - [139, 270.0] + - - [4, 512, 1, 500, 4, 4, 4, 500] + - [183, 134.0] + - - [4, 500, 1, 1024, 4, 4, 4, 1024] + - [174, 139.0] + - - [32, 100, 1, 10, 32, 32, 32, 10] + - [190, 17.0] + - - [10, 1024, 1, 2048, 10, 10, 10, 2048] + - [173, 730.0] + - - [8, 500, 1, 100, 8, 8, 8, 100] + - [171, 76.0] + - - [200, 100, 1, 1024, 200, 200, 200, 1024] + - [134, 1298.0] + - - [16, 100, 1, 100, 16, 16, 16, 100] + - [178, 46.0] + - - [8, 1024, 1, 2000, 8, 8, 8, 2000] + - [133, 572.0] + - - [4, 512, 1, 100, 4, 4, 4, 100] + - [141, 65.0] + - - [16, 500, 1, 100, 16, 16, 16, 100] + - [189, 188.0] + - - [8, 1024, 1, 2048, 8, 8, 8, 2048] + - [173, 580.0] + - - [16, 1024, 1, 2048, 16, 16, 16, 2048] + - [147, 1159.0] + - - [64, 512, 1, 100, 64, 64, 64, 100] + - [134, 999.0] + - - [2, 100, 1, 500, 2, 2, 2, 500] + - [133, 10.0] + - - [2, 500, 1, 512, 2, 2, 2, 512] + - [174, 54.0] + - - [128, 500, 1, 1024, 128, 128, 128, 1024] + - [193, 2908.0] + - - [10, 100, 1, 10, 10, 10, 10, 10] + - [127, 2.0] + - - [64, 1024, 1, 10, 64, 64, 64, 10] + - [138, 260.0] + - - [500, 100, 1, 500, 500, 500, 500, 500] + - [155, 1806.0] + - - [2, 512, 1, 100, 2, 2, 2, 100] + - [174, 31.0] + - - [16, 100, 1, 500, 16, 16, 16, 500] + - [160, 82.0] + - - [128, 100, 1, 500, 128, 128, 128, 500] + - [148, 660.0] + - - [512, 100, 1, 1024, 512, 512, 512, 1024] + - [193, 2326.0] + - - [16, 100, 1, 2000, 16, 16, 16, 2000] + - [138, 134.0] + - - [10, 512, 1, 100, 10, 10, 10, 100] + - [199, 92.0] + - - [8, 512, 1, 100, 8, 8, 8, 100] + - [153, 75.0] + - - [128, 100, 1, 2000, 128, 128, 128, 2000] + - [160, 1015.0] + - - [2, 1024, 1, 2000, 2, 2, 2, 2000] + - [133, 142.0] + - - [100, 512, 1, 512, 100, 100, 100, 512] + - [170, 1916.0] + - - [32, 1024, 1, 2000, 32, 32, 32, 2000] + - [184, 2018.0] + - - [128, 500, 1, 100, 128, 128, 128, 100] + - [141, 1435.0] + - - [100, 100, 1, 100, 100, 100, 100, 100] + - [202, 229.0] + - - [8, 512, 1, 1024, 8, 8, 8, 1024] + - [197, 341.0] + - - [200, 100, 1, 500, 200, 200, 200, 500] + - [160, 1292.0] + - - [2, 1024, 1, 2048, 2, 2, 2, 2048] + - [147, 158.0] + - - [512, 100, 1, 2000, 512, 512, 512, 2000] + - [180, 2496.0] + - - [16, 512, 1, 2000, 16, 16, 16, 2000] + - [148, 688.0] + - - [64, 500, 1, 1024, 64, 64, 64, 1024] + - [192, 2133.0] + - - [10, 512, 1, 1024, 10, 10, 10, 1024] + - [197, 421.0] + - - [512, 100, 1, 100, 512, 512, 512, 100] + - [184, 1384.0] + - - [8, 100, 1, 1024, 8, 8, 8, 1024] + - [164, 66.0] + - - [10, 100, 1, 100, 10, 10, 10, 100] + - [152, 26.0] + - - [10, 500, 1, 2000, 10, 10, 10, 2000] + - [164, 387.0] + - - [500, 100, 1, 2000, 500, 500, 500, 2000] + - [155, 2299.0] + - - [100, 512, 1, 2000, 100, 100, 100, 2000] + - [149, 2308.0] + - - [64, 1024, 1, 512, 64, 64, 64, 512] + - [170, 2558.0] + - - [32, 500, 1, 100, 32, 32, 32, 100] + - [141, 295.0] + - - [10, 100, 1, 2048, 10, 10, 10, 2048] + - [152, 85.0] + - - [64, 100, 1, 100, 64, 64, 64, 100] + - [167, 116.0] + - - [2, 1024, 1, 100, 2, 2, 2, 100] + - [129, 36.0] + - - [64, 500, 1, 2000, 64, 64, 64, 2000] + - [184, 1963.0] + - - [8, 512, 1, 512, 8, 8, 8, 512] + - [174, 221.0] + - - [8, 512, 1, 2048, 8, 8, 8, 2048] + - [174, 327.0] + - - [100, 100, 1, 1024, 100, 100, 100, 1024] + - [174, 697.0] + - - [8, 100, 1, 2000, 8, 8, 8, 2000] + - [138, 67.0] + - - [2, 1024, 1, 1024, 2, 2, 2, 1024] + - [147, 127.0] + - - [16, 512, 1, 512, 16, 16, 16, 512] + - [148, 436.0] + - - [32, 500, 1, 512, 32, 32, 32, 512] + - [197, 862.0] + - - [32, 500, 1, 1024, 32, 32, 32, 1024] + - [174, 1098.0] + - - [32, 500, 1, 10, 32, 32, 32, 10] + - [133, 39.0] + - - [4, 1024, 1, 500, 4, 4, 4, 500] + - [133, 196.0] + - - [256, 100, 1, 512, 256, 256, 256, 512] + - [160, 1285.0] + - - [8, 1024, 1, 500, 8, 8, 8, 500] + - [196, 396.0] + - - [4, 1024, 1, 100, 4, 4, 4, 100] + - [187, 106.0] + - - [100, 500, 1, 500, 100, 100, 100, 500] + - [144, 1746.0] + - - [2, 1024, 1, 500, 2, 2, 2, 500] + - [159, 99.0] + - - [64, 100, 1, 500, 64, 64, 64, 500] + - [174, 335.0] + - - [2, 512, 1, 500, 2, 2, 2, 500] + - [134, 52.0] + - - [10, 1024, 1, 500, 10, 10, 10, 500] + - [159, 496.0] + - - [128, 500, 1, 512, 128, 128, 128, 512] + - [144, 2371.0] + - - [10, 500, 1, 2048, 10, 10, 10, 2048] + - [197, 406.0] + - - [128, 512, 1, 512, 128, 128, 128, 512] + - [170, 2497.0] + - - [64, 512, 1, 10, 64, 64, 64, 10] + - [127, 171.0] + - - [32, 500, 1, 2000, 32, 32, 32, 2000] + - [164, 1238.0] + - - [100, 100, 1, 2048, 100, 100, 100, 2048] + - [174, 809.0] + - - [200, 100, 1, 512, 200, 200, 200, 512] + - [141, 1008.0] + - - [200, 100, 1, 2048, 200, 200, 200, 2048] + - [134, 1503.0] + - - [8, 100, 1, 10, 8, 8, 8, 10] + - [127, 2.0] + - - [100, 100, 1, 500, 100, 100, 100, 500] + - [148, 523.0] + - - [100, 500, 1, 10, 100, 100, 100, 10] + - [143, 116.0] + - - [10, 500, 1, 1024, 10, 10, 10, 1024] + - [148, 346.0] + - - [256, 100, 1, 10, 256, 256, 256, 10] + - [134, 62.0] + - - [10, 512, 1, 2048, 10, 10, 10, 2048] + - [174, 413.0] + - - [2, 1024, 1, 512, 2, 2, 2, 512] + - [147, 101.0] + - - [4, 500, 1, 2048, 4, 4, 4, 2048] + - [197, 178.0] + - - [100, 512, 1, 100, 100, 100, 100, 100] + - [155, 818.0] + - - [16, 500, 1, 512, 16, 16, 16, 512] + - [174, 425.0] + - - [10, 1024, 1, 100, 10, 10, 10, 100] + - [199, 188.0] + - - [8, 1024, 1, 100, 8, 8, 8, 100] + - [133, 146.0] + - - [64, 1024, 1, 2000, 64, 64, 64, 2000] + - [181, 3041.0] + - - [10, 100, 1, 512, 10, 10, 10, 512] + - [148, 53.0] + - - [4, 500, 1, 2000, 4, 4, 4, 2000] + - [197, 155.0] + - - [4, 100, 1, 100, 4, 4, 4, 100] + - [127, 7.0] + - - [32, 1024, 1, 512, 32, 32, 32, 512] + - [190, 1472.0] + - - [8, 512, 1, 2000, 8, 8, 8, 2000] + - [139, 318.0] + - - [100, 100, 1, 512, 100, 100, 100, 512] + - [148, 539.0] + - - [2, 512, 1, 2000, 2, 2, 2, 2000] + - [188, 80.0] + - - [16, 500, 1, 10, 16, 16, 16, 10] + - [140, 41.0] + - - [10, 500, 1, 100, 10, 10, 10, 100] + - [167, 89.0] + - - [4, 100, 1, 500, 4, 4, 4, 500] + - [134, 21.0] + - - [64, 500, 1, 100, 64, 64, 64, 100] + - [127, 909.0] + - - [2, 100, 1, 100, 2, 2, 2, 100] + - [127, 4.0] + - - [10, 512, 1, 2000, 10, 10, 10, 2000] + - [164, 395.0] + - - [8, 500, 1, 500, 8, 8, 8, 500] + - [184, 203.0] + - - [4, 500, 1, 512, 4, 4, 4, 512] + - [174, 107.0] + - - [10, 500, 1, 10, 10, 10, 10, 10] + - [131, 12.0] + - - [64, 512, 1, 2000, 64, 64, 64, 2000] + - [174, 2021.0] + - - [32, 512, 1, 2000, 32, 32, 32, 2000] + - [164, 1398.0] + - - [128, 500, 1, 2048, 128, 128, 128, 2048] + - [198, 2854.0] + - - [4, 512, 1, 512, 4, 4, 4, 512] + - [174, 109.0] + - - [16, 500, 1, 1024, 16, 16, 16, 1024] + - [174, 570.0] + - - [10, 1024, 1, 10, 10, 10, 10, 10] + - [136, 25.0] + - - [16, 500, 1, 500, 16, 16, 16, 500] + - [160, 405.0] + - - [500, 100, 1, 1024, 500, 500, 500, 1024] + - [144, 2220.0] + - - [16, 100, 1, 512, 16, 16, 16, 512] + - [174, 85.0] + - - [64, 512, 1, 2048, 64, 64, 64, 2048] + - [148, 2004.0] + - - [32, 1024, 1, 10, 32, 32, 32, 10] + - [127, 171.0] + - - [8, 1024, 1, 512, 8, 8, 8, 512] + - [147, 415.0] + - - [4, 1024, 1, 2048, 4, 4, 4, 2048] + - [173, 289.0] + - - [128, 500, 1, 500, 128, 128, 128, 500] + - [142, 2232.0] + - - [100, 512, 1, 1024, 100, 100, 100, 1024] + - [144, 2334.0] + - - [16, 1024, 1, 500, 16, 16, 16, 500] + - [183, 795.0] + - - [128, 100, 1, 2048, 128, 128, 128, 2048] + - [160, 1037.0] + - - [100, 512, 1, 500, 100, 100, 100, 500] + - [193, 1783.0] + - - [8, 1024, 1, 1024, 8, 8, 8, 1024] + - [173, 504.0] + - - [4, 500, 1, 10, 4, 4, 4, 10] + - [127, 5.0] + - - [128, 500, 1, 10, 128, 128, 128, 10] + - [190, 156.0] + - - [32, 1024, 1, 100, 32, 32, 32, 100] + - [186, 591.0] + - - [8, 500, 1, 2048, 8, 8, 8, 2048] + - [197, 321.0] + - - [16, 1024, 1, 1024, 16, 16, 16, 1024] + - [173, 1027.0] + - - [200, 100, 1, 10, 200, 200, 200, 10] + - [174, 51.0] + - - [512, 100, 1, 500, 512, 512, 512, 500] + - [169, 1866.0] + - - [4, 500, 1, 100, 4, 4, 4, 100] + - [162, 36.0] + - - [8, 100, 1, 2048, 8, 8, 8, 2048] + - [177, 68.0] + - - [512, 100, 1, 10, 512, 512, 512, 10] + - [131, 129.0] + - - [4, 512, 1, 1024, 4, 4, 4, 1024] + - [148, 142.0] + - - [32, 1024, 1, 2048, 32, 32, 32, 2048] + - [148, 2016.0] + - - [128, 100, 1, 512, 128, 128, 128, 512] + - [148, 705.0] + - - [32, 1024, 1, 500, 32, 32, 32, 500] + - [134, 1455.0] + - - [4, 1024, 1, 10, 4, 4, 4, 10] + - [127, 10.0] + - - [100, 512, 1, 10, 100, 100, 100, 10] + - [157, 122.0] + - - [8, 100, 1, 100, 8, 8, 8, 100] + - [169, 15.0] + - - [128, 512, 1, 500, 128, 128, 128, 500] + - [144, 2311.0] + - - [16, 100, 1, 2048, 16, 16, 16, 2048] + - [164, 138.0] + - - [2, 1024, 1, 10, 2, 2, 2, 10] + - [127, 5.0] + - - [4, 100, 1, 2048, 4, 4, 4, 2048] + - [138, 34.0] + - - [4, 512, 1, 2000, 4, 4, 4, 2000] + - [139, 159.0] + - - [1024, 29, 1, 1024, 1024, 1024, 1024, 1024] + - [174, 1631.0] + - - [1024, 1, 1, 21, 1024, 1024, 1024, 21] + - [127, 5.0] + - - [1024, 49, 1, 1024, 1024, 1024, 1024, 1024] + - [185, 2055.0] + - - [1024, 35, 1, 1024, 1024, 1024, 1024, 1024] + - [134, 1634.0] + - - [1024, 24, 1, 1024, 1024, 1024, 1024, 1024] + - [160, 1353.0] + - - [1024, 21, 1, 1024, 1024, 1024, 1024, 1024] + - [160, 1180.0] + - - [1024, 1, 1, 14, 1024, 1024, 1024, 14] + - [127, 3.0] + - - [1024, 91, 1, 1024, 1024, 1024, 1024, 1024] + - [130, 3389.0] + - - [1024, 14, 1, 1024, 1024, 1024, 1024, 1024] + - [152, 951.0] + - - [1024, 25, 1, 1024, 1024, 1024, 1024, 1024] + - [160, 1406.0] + - - [1024, 27, 1, 1024, 1024, 1024, 1024, 1024] + - [134, 1752.0] + - - [1024, 50, 1, 1024, 1024, 1024, 1024, 1024] + - [135, 2097.0] + - - [1024, 64, 1, 1024, 1024, 1024, 1024, 1024] + - [161, 2687.0] + - - [1024, 13, 1, 1024, 1024, 1024, 1024, 1024] + - [164, 906.0] + - - [1024, 63, 1, 1024, 1024, 1024, 1024, 1024] + - [198, 2621.0] + - - [1024, 86, 1, 1024, 1024, 1024, 1024, 1024] + - [156, 3209.0] + - - [1024, 1, 1, 13, 1024, 1024, 1024, 13] + - [180, 7.0] + - - [289, 192, 1, 1344, 289, 289, 289, 1344] + - [193, 2779.0] + - - [196, 128, 1, 800, 196, 196, 196, 800] + - [134, 1828.0] + - - [64, 512, 1, 1344, 64, 64, 64, 1344] + - [184, 1910.0] + - - [289, 224, 1, 1568, 289, 289, 289, 1568] + - [193, 2951.0] + - - [64, 256, 1, 1536, 64, 64, 64, 1536] + - [184, 1255.0] + - - [289, 160, 1, 1120, 289, 289, 289, 1120] + - [192, 2213.0] + - - [64, 256, 1, 1152, 64, 64, 64, 1152] + - [134, 1181.0] + - - [289, 224, 1, 1344, 289, 289, 289, 1344] + - [193, 2986.0] + - - [289, 192, 1, 896, 289, 289, 289, 896] + - [144, 2416.0] + - - [784, 16, 32, 192, 784, 784, 784, 192] + - [192, 2902.0] + - - [49, 128, 1, 1200, 49, 49, 49, 1200] + - [152, 535.0] + - - [289, 128, 1, 896, 289, 289, 289, 896] + - [134, 2311.0] + - - [1001, 32, 1, 1024, 1001, 1001, 1001, 1024] + - [148, 2025.0] + - - [64, 448, 1, 1152, 64, 64, 64, 1152] + - [134, 1903.0] + - - [1001, 32, 1, 2048, 1001, 1001, 1001, 2048] + - [134, 1974.0] + - - [289, 192, 1, 1120, 289, 289, 289, 1120] + - [193, 2577.0] + - - [64, 320, 1, 1728, 64, 64, 64, 1728] + - [174, 1614.0] + - - [289, 96, 1, 864, 289, 289, 289, 864] + - [197, 1978.0] + - - [196, 64, 1, 800, 196, 196, 196, 800] + - [197, 815.0] + - - [784, 32, 1, 400, 784, 784, 784, 400] + - [183, 1363.0] + - - [64, 320, 1, 2880, 64, 64, 64, 2880] + - [174, 1918.0] + - - [1001, 32, 1, 1536, 1001, 1001, 1001, 1536] + - [134, 2099.0] + - - [64, 384, 1, 1152, 64, 64, 64, 1152] + - [184, 1953.0] + - - [64, 192, 1, 1728, 64, 64, 64, 1728] + - [197, 985.0] + - - [1001, 64, 1, 1536, 1001, 1001, 1001, 1536] + - [149, 2828.0] + - - [1001, 64, 1, 2048, 1001, 1001, 1001, 2048] + - [149, 2891.0] + - - [1024, 64, 1, 4096, 1024, 1024, 1024, 4096] + - [149, 3244.0] + - - [64, 10, 448, 10, 64, 64, 64, 10] + - [130, 452.0] + - - [64, 18, 648, 18, 64, 64, 64, 18] + - [130, 1328.0] + - - [64, 18, 1720, 18, 64, 64, 64, 18] + - [170, 1743.0] + - - [64, 19, 1632, 19, 64, 64, 64, 19] + - [181, 1872.0] + - - [64, 21, 1472, 21, 64, 64, 64, 21] + - [181, 2133.0] + - - [64, 23, 64, 23, 64, 64, 64, 23] + - [127, 455.0] + - - [64, 26, 56, 26, 64, 64, 64, 26] + - [180, 531.0] + - - [1024, 1, 1, 2, 1024, 1024, 1024, 2] + - [127, 1.0] + - - [1024, 1, 1, 1024, 1024, 1024, 1024, 1024] + - [164, 68.0] + - - [64, 27, 56, 26, 64, 64, 64, 26] + - [155, 505.0] + - - [64, 17, 1, 17, 64, 64, 64, 17] + - [127, 4.0] + - - [64, 30, 1, 30, 64, 64, 64, 30] + - [136, 13.0] + - - [64, 31, 1, 30, 64, 64, 64, 30] + - [187, 14.0] + - - [64, 31, 1, 31, 64, 64, 64, 31] + - [152, 14.0] + - - [64, 14, 1, 14, 64, 64, 64, 14] + - [127, 3.0] + - - [64, 14, 1, 15, 64, 64, 64, 15] + - [147, 4.0] + - - [64, 15, 1, 15, 64, 64, 64, 15] + - [127, 3.0] + - - [64, 15, 1, 17, 64, 64, 64, 17] + - [154, 5.0] + - - [100, 512, 1, 2048, 100, 100, 100, 2048] + - [149, 2302.0] + - - [1024, 1, 1, 1600, 1024, 1024, 1024, 1600] + - [164, 77.0] + - - [1024, 1, 1, 200, 1024, 1024, 1024, 200] + - [148, 32.0] + - - [1, 200, 1, 1, 1, 1, 1, 1] + - [127, 0.05] + - - [1, 512, 1, 1, 1, 1, 1, 1] + - [178, 0.14] + - - [67, 512, 1, 2048, 67, 67, 67, 2048] + - [143, 1767.0] + - - [74, 512, 1, 2048, 74, 74, 74, 2048] + - [192, 1933.0] + - - [64, 3, 512, 3, 64, 64, 64, 3] + - [196, 73.0] + - - [64, 5, 512, 5, 64, 64, 64, 5] + - [130, 144.0] + - - [64, 9, 512, 9, 64, 64, 64, 9] + - [181, 408.0] + - - [64, 512, 1, 512, 64, 64, 64, 512] + - [192, 1517.0] + - - [25, 128, 120, 256, 25, 25, 25, 256] + - [168, 2641.0] + - - [25, 128, 139, 256, 25, 25, 25, 256] + - [168, 2732.0] + - - [25, 128, 160, 256, 25, 25, 25, 256] + - [142, 3034.0] + - - [25, 128, 18, 256, 25, 25, 25, 256] + - [142, 1415.0] + - - [25, 128, 19, 256, 25, 25, 25, 256] + - [142, 1474.0] + - - [9, 128, 120, 256, 9, 9, 9, 256] + - [147, 1111.0] + - - [9, 128, 139, 256, 9, 9, 9, 256] + - [196, 1204.0] + - - [9, 128, 160, 256, 9, 9, 9, 256] + - [173, 1252.0] + - - [9, 128, 18, 256, 9, 9, 9, 256] + - [141, 633.0] + - - [9, 128, 19, 256, 9, 9, 9, 256] + - [167, 641.0] + - - [1, 256, 1, 1152, 1, 1, 1, 1152] + - [148, 19.0] + - - [100, 512, 1, 2304, 100, 100, 100, 2304] + - [149, 2329.0] + - - [25, 256, 1, 1152, 25, 25, 25, 1152] + - [174, 465.0] + - - [9, 256, 1, 1152, 9, 9, 9, 1152] + - [197, 169.0] + - - [1024, 77, 1, 1024, 1024, 1024, 1024, 1024] + - [130, 2892.0] + - - [1024, 10, 1, 2, 1024, 1024, 1024, 2] + - [127, 5.0] + - - [1024, 10, 1, 1024, 1024, 1024, 1024, 1024] + - [164, 695.0] + - - [1024, 39, 1, 2, 1024, 1024, 1024, 2] + - [127, 48.0] + - - [1024, 39, 1, 1024, 1024, 1024, 1024, 1024] + - [174, 1829.0] + - - [1024, 40, 1, 2, 1024, 1024, 1024, 2] + - [149, 42.0] + - - [1024, 40, 1, 1024, 1024, 1024, 1024, 1024] + - [134, 1874.0] + - - [1024, 41, 1, 2, 1024, 1024, 1024, 2] + - [127, 49.0] + - - [1024, 41, 1, 1024, 1024, 1024, 1024, 1024] + - [184, 1928.0] + - - [1024, 5, 1, 2, 1024, 1024, 1024, 2] + - [129, 3.0] + - - [1024, 5, 1, 1024, 1024, 1024, 1024, 1024] + - [152, 339.0] + - - [1024, 8, 1, 2, 1024, 1024, 1024, 2] + - [127, 4.0] + - - [1024, 9, 1, 2, 1024, 1024, 1024, 2] + - [127, 12.0] + - - [1024, 9, 1, 1024, 1024, 1024, 1024, 1024] + - [188, 615.0] + - - [64, 4, 32768, 4, 64, 64, 64, 4] + - [181, 362.0] + - - [64, 4, 38400, 4, 64, 64, 64, 4] + - [181, 333.0] + - - [64, 14, 10880, 14, 64, 64, 64, 14] + - [156, 2238.0] + - - [64, 14, 10880, 15, 64, 64, 64, 15] + - [181, 2263.0] + - - [64, 15, 7680, 15, 64, 64, 64, 15] + - [144, 2491.0] + - - [64, 15, 10880, 15, 64, 64, 64, 15] + - [130, 2380.0] + - - [64, 15, 7680, 17, 64, 64, 64, 17] + - [155, 2617.0] + - - [64, 17, 6144, 17, 64, 64, 64, 17] + - [130, 2600.0] + - - [64, 17, 7680, 17, 64, 64, 64, 17] + - [156, 2623.0] + - - [64, 17, 6144, 21, 64, 64, 64, 21] + - [156, 2793.0] + - - [64, 21, 6144, 21, 64, 64, 64, 21] + - [181, 3415.0] + - - [64, 24, 4736, 24, 64, 64, 64, 24] + - [130, 3972.0] + - - [64, 24, 4736, 34, 64, 64, 64, 34] + - [156, 3835.0] + - - [64, 30, 2048, 30, 64, 64, 64, 30] + - [144, 4462.0] + - - [64, 31, 2048, 30, 64, 64, 64, 30] + - [156, 4714.0] + - - [64, 31, 2048, 31, 64, 64, 64, 31] + - [170, 4778.0] + - - [128, 128, 1, 64, 128, 128, 128, 64] + - [167, 468.0] + - - [64, 5, 1, 5, 64, 64, 64, 5] + - [127, 1.0] + - - [32, 33, 1, 33, 32, 32, 32, 33] + - [129, 17.0] + - - [64, 5, 960, 5, 64, 64, 64, 5] + - [156, 324.0] + - - [74, 960, 1, 2048, 74, 74, 74, 2048] + - [193, 2774.0] + - - [128, 27, 32768, 27, 128, 128, 128, 27] + - [137, 1925.0] + - - [1024, 16, 1, 1024, 1024, 1024, 1024, 1024] + - [174, 1273.0] + - - [1024, 16, 1, 2, 1024, 1024, 1024, 2] + - [129, 23.0] + - - [1024, 64, 1, 2, 1024, 1024, 1024, 2] + - [129, 83.0] + - - [1024, 80, 1, 2, 1024, 1024, 1024, 2] + - [131, 101.0] + - - [1024, 82, 1, 1024, 1024, 1024, 1024, 1024] + - [130, 3431.0] + - - [1024, 82, 1, 2, 1024, 1024, 1024, 2] + - [129, 41.0] + - - [1024, 12, 1, 1024, 1024, 1024, 1024, 1024] + - [201, 822.0] + - - [1024, 12, 1, 2, 1024, 1024, 1024, 2] + - [127, 6.0] + - - [64, 24, 6816, 24, 64, 64, 64, 24] + - [193, 2709.0] + - - [64, 26, 6272, 26, 64, 64, 64, 26] + - [135, 2825.0] + - - [196, 256, 1, 2304, 196, 196, 196, 2304] + - [155, 2409.0] + - - [850, 3, 2, 256, 850, 850, 850, 256] + - [134, 263.0] + - - [850, 12, 2, 256, 850, 850, 850, 256] + - [134, 1049.0] + - - [805, 12, 2, 256, 805, 805, 805, 256] + - [174, 665.0] + - - [805, 3, 2, 256, 805, 805, 805, 256] + - [134, 164.0] + - - [768, 3, 2, 256, 768, 768, 768, 256] + - [160, 159.0] + - - [768, 12, 2, 256, 768, 768, 768, 256] + - [148, 655.0] + - - [864, 12, 2, 256, 864, 864, 864, 256] + - [160, 715.0] + - - [864, 3, 2, 256, 864, 864, 864, 256] + - [148, 178.0] + - - [247, 3, 2, 256, 247, 247, 247, 256] + - [134, 53.0] + - - [216, 3, 2, 256, 216, 216, 216, 256] + - [133, 46.0] + - - [950, 3, 2, 256, 950, 950, 950, 256] + - [134, 193.0] + - - [187, 12, 2, 256, 187, 187, 187, 256] + - [196, 167.0] + - - [176, 12, 2, 256, 176, 176, 176, 256] + - [197, 153.0] + - - [247, 12, 2, 256, 247, 247, 247, 256] + - [197, 213.0] + - - [187, 3, 2, 256, 187, 187, 187, 256] + - [134, 40.0] + - - [228, 12, 2, 256, 228, 228, 228, 256] + - [202, 219.0] + - - [221, 12, 2, 256, 221, 221, 221, 256] + - [162, 270.0] + - - [176, 3, 2, 256, 176, 176, 176, 256] + - [148, 57.0] + - - [950, 12, 2, 256, 950, 950, 950, 256] + - [160, 762.0] + - - [192, 12, 2, 256, 192, 192, 192, 256] + - [202, 182.0] + - - [228, 3, 2, 256, 228, 228, 228, 256] + - [148, 52.0] + - - [221, 3, 2, 256, 221, 221, 221, 256] + - [173, 49.0] + - - [192, 3, 2, 256, 192, 192, 192, 256] + - [196, 43.0] + - - [216, 12, 2, 256, 216, 216, 216, 256] + - [174, 194.0] + - - [2, 6, 1, 1024, 2, 2, 2, 1024] + - [127, 1.0] + - - [1024, 20, 1, 2, 1024, 1024, 1024, 2] + - [127, 10.0] +- null +- null +- DeviceEfficiency +... diff --git a/library/src/blas3/Tensile/Logic/asm_full/navi22_Cijk_Alik_Bjlk_HB.yaml b/library/src/blas3/Tensile/Logic/asm_full/navi22_Cijk_Alik_Bjlk_HB.yaml new file mode 100644 index 000000000..735dbb8e9 --- /dev/null +++ b/library/src/blas3/Tensile/Logic/asm_full/navi22_Cijk_Alik_Bjlk_HB.yaml @@ -0,0 +1,21036 @@ +--- +- {MinimumRequiredVersion: 4.28.0} +- navi22 +- gfx1031 +- [Device 73df] +- AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] +- - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 256 + LSPA: 128 + LSPB: 8 + LVCA: 2 + LVCB: 32 + LVPA: 32 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 256 + MacroTileA: 128 + MacroTileB: 256 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 0 + SolutionNameMin: Cijk_Alik_Bjlk_HB_MT128x256x8_SN_SU0_SUM0_TT8_16_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 64 + LSPA: 64 + LSPB: 16 + LVCA: 2 + LVCB: 8 + LVPA: 8 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 1 + SolutionNameMin: Cijk_Alik_Bjlk_HB_MT128x64x16_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 64 + LSPB: 8 + LVCA: 2 + LVCB: 16 + LVPA: 8 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 2 + SolutionNameMin: Cijk_Alik_Bjlk_HB_MT128x128x16_SN_SU0_SUM0_TT8_16_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 128 + LSPB: 16 + LVCA: 2 + LVCB: 16 + LVPA: 16 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 3 + SolutionNameMin: Cijk_Alik_Bjlk_HB_MT128x128x16_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 256 + LSPA: 128 + LSPB: 8 + LVCA: 2 + LVCB: 32 + LVPA: 16 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 256 + MacroTileA: 128 + MacroTileB: 256 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 4 + SolutionNameMin: Cijk_Alik_Bjlk_HB_MT128x256x16_SN_SU0_SUM0_TT8_16_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 256 + LSPA: 128 + LSPB: 8 + LVCA: 2 + LVCB: 32 + LVPA: 32 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 256 + MacroTileA: 128 + MacroTileB: 256 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 5 + SolutionNameMin: Cijk_Alik_Bjlk_HB_MT128x256x8_SN_SU32_SUM3_TT8_16_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 64 + LSPA: 64 + LSPB: 16 + LVCA: 2 + LVCB: 8 + LVPA: 8 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 6 + SolutionNameMin: Cijk_Alik_Bjlk_HB_MT128x64x16_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 64 + LSPB: 8 + LVCA: 2 + LVCB: 16 + LVPA: 8 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 7 + SolutionNameMin: Cijk_Alik_Bjlk_HB_MT128x128x16_SN_SU32_SUM3_TT8_16_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 128 + LSPB: 16 + LVCA: 2 + LVCB: 16 + LVPA: 16 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 8 + SolutionNameMin: Cijk_Alik_Bjlk_HB_MT128x128x16_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 256 + LSPA: 128 + LSPB: 8 + LVCA: 2 + LVCB: 32 + LVPA: 16 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 256 + MacroTileA: 128 + MacroTileB: 256 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 9 + SolutionNameMin: Cijk_Alik_Bjlk_HB_MT128x256x16_SN_SU32_SUM3_TT8_16_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 128 + LSPA: 128 + LSPB: 8 + LVCA: 1 + LVCB: 16 + LVPA: 16 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 10 + SolutionNameMin: Cijk_Alik_Bjlk_HB_MT128x128x8_SN_SU0_SUM0_TT8_16_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 256 + LSPA: 128 + LSPB: 8 + LVCA: 2 + LVCB: 32 + LVPA: 32 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 256 + MacroTileA: 128 + MacroTileB: 256 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 11 + SolutionNameMin: Cijk_Alik_Bjlk_HB_MT128x256x8_SN_SU0_SUM0_TT8_16_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 64 + LSPA: 64 + LSPB: 16 + LVCA: 2 + LVCB: 8 + LVPA: 8 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 12 + SolutionNameMin: Cijk_Alik_Bjlk_HB_MT128x64x16_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 64 + LSPB: 8 + LVCA: 2 + LVCB: 16 + LVPA: 8 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 13 + SolutionNameMin: Cijk_Alik_Bjlk_HB_MT128x128x16_SN_SU0_SUM0_TT8_16_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 128 + LSPB: 16 + LVCA: 2 + LVCB: 16 + LVPA: 16 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 14 + SolutionNameMin: Cijk_Alik_Bjlk_HB_MT128x128x16_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 256 + LSPA: 128 + LSPB: 8 + LVCA: 2 + LVCB: 32 + LVPA: 16 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 256 + MacroTileA: 128 + MacroTileB: 256 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 15 + SolutionNameMin: Cijk_Alik_Bjlk_HB_MT128x256x16_SN_SU0_SUM0_TT8_16_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 64 + LSPB: 8 + LVCA: 2 + LVCB: 16 + LVPA: 8 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 16 + SolutionNameMin: Cijk_Alik_Bjlk_HB_MT128x128x16_SN_SU32_SUM3_TT8_16_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 128 + LSPB: 16 + LVCA: 2 + LVCB: 16 + LVPA: 16 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 17 + SolutionNameMin: Cijk_Alik_Bjlk_HB_MT128x128x16_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 256 + LSPA: 128 + LSPB: 8 + LVCA: 2 + LVCB: 32 + LVPA: 16 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 256 + MacroTileA: 128 + MacroTileB: 256 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 18 + SolutionNameMin: Cijk_Alik_Bjlk_HB_MT128x256x16_SN_SU32_SUM3_TT8_16_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 256 + LSPA: 128 + LSPB: 8 + LVCA: 2 + LVCB: 32 + LVPA: 32 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 256 + MacroTileA: 128 + MacroTileB: 256 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 19 + SolutionNameMin: Cijk_Alik_Bjlk_HB_MT128x256x8_SN_SU0_SUM0_TT8_16_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 64 + LSPA: 64 + LSPB: 16 + LVCA: 2 + LVCB: 8 + LVPA: 8 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 20 + SolutionNameMin: Cijk_Alik_Bjlk_HB_MT128x64x16_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 64 + LSPB: 8 + LVCA: 2 + LVCB: 16 + LVPA: 8 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 21 + SolutionNameMin: Cijk_Alik_Bjlk_HB_MT128x128x16_SN_SU0_SUM0_TT8_16_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 64 + LSPB: 8 + LVCA: 2 + LVCB: 16 + LVPA: 8 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 256 + MacroTile1: 128 + MacroTileA: 256 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 256 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 22 + SolutionNameMin: Cijk_Alik_Bjlk_HB_MT256x128x16_SN_SU0_SUM0_TT16_16_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [16, 16] + ThreadTile0: 16 + ThreadTile1: 16 + ThreadTileA: 16 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 128 + LSPB: 16 + LVCA: 2 + LVCB: 16 + LVPA: 16 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 23 + SolutionNameMin: Cijk_Alik_Bjlk_HB_MT128x128x16_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 256 + LSPA: 128 + LSPB: 8 + LVCA: 2 + LVCB: 32 + LVPA: 16 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 256 + MacroTileA: 128 + MacroTileB: 256 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 24 + SolutionNameMin: Cijk_Alik_Bjlk_HB_MT128x256x16_SN_SU0_SUM0_TT8_16_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 256 + LSPA: 128 + LSPB: 8 + LVCA: 2 + LVCB: 32 + LVPA: 32 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 256 + MacroTileA: 128 + MacroTileB: 256 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 25 + SolutionNameMin: Cijk_Alik_Bjlk_HB_MT128x256x8_SN_SU32_SUM3_TT8_16_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 64 + LSPA: 64 + LSPB: 16 + LVCA: 2 + LVCB: 8 + LVPA: 8 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 26 + SolutionNameMin: Cijk_Alik_Bjlk_HB_MT128x64x16_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 64 + LSPB: 8 + LVCA: 2 + LVCB: 16 + LVPA: 8 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 27 + SolutionNameMin: Cijk_Alik_Bjlk_HB_MT128x128x16_SN_SU32_SUM3_TT8_16_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 256 + LSPA: 128 + LSPB: 8 + LVCA: 2 + LVCB: 32 + LVPA: 16 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 256 + MacroTileA: 128 + MacroTileB: 256 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 28 + SolutionNameMin: Cijk_Alik_Bjlk_HB_MT128x256x16_SN_SU32_SUM3_TT8_16_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 64 + LSPA: 32 + LSPB: 8 + LVCA: 2 + LVCB: 8 + LVPA: 4 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 29 + SolutionNameMin: Cijk_Alik_Bjlk_HB_MT64x64x16_SN_SU0_SUM0_TT8_8_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 64 + LSPA: 64 + LSPB: 16 + LVCA: 2 + LVCB: 8 + LVPA: 8 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 30 + SolutionNameMin: Cijk_Alik_Bjlk_HB_MT128x64x16_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 128 + LSPB: 16 + LVCA: 2 + LVCB: 16 + LVPA: 16 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 31 + SolutionNameMin: Cijk_Alik_Bjlk_HB_MT128x128x16_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 64 + LSPA: 64 + LSPB: 8 + LVCA: 1 + LVCB: 8 + LVPA: 8 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 32 + SolutionNameMin: Cijk_Alik_Bjlk_HB_MT64x64x8_SN_SU32_SUM3_TT8_8_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 64 + LSPA: 32 + LSPB: 8 + LVCA: 2 + LVCB: 8 + LVPA: 4 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 33 + SolutionNameMin: Cijk_Alik_Bjlk_HB_MT64x64x16_SN_SU32_SUM3_TT8_8_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 64 + LSPA: 64 + LSPB: 16 + LVCA: 2 + LVCB: 8 + LVPA: 8 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 34 + SolutionNameMin: Cijk_Alik_Bjlk_HB_MT128x64x16_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 128 + LSPB: 16 + LVCA: 2 + LVCB: 16 + LVPA: 16 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 35 + SolutionNameMin: Cijk_Alik_Bjlk_HB_MT128x128x16_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 64 + LSPA: 32 + LSPB: 8 + LVCA: 2 + LVCB: 8 + LVPA: 4 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 36 + SolutionNameMin: Cijk_Alik_Bjlk_HB_MT64x64x16_SN_SU0_SUM0_TT8_8_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 64 + LSPA: 64 + LSPB: 16 + LVCA: 2 + LVCB: 8 + LVPA: 8 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 37 + SolutionNameMin: Cijk_Alik_Bjlk_HB_MT128x64x16_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 128 + LSPB: 16 + LVCA: 2 + LVCB: 16 + LVPA: 16 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 38 + SolutionNameMin: Cijk_Alik_Bjlk_HB_MT128x128x16_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 64 + LSPA: 64 + LSPB: 8 + LVCA: 1 + LVCB: 8 + LVPA: 8 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 39 + SolutionNameMin: Cijk_Alik_Bjlk_HB_MT64x64x8_SN_SU32_SUM3_TT8_8_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 64 + LSPA: 32 + LSPB: 8 + LVCA: 2 + LVCB: 8 + LVPA: 4 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 40 + SolutionNameMin: Cijk_Alik_Bjlk_HB_MT64x64x16_SN_SU32_SUM3_TT8_8_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 64 + LSPA: 64 + LSPB: 16 + LVCA: 2 + LVCB: 8 + LVPA: 8 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 41 + SolutionNameMin: Cijk_Alik_Bjlk_HB_MT128x64x16_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 128 + LSPB: 16 + LVCA: 2 + LVCB: 16 + LVPA: 16 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 42 + SolutionNameMin: Cijk_Alik_Bjlk_HB_MT128x128x16_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 64 + LSPA: 64 + LSPB: 8 + LVCA: 1 + LVCB: 8 + LVPA: 8 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 43 + SolutionNameMin: Cijk_Alik_Bjlk_HB_MT64x64x8_SN_SU0_SUM0_TT8_8_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 64 + LSPA: 32 + LSPB: 8 + LVCA: 2 + LVCB: 8 + LVPA: 4 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 44 + SolutionNameMin: Cijk_Alik_Bjlk_HB_MT64x64x16_SN_SU0_SUM0_TT8_8_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 64 + LSPA: 64 + LSPB: 16 + LVCA: 2 + LVCB: 8 + LVPA: 8 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 45 + SolutionNameMin: Cijk_Alik_Bjlk_HB_MT128x64x16_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 128 + LSPB: 16 + LVCA: 2 + LVCB: 16 + LVPA: 16 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 46 + SolutionNameMin: Cijk_Alik_Bjlk_HB_MT128x128x16_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 64 + LSPA: 64 + LSPB: 8 + LVCA: 1 + LVCB: 8 + LVPA: 8 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 47 + SolutionNameMin: Cijk_Alik_Bjlk_HB_MT64x64x8_SN_SU32_SUM3_TT8_8_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 64 + LSPA: 32 + LSPB: 8 + LVCA: 2 + LVCB: 8 + LVPA: 4 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 48 + SolutionNameMin: Cijk_Alik_Bjlk_HB_MT64x64x16_SN_SU32_SUM3_TT8_8_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 64 + LSPA: 64 + LSPB: 16 + LVCA: 2 + LVCB: 8 + LVPA: 8 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 49 + SolutionNameMin: Cijk_Alik_Bjlk_HB_MT128x64x16_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 128 + LSPB: 16 + LVCA: 2 + LVCB: 16 + LVPA: 16 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 50 + SolutionNameMin: Cijk_Alik_Bjlk_HB_MT128x128x16_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 16 + LSPA: 16 + LSPB: 8 + LVCA: 4 + LVCB: 8 + LVPA: 8 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 819 + LdsNumElementsAlignedA: 128 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 256 + LdsOffsetB: 128 + LdsOffsetB_Blk: 384 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 51 + SolutionNameMin: Cijk_Alik_Bjlk_HB_MT16x16x8_SN_SU0_SUM0_TT2_2_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 32 + LSPA: 32 + LSPB: 8 + LVCA: 8 + LVCB: 32 + LVPA: 32 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 52 + SolutionNameMin: Cijk_Alik_Bjlk_HB_MT32x32x8_SN_SU0_SUM0_TT2_2_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 64 + LSPA: 64 + LSPB: 8 + LVCA: 4 + LVCB: 32 + LVPA: 32 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 53 + SolutionNameMin: Cijk_Alik_Bjlk_HB_MT64x64x8_SN_SU0_SUM0_TT4_4_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 54 + SolutionNameMin: Cijk_Alik_Bjlk_HB_MT16x16x16_SN_SU0_SUM0_TT2_2_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 32 + LSPA: 8 + LSPB: 4 + LVCA: 8 + LVCB: 16 + LVPA: 4 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 55 + SolutionNameMin: Cijk_Alik_Bjlk_HB_MT32x32x16_SN_SU0_SUM0_TT4_4_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 56 + SolutionNameMin: Cijk_Alik_Bjlk_HB_MT32x16x16_SN_SU0_SUM0_TT2_2_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 32 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 16 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 57 + SolutionNameMin: Cijk_Alik_Bjlk_HB_MT32x32x16_SN_SU0_SUM0_TT2_2_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 64 + LSPA: 32 + LSPB: 8 + LVCA: 8 + LVCB: 32 + LVPA: 16 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 58 + SolutionNameMin: Cijk_Alik_Bjlk_HB_MT64x64x16_SN_SU0_SUM0_TT4_4_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 4 + LSPB: 8 + LVCA: 16 + LVCB: 8 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 59 + SolutionNameMin: Cijk_Alik_Bjlk_HB_MT16x16x32_SN_SU0_SUM0_TT2_2_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 32 + LSPA: 32 + LSPB: 8 + LVCA: 8 + LVCB: 32 + LVPA: 32 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 60 + SolutionNameMin: Cijk_Alik_Bjlk_HB_MT32x32x8_SN_SU32_SUM3_TT2_2_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 64 + LSPA: 64 + LSPB: 8 + LVCA: 4 + LVCB: 32 + LVPA: 32 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 61 + SolutionNameMin: Cijk_Alik_Bjlk_HB_MT64x64x8_SN_SU32_SUM3_TT4_4_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 62 + SolutionNameMin: Cijk_Alik_Bjlk_HB_MT16x16x16_SN_SU32_SUM3_TT2_2_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 32 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 16 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 63 + SolutionNameMin: Cijk_Alik_Bjlk_HB_MT32x32x16_SN_SU32_SUM3_TT2_2_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 64 + LSPA: 32 + LSPB: 8 + LVCA: 8 + LVCB: 32 + LVPA: 16 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 64 + SolutionNameMin: Cijk_Alik_Bjlk_HB_MT64x64x16_SN_SU32_SUM3_TT4_4_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 64 + LSPA: 64 + LSPB: 8 + LVCA: 4 + LVCB: 32 + LVPA: 32 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 65 + SolutionNameMin: Cijk_Alik_Bjlk_HB_MT64x64x8_SN_SU0_SUM0_TT4_4_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 66 + SolutionNameMin: Cijk_Alik_Bjlk_HB_MT16x16x16_SN_SU0_SUM0_TT2_2_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 32 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 16 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 67 + SolutionNameMin: Cijk_Alik_Bjlk_HB_MT32x32x16_SN_SU0_SUM0_TT2_2_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 64 + LSPA: 32 + LSPB: 8 + LVCA: 8 + LVCB: 32 + LVPA: 16 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 68 + SolutionNameMin: Cijk_Alik_Bjlk_HB_MT64x64x16_SN_SU0_SUM0_TT4_4_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 4 + LSPB: 8 + LVCA: 16 + LVCB: 8 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 69 + SolutionNameMin: Cijk_Alik_Bjlk_HB_MT16x16x32_SN_SU0_SUM0_TT2_2_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 16 + LSPA: 16 + LSPB: 8 + LVCA: 4 + LVCB: 8 + LVPA: 8 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 819 + LdsNumElementsAlignedA: 128 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 256 + LdsOffsetB: 128 + LdsOffsetB_Blk: 384 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 70 + SolutionNameMin: Cijk_Alik_Bjlk_HB_MT16x16x8_SN_SU32_SUM3_TT2_2_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 32 + LSPA: 16 + LSPB: 4 + LVCA: 4 + LVCB: 16 + LVPA: 8 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 71 + SolutionNameMin: Cijk_Alik_Bjlk_HB_MT32x32x8_SN_SU32_SUM3_TT4_4_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 32 + LSPA: 32 + LSPB: 8 + LVCA: 4 + LVCB: 16 + LVPA: 16 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 72 + SolutionNameMin: Cijk_Alik_Bjlk_HB_MT64x32x8_SN_SU32_SUM3_TT4_4_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 32 + LSPA: 32 + LSPB: 8 + LVCA: 8 + LVCB: 32 + LVPA: 32 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 73 + SolutionNameMin: Cijk_Alik_Bjlk_HB_MT32x32x8_SN_SU32_SUM3_TT2_2_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 64 + LSPA: 64 + LSPB: 8 + LVCA: 4 + LVCB: 32 + LVPA: 32 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 74 + SolutionNameMin: Cijk_Alik_Bjlk_HB_MT64x64x8_SN_SU32_SUM3_TT4_4_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 75 + SolutionNameMin: Cijk_Alik_Bjlk_HB_MT16x16x16_SN_SU32_SUM3_TT2_2_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 32 + LSPA: 8 + LSPB: 4 + LVCA: 8 + LVCB: 16 + LVPA: 4 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 76 + SolutionNameMin: Cijk_Alik_Bjlk_HB_MT32x32x16_SN_SU32_SUM3_TT4_4_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 64 + LSPA: 32 + LSPB: 8 + LVCA: 8 + LVCB: 32 + LVPA: 16 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 77 + SolutionNameMin: Cijk_Alik_Bjlk_HB_MT64x64x16_SN_SU32_SUM3_TT4_4_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 4 + LSPB: 8 + LVCA: 16 + LVCB: 8 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 78 + SolutionNameMin: Cijk_Alik_Bjlk_HB_MT16x16x32_SN_SU32_SUM3_TT2_2_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 64 + LSPA: 64 + LSPB: 8 + LVCA: 4 + LVCB: 32 + LVPA: 32 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 79 + SolutionNameMin: Cijk_Alik_Bjlk_HB_MT64x64x8_SN_SU0_SUM0_TT4_4_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 80 + SolutionNameMin: Cijk_Alik_Bjlk_HB_MT16x16x16_SN_SU0_SUM0_TT2_2_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 32 + LSPA: 8 + LSPB: 4 + LVCA: 8 + LVCB: 16 + LVPA: 4 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 81 + SolutionNameMin: Cijk_Alik_Bjlk_HB_MT32x32x16_SN_SU0_SUM0_TT4_4_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 64 + LSPA: 32 + LSPB: 8 + LVCA: 8 + LVCB: 32 + LVPA: 16 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 82 + SolutionNameMin: Cijk_Alik_Bjlk_HB_MT64x64x16_SN_SU0_SUM0_TT4_4_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 4 + LSPB: 8 + LVCA: 16 + LVCB: 8 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 83 + SolutionNameMin: Cijk_Alik_Bjlk_HB_MT16x16x32_SN_SU0_SUM0_TT2_2_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 32 + LSPA: 32 + LSPB: 8 + LVCA: 4 + LVCB: 16 + LVPA: 16 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 84 + SolutionNameMin: Cijk_Alik_Bjlk_HB_MT64x32x8_SN_SU32_SUM3_TT4_4_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 64 + LSPA: 64 + LSPB: 8 + LVCA: 4 + LVCB: 32 + LVPA: 32 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 85 + SolutionNameMin: Cijk_Alik_Bjlk_HB_MT64x64x8_SN_SU32_SUM3_TT4_4_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 86 + SolutionNameMin: Cijk_Alik_Bjlk_HB_MT16x16x16_SN_SU32_SUM3_TT2_2_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 4 + LSPB: 8 + LVCA: 16 + LVCB: 8 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 87 + SolutionNameMin: Cijk_Alik_Bjlk_HB_MT16x16x32_SN_SU32_SUM3_TT2_2_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 +- [2, 3, 0, 1] +- - - [2944, 4288, 1, 1280, 2944, 2944, 1280, 4288] + - [24, 24104.0] + - - [2368, 5888, 1, 256, 2368, 2368, 256, 5888] + - [9, 23351.0] + - - [1024, 5056, 1, 3328, 1024, 1024, 3328, 5056] + - [4, 24415.0] + - - [5888, 1024, 1, 1280, 5888, 5888, 1280, 1024] + - [21, 23771.0] + - - [5888, 1856, 1, 3328, 5888, 5888, 3328, 1856] + - [13, 23932.0] + - - [5056, 704, 1, 256, 5056, 5056, 256, 704] + - [13, 17700.0] + - - [5888, 2944, 1, 3328, 5888, 5888, 3328, 2944] + - [21, 25197.0] + - - [1856, 4288, 1, 256, 1856, 1856, 256, 4288] + - [16, 21633.0] + - - [5056, 5056, 1, 3328, 5056, 5056, 3328, 5056] + - [15, 24867.0] + - - [1408, 5888, 1, 1280, 1408, 1408, 1280, 5888] + - [24, 24264.0] + - - [448, 3584, 1, 3328, 448, 448, 3328, 3584] + - [4, 19601.0] + - - [5888, 1408, 1, 1280, 5888, 5888, 1280, 1408] + - [21, 24136.0] + - - [3584, 1856, 1, 3328, 3584, 3584, 3328, 1856] + - [2, 23946.0] + - - [5056, 6784, 1, 1280, 5056, 5056, 1280, 6784] + - [13, 24915.0] + - - [5056, 5056, 1, 1280, 5056, 5056, 1280, 5056] + - [24, 24752.0] + - - [448, 5056, 1, 256, 448, 448, 256, 5056] + - [16, 17320.0] + - - [6784, 448, 1, 256, 6784, 6784, 256, 448] + - [20, 18995.0] + - - [5888, 704, 1, 1280, 5888, 5888, 1280, 704] + - [24, 21336.0] + - - [3584, 1024, 1, 256, 3584, 3584, 256, 1024] + - [28, 20604.0] + - - [6784, 4288, 1, 3328, 6784, 6784, 3328, 4288] + - [13, 24735.0] + - - [1856, 2368, 1, 3328, 1856, 1856, 3328, 2368] + - [2, 21911.0] + - - [5888, 2944, 1, 1280, 5888, 5888, 1280, 2944] + - [13, 25103.0] + - - [5888, 1024, 1, 256, 5888, 5888, 256, 1024] + - [16, 22626.0] + - - [1408, 2944, 1, 256, 1408, 1408, 256, 2944] + - [7, 22071.0] + - - [6784, 5056, 1, 3328, 6784, 6784, 3328, 5056] + - [24, 25210.0] + - - [5056, 5056, 1, 256, 5056, 5056, 256, 5056] + - [7, 23431.0] + - - [1024, 3584, 1, 1280, 1024, 1024, 1280, 3584] + - [18, 22837.0] + - - [2368, 2944, 1, 1280, 2368, 2368, 1280, 2944] + - [7, 23861.0] + - - [1408, 4288, 1, 1280, 1408, 1408, 1280, 4288] + - [21, 23677.0] + - - [3584, 4288, 1, 1280, 3584, 3584, 1280, 4288] + - [4, 24511.0] + - - [2368, 704, 1, 1280, 2368, 2368, 1280, 704] + - [11, 19466.0] + - - [5056, 4288, 1, 3328, 5056, 5056, 3328, 4288] + - [18, 24746.0] + - - [3584, 2368, 1, 3328, 3584, 3584, 3328, 2368] + - [2, 23918.0] + - - [6784, 448, 1, 1280, 6784, 6784, 1280, 448] + - [20, 19600.0] + - - [1024, 1408, 1, 3328, 1024, 1024, 3328, 1408] + - [21, 20016.0] + - - [4288, 2944, 1, 256, 4288, 4288, 256, 2944] + - [7, 22771.0] + - - [5056, 2368, 1, 1280, 5056, 5056, 1280, 2368] + - [21, 23994.0] + - - [448, 3584, 1, 1280, 448, 448, 1280, 3584] + - [0, 19086.0] + - - [6784, 5888, 1, 256, 6784, 6784, 256, 5888] + - [15, 24985.0] + - - [1024, 1408, 1, 256, 1024, 1024, 256, 1408] + - [6, 18165.0] + - - [2368, 2368, 1, 3328, 2368, 2368, 3328, 2368] + - [2, 22411.0] + - - [5056, 704, 1, 3328, 5056, 5056, 3328, 704] + - [4, 22112.0] + - - [1408, 1856, 1, 256, 1408, 1408, 256, 1856] + - [20, 17945.0] + - - [5888, 1856, 1, 256, 5888, 5888, 256, 1856] + - [27, 21826.0] + - - [704, 5888, 1, 256, 704, 704, 256, 5888] + - [16, 18889.0] + - - [4288, 6784, 1, 3328, 4288, 4288, 3328, 6784] + - [21, 24775.0] + - - [3584, 704, 1, 3328, 3584, 3584, 3328, 704] + - [2, 20357.0] + - - [1408, 1408, 1, 256, 1408, 1408, 256, 1408] + - [7, 17795.0] + - - [448, 4288, 1, 256, 448, 448, 256, 4288] + - [14, 15563.0] + - - [704, 2368, 1, 1280, 704, 704, 1280, 2368] + - [4, 18834.0] + - - [1856, 2368, 1, 1280, 1856, 1856, 1280, 2368] + - [21, 21380.0] + - - [1408, 1024, 1, 1280, 1408, 1408, 1280, 1024] + - [27, 19144.0] + - - [6784, 704, 1, 256, 6784, 6784, 256, 704] + - [13, 18938.0] + - - [1408, 3584, 1, 256, 1408, 1408, 256, 3584] + - [19, 21459.0] + - - [3584, 4288, 1, 3328, 3584, 3584, 3328, 4288] + - [15, 24803.0] + - - [5888, 1856, 1, 1280, 5888, 5888, 1280, 1856] + - [16, 23742.0] + - - [5056, 1024, 1, 3328, 5056, 5056, 3328, 1024] + - [4, 24591.0] + - - [2368, 3584, 1, 1280, 2368, 2368, 1280, 3584] + - [2, 23710.0] + - - [2944, 3584, 1, 3328, 2944, 2944, 3328, 3584] + - [21, 24577.0] + - - [6784, 2944, 1, 256, 6784, 6784, 256, 2944] + - [7, 24478.0] + - - [1024, 2368, 1, 256, 1024, 1024, 256, 2368] + - [1, 18949.0] + - - [4288, 2368, 1, 3328, 4288, 4288, 3328, 2368] + - [21, 23624.0] + - - [1856, 2368, 1, 256, 1856, 1856, 256, 2368] + - [27, 19788.0] + - - [3584, 6784, 1, 3328, 3584, 3584, 3328, 6784] + - [21, 25081.0] + - - [6784, 1856, 1, 3328, 6784, 6784, 3328, 1856] + - [21, 24154.0] + - - [5056, 4288, 1, 1280, 5056, 5056, 1280, 4288] + - [24, 24609.0] + - - [1408, 5056, 1, 1280, 1408, 1408, 1280, 5056] + - [15, 24393.0] + - - [6784, 5888, 1, 3328, 6784, 6784, 3328, 5888] + - [15, 25547.0] + - - [2368, 5056, 1, 1280, 2368, 2368, 1280, 5056] + - [15, 24035.0] + - - [1024, 5056, 1, 1280, 1024, 1024, 1280, 5056] + - [16, 23904.0] + - - [4288, 1024, 1, 256, 4288, 4288, 256, 1024] + - [5, 21759.0] + - - [2368, 1408, 1, 256, 2368, 2368, 256, 1408] + - [7, 20381.0] + - - [5888, 448, 1, 1280, 5888, 5888, 1280, 448] + - [20, 19243.0] + - - [704, 5888, 1, 3328, 704, 704, 3328, 5888] + - [4, 22482.0] + - - [1024, 6784, 1, 1280, 1024, 1024, 1280, 6784] + - [16, 23865.0] + - - [3584, 2944, 1, 1280, 3584, 3584, 1280, 2944] + - [16, 24404.0] + - - [2368, 1024, 1, 3328, 2368, 2368, 3328, 1024] + - [4, 22470.0] + - - [1408, 5056, 1, 3328, 1408, 1408, 3328, 5056] + - [18, 24759.0] + - - [1856, 1856, 1, 3328, 1856, 1856, 3328, 1856] + - [15, 21427.0] + - - [2368, 2368, 1, 256, 2368, 2368, 256, 2368] + - [2, 19773.0] + - - [4288, 4288, 1, 1280, 4288, 4288, 1280, 4288] + - [15, 24435.0] + - - [704, 6784, 1, 3328, 704, 704, 3328, 6784] + - [2, 22459.0] + - - [5888, 5888, 1, 3328, 5888, 5888, 3328, 5888] + - [24, 25513.0] + - - [5056, 1024, 1, 1280, 5056, 5056, 1280, 1024] + - [0, 23994.0] + - - [448, 5888, 1, 3328, 448, 448, 3328, 5888] + - [0, 19716.0] + - - [5056, 5888, 1, 1280, 5056, 5056, 1280, 5888] + - [24, 25101.0] + - - [448, 6784, 1, 256, 448, 448, 256, 6784] + - [16, 17985.0] + - - [3584, 5888, 1, 256, 3584, 3584, 256, 5888] + - [2, 24326.0] + - - [2944, 3584, 1, 256, 2944, 2944, 256, 3584] + - [7, 23529.0] + - - [6784, 2944, 1, 3328, 6784, 6784, 3328, 2944] + - [21, 25253.0] + - - [2944, 5056, 1, 3328, 2944, 2944, 3328, 5056] + - [24, 25084.0] + - - [6784, 2368, 1, 1280, 6784, 6784, 1280, 2368] + - [13, 24147.0] + - - [4288, 5888, 1, 1280, 4288, 4288, 1280, 5888] + - [2, 24612.0] + - - [4288, 4288, 1, 256, 4288, 4288, 256, 4288] + - [7, 23414.0] + - - [4288, 1856, 1, 1280, 4288, 4288, 1280, 1856] + - [2, 23142.0] + - - [1856, 2944, 1, 3328, 1856, 1856, 3328, 2944] + - [4, 23138.0] + - - [256, 6784, 1, 3328, 256, 256, 3328, 6784] + - [9, 20860.0] + - - [5056, 1024, 1, 256, 5056, 5056, 256, 1024] + - [5, 20761.0] + - - [1408, 1408, 1, 1280, 1408, 1408, 1280, 1408] + - [2, 19794.0] + - - [5056, 1856, 1, 3328, 5056, 5056, 3328, 1856] + - [13, 23926.0] + - - [1856, 1408, 1, 256, 1856, 1856, 256, 1408] + - [23, 17171.0] + - - [5056, 256, 1, 3328, 5056, 5056, 3328, 256] + - [5, 22099.0] + - - [5056, 3584, 1, 256, 5056, 5056, 256, 3584] + - [25, 23770.0] + - - [1856, 1024, 1, 1280, 1856, 1856, 1280, 1024] + - [5, 21976.0] + - - [1856, 1856, 1, 1280, 1856, 1856, 1280, 1856] + - [15, 20779.0] + - - [6784, 6784, 1, 1280, 6784, 6784, 1280, 6784] + - [21, 25271.0] + - - [1856, 1024, 1, 3328, 1856, 1856, 3328, 1024] + - [0, 22727.0] + - - [6784, 1024, 1, 256, 6784, 6784, 256, 1024] + - [24, 22208.0] + - - [5056, 5888, 1, 3328, 5056, 5056, 3328, 5888] + - [15, 25211.0] + - - [1856, 1024, 1, 256, 1856, 1856, 256, 1024] + - [3, 18388.0] + - - [5056, 1408, 1, 3328, 5056, 5056, 3328, 1408] + - [21, 24527.0] + - - [4288, 1024, 1, 3328, 4288, 4288, 3328, 1024] + - [24, 23591.0] + - - [2944, 1408, 1, 3328, 2944, 2944, 3328, 1408] + - [7, 23614.0] + - - [2944, 4288, 1, 3328, 2944, 2944, 3328, 4288] + - [4, 24455.0] + - - [5056, 2944, 1, 256, 5056, 5056, 256, 2944] + - [27, 23406.0] + - - [2368, 1856, 1, 256, 2368, 2368, 256, 1856] + - [7, 19815.0] + - - [1408, 3584, 1, 3328, 1408, 1408, 3328, 3584] + - [4, 24163.0] + - - [2368, 6784, 1, 256, 2368, 2368, 256, 6784] + - [7, 23468.0] + - - [5056, 1408, 1, 1280, 5056, 5056, 1280, 1408] + - [27, 24215.0] + - - [1408, 5888, 1, 3328, 1408, 1408, 3328, 5888] + - [15, 24647.0] + - - [1856, 5056, 1, 256, 1856, 1856, 256, 5056] + - [11, 22027.0] + - - [6784, 6784, 1, 256, 6784, 6784, 256, 6784] + - [13, 24806.0] + - - [2368, 4288, 1, 1280, 2368, 2368, 1280, 4288] + - [2, 23351.0] + - - [3584, 1856, 1, 1280, 3584, 3584, 1280, 1856] + - [13, 23611.0] + - - [5888, 5056, 1, 256, 5888, 5888, 256, 5056] + - [7, 24260.0] + - - [3584, 448, 1, 256, 3584, 3584, 256, 448] + - [20, 16929.0] + - - [3584, 3584, 1, 1280, 3584, 3584, 1280, 3584] + - [4, 24757.0] + - - [256, 6784, 1, 256, 256, 256, 256, 6784] + - [14, 18540.0] + - - [1856, 3584, 1, 3328, 1856, 1856, 3328, 3584] + - [2, 23968.0] + - - [5056, 256, 1, 1280, 5056, 5056, 1280, 256] + - [8, 20388.0] + - - [3584, 3584, 1, 256, 3584, 3584, 256, 3584] + - [27, 23180.0] + - - [6784, 4288, 1, 1280, 6784, 6784, 1280, 4288] + - [27, 24688.0] + - - [704, 5056, 1, 256, 704, 704, 256, 5056] + - [19, 19232.0] + - - [2944, 2368, 1, 1280, 2944, 2944, 1280, 2368] + - [13, 23738.0] + - - [6784, 3584, 1, 256, 6784, 6784, 256, 3584] + - [7, 24381.0] + - - [704, 6784, 1, 256, 704, 704, 256, 6784] + - [14, 19822.0] + - - [1024, 3584, 1, 3328, 1024, 1024, 3328, 3584] + - [9, 23175.0] + - - [2944, 2944, 1, 3328, 2944, 2944, 3328, 2944] + - [21, 24545.0] + - - [5056, 6784, 1, 256, 5056, 5056, 256, 6784] + - [27, 24182.0] + - - [1408, 4288, 1, 3328, 1408, 1408, 3328, 4288] + - [2, 24020.0] + - - [6784, 256, 1, 1280, 6784, 6784, 1280, 256] + - [28, 20257.0] + - - [2368, 704, 1, 3328, 2368, 2368, 3328, 704] + - [15, 19865.0] + - - [3584, 6784, 1, 256, 3584, 3584, 256, 6784] + - [27, 24394.0] + - - [5056, 1856, 1, 256, 5056, 5056, 256, 1856] + - [16, 22239.0] + - - [704, 4288, 1, 256, 704, 704, 256, 4288] + - [23, 17981.0] + - - [1408, 6784, 1, 1280, 1408, 1408, 1280, 6784] + - [24, 24317.0] + - - [5056, 2368, 1, 3328, 5056, 5056, 3328, 2368] + - [13, 24170.0] + - - [2944, 4288, 1, 256, 2944, 2944, 256, 4288] + - [28, 22650.0] + - - [1408, 3584, 1, 1280, 1408, 1408, 1280, 3584] + - [4, 23859.0] + - - [1024, 1408, 1, 1280, 1024, 1024, 1280, 1408] + - [2, 19268.0] + - - [2368, 6784, 1, 3328, 2368, 2368, 3328, 6784] + - [21, 24289.0] + - - [5056, 704, 1, 1280, 5056, 5056, 1280, 704] + - [19, 21029.0] + - - [1856, 4288, 1, 3328, 1856, 1856, 3328, 4288] + - [15, 23586.0] + - - [1408, 5888, 1, 256, 1408, 1408, 256, 5888] + - [24, 22636.0] + - - [3584, 704, 1, 1280, 3584, 3584, 1280, 704] + - [21, 19650.0] + - - [3584, 448, 1, 3328, 3584, 3584, 3328, 448] + - [28, 19015.0] + - - [704, 2368, 1, 3328, 704, 704, 3328, 2368] + - [24, 19894.0] + - - [448, 5056, 1, 3328, 448, 448, 3328, 5056] + - [5, 20760.0] + - - [4288, 448, 1, 256, 4288, 4288, 256, 448] + - [20, 15349.0] + - - [448, 5888, 1, 256, 448, 448, 256, 5888] + - [10, 16217.0] + - - [5888, 2368, 1, 256, 5888, 5888, 256, 2368] + - [27, 23100.0] + - - [6784, 704, 1, 3328, 6784, 6784, 3328, 704] + - [4, 22546.0] + - - [1408, 2944, 1, 3328, 1408, 1408, 3328, 2944] + - [13, 23666.0] + - - [2368, 704, 1, 256, 2368, 2368, 256, 704] + - [14, 14985.0] + - - [3584, 2368, 1, 256, 3584, 3584, 256, 2368] + - [27, 21618.0] + - - [5888, 5056, 1, 1280, 5888, 5888, 1280, 5056] + - [24, 25066.0] + - - [3584, 3584, 1, 3328, 3584, 3584, 3328, 3584] + - [24, 24940.0] + - - [5888, 6784, 1, 256, 5888, 5888, 256, 6784] + - [27, 24835.0] + - - [4288, 2944, 1, 3328, 4288, 4288, 3328, 2944] + - [21, 24301.0] + - - [4288, 704, 1, 1280, 4288, 4288, 1280, 704] + - [2, 20031.0] + - - [256, 5056, 1, 1280, 256, 256, 1280, 5056] + - [19, 20155.0] + - - [6784, 5888, 1, 1280, 6784, 6784, 1280, 5888] + - [4, 25480.0] + - - [5888, 4288, 1, 1280, 5888, 5888, 1280, 4288] + - [13, 24610.0] + - - [3584, 1024, 1, 3328, 3584, 3584, 3328, 1024] + - [4, 23268.0] + - - [1408, 1856, 1, 1280, 1408, 1408, 1280, 1856] + - [27, 20354.0] + - - [5888, 448, 1, 3328, 5888, 5888, 3328, 448] + - [24, 19395.0] + - - [704, 5888, 1, 1280, 704, 704, 1280, 5888] + - [0, 21672.0] + - - [1024, 6784, 1, 3328, 1024, 1024, 3328, 6784] + - [4, 24145.0] + - - [704, 2944, 1, 1280, 704, 704, 1280, 2944] + - [13, 20273.0] + - - [5056, 2944, 1, 3328, 5056, 5056, 3328, 2944] + - [21, 24882.0] + - - [1408, 1408, 1, 3328, 1408, 1408, 3328, 1408] + - [2, 20403.0] + - - [448, 4288, 1, 1280, 448, 448, 1280, 4288] + - [13, 19139.0] + - - [3584, 704, 1, 256, 3584, 3584, 256, 704] + - [26, 17252.0] + - - [3584, 1408, 1, 3328, 3584, 3584, 3328, 1408] + - [21, 23596.0] + - - [2368, 1024, 1, 1280, 2368, 2368, 1280, 1024] + - [19, 21209.0] + - - [1856, 6784, 1, 256, 1856, 1856, 256, 6784] + - [16, 22909.0] + - - [4288, 448, 1, 3328, 4288, 4288, 3328, 448] + - [2, 19635.0] + - - [4288, 3584, 1, 1280, 4288, 4288, 1280, 3584] + - [4, 24757.0] + - - [5888, 1024, 1, 3328, 5888, 5888, 3328, 1024] + - [2, 24048.0] + - - [704, 6784, 1, 1280, 704, 704, 1280, 6784] + - [22, 21175.0] + - - [1024, 2944, 1, 3328, 1024, 1024, 3328, 2944] + - [24, 22282.0] + - - [704, 5056, 1, 1280, 704, 704, 1280, 5056] + - [24, 21420.0] + - - [1024, 5888, 1, 1280, 1024, 1024, 1280, 5888] + - [16, 23833.0] + - - [2944, 1856, 1, 256, 2944, 2944, 256, 1856] + - [27, 20953.0] + - - [3584, 5056, 1, 256, 3584, 3584, 256, 5056] + - [16, 23680.0] + - - [5888, 5056, 1, 3328, 5888, 5888, 3328, 5056] + - [15, 25181.0] + - - [3584, 6784, 1, 1280, 3584, 3584, 1280, 6784] + - [21, 24972.0] + - - [4288, 1856, 1, 256, 4288, 4288, 256, 1856] + - [7, 21809.0] + - - [1856, 5888, 1, 256, 1856, 1856, 256, 5888] + - [2, 22834.0] + - - [4288, 4288, 1, 3328, 4288, 4288, 3328, 4288] + - [15, 24628.0] + - - [4288, 1408, 1, 1280, 4288, 4288, 1280, 1408] + - [13, 23508.0] + - - [4288, 2368, 1, 256, 4288, 4288, 256, 2368] + - [2, 22179.0] + - - [2944, 5056, 1, 1280, 2944, 2944, 1280, 5056] + - [24, 24902.0] + - - [6784, 2368, 1, 3328, 6784, 6784, 3328, 2368] + - [21, 24288.0] + - - [4288, 1856, 1, 3328, 4288, 4288, 3328, 1856] + - [2, 23338.0] + - - [1856, 2944, 1, 1280, 1856, 1856, 1280, 2944] + - [24, 22795.0] + - - [3584, 1024, 1, 1280, 3584, 3584, 1280, 1024] + - [15, 22893.0] + - - [1024, 4288, 1, 256, 1024, 1024, 256, 4288] + - [27, 19387.0] + - - [5888, 3584, 1, 3328, 5888, 5888, 3328, 3584] + - [21, 25082.0] + - - [5056, 3584, 1, 3328, 5056, 5056, 3328, 3584] + - [24, 25144.0] + - - [2368, 1408, 1, 1280, 2368, 2368, 1280, 1408] + - [2, 21745.0] + - - [5056, 2944, 1, 1280, 5056, 5056, 1280, 2944] + - [13, 24728.0] + - - [1024, 6784, 1, 256, 1024, 1024, 256, 6784] + - [7, 22858.0] + - - [3584, 2944, 1, 256, 3584, 3584, 256, 2944] + - [16, 23525.0] + - - [3584, 1408, 1, 1280, 3584, 3584, 1280, 1408] + - [13, 23555.0] + - - [5056, 6784, 1, 3328, 5056, 5056, 3328, 6784] + - [22, 25086.0] + - - [3584, 4288, 1, 256, 3584, 3584, 256, 4288] + - [27, 22818.0] + - - [1856, 6784, 1, 3328, 1856, 1856, 3328, 6784] + - [21, 24203.0] + - - [5056, 1408, 1, 256, 5056, 5056, 256, 1408] + - [16, 21415.0] + - - [5888, 5888, 1, 256, 5888, 5888, 256, 5888] + - [24, 24848.0] + - - [4288, 1024, 1, 1280, 4288, 4288, 1280, 1024] + - [4, 23286.0] + - - [448, 6784, 1, 3328, 448, 448, 3328, 6784] + - [2, 20506.0] + - - [2944, 1408, 1, 1280, 2944, 2944, 1280, 1408] + - [13, 23113.0] + - - [2944, 1856, 1, 3328, 2944, 2944, 3328, 1856] + - [2, 22970.0] + - - [3584, 5888, 1, 1280, 3584, 3584, 1280, 5888] + - [21, 25027.0] + - - [6784, 1856, 1, 1280, 6784, 6784, 1280, 1856] + - [13, 24030.0] + - - [5888, 256, 1, 3328, 5888, 5888, 3328, 256] + - [7, 21254.0] + - - [1856, 5888, 1, 3328, 1856, 1856, 3328, 5888] + - [13, 23987.0] + - - [3584, 1408, 1, 256, 3584, 3584, 256, 1408] + - [7, 22235.0] + - - [704, 3584, 1, 3328, 704, 704, 3328, 3584] + - [2, 20528.0] + - - [5056, 448, 1, 1280, 5056, 5056, 1280, 448] + - [16, 19298.0] + - - [4288, 704, 1, 256, 4288, 4288, 256, 704] + - [12, 17113.0] + - - [2944, 1024, 1, 256, 2944, 2944, 256, 1024] + - [11, 19294.0] + - - [2368, 4288, 1, 3328, 2368, 2368, 3328, 4288] + - [13, 23636.0] + - - [6784, 5056, 1, 256, 6784, 6784, 256, 5056] + - [7, 24274.0] + - - [3584, 5056, 1, 3328, 3584, 3584, 3328, 5056] + - [24, 25113.0] + - - [4288, 5888, 1, 256, 4288, 4288, 256, 5888] + - [16, 23812.0] + - - [2944, 6784, 1, 256, 2944, 2944, 256, 6784] + - [2, 24492.0] + - - [2368, 2368, 1, 1280, 2368, 2368, 1280, 2368] + - [2, 22139.0] + - - [1856, 3584, 1, 1280, 1856, 1856, 1280, 3584] + - [21, 23695.0] + - - [5056, 3584, 1, 1280, 5056, 5056, 1280, 3584] + - [24, 25024.0] + - - [256, 5888, 1, 256, 256, 256, 256, 5888] + - [14, 18588.0] + - - [1856, 1408, 1, 3328, 1856, 1856, 3328, 1408] + - [2, 21108.0] + - - [1024, 4288, 1, 3328, 1024, 1024, 3328, 4288] + - [4, 23695.0] + - - [2944, 2368, 1, 3328, 2944, 2944, 3328, 2368] + - [27, 23856.0] + - - [1024, 1856, 1, 1280, 1024, 1024, 1280, 1856] + - [7, 21062.0] + - - [6784, 1856, 1, 256, 6784, 6784, 256, 1856] + - [13, 22215.0] + - - [1024, 5888, 1, 256, 1024, 1024, 256, 5888] + - [2, 21709.0] + - - [1408, 2368, 1, 256, 1408, 1408, 256, 2368] + - [27, 18579.0] + - - [2944, 704, 1, 3328, 2944, 2944, 3328, 704] + - [13, 21100.0] + - - [2944, 2944, 1, 1280, 2944, 2944, 1280, 2944] + - [21, 24368.0] + - - [6784, 256, 1, 3328, 6784, 6784, 3328, 256] + - [4, 21266.0] + - - [1408, 5056, 1, 256, 1408, 1408, 256, 5056] + - [16, 22372.0] + - - [5056, 256, 1, 256, 5056, 5056, 256, 256] + - [8, 16069.0] + - - [1408, 4288, 1, 256, 1408, 1408, 256, 4288] + - [16, 21000.0] + - - [5888, 2368, 1, 1280, 5888, 5888, 1280, 2368] + - [13, 24234.0] + - - [2368, 5888, 1, 1280, 2368, 2368, 1280, 5888] + - [15, 24442.0] + - - [5888, 256, 1, 1280, 5888, 5888, 1280, 256] + - [7, 20693.0] + - - [2368, 1856, 1, 3328, 2368, 2368, 3328, 1856] + - [2, 21909.0] + - - [2944, 704, 1, 256, 2944, 2944, 256, 704] + - [12, 18183.0] + - - [2368, 6784, 1, 1280, 2368, 2368, 1280, 6784] + - [2, 24206.0] + - - [1856, 4288, 1, 1280, 1856, 1856, 1280, 4288] + - [24, 23261.0] + - - [704, 3584, 1, 256, 704, 704, 256, 3584] + - [3, 18164.0] + - - [704, 2944, 1, 3328, 704, 704, 3328, 2944] + - [2, 21420.0] + - - [1856, 5056, 1, 3328, 1856, 1856, 3328, 5056] + - [4, 24118.0] + - - [3584, 5056, 1, 1280, 3584, 3584, 1280, 5056] + - [24, 24907.0] + - - [2944, 1024, 1, 3328, 2944, 2944, 3328, 1024] + - [4, 22728.0] + - - [1408, 6784, 1, 256, 1408, 1408, 256, 6784] + - [27, 23333.0] + - - [6784, 1408, 1, 3328, 6784, 6784, 3328, 1408] + - [2, 24419.0] + - - [1024, 2368, 1, 1280, 1024, 1024, 1280, 2368] + - [4, 21023.0] + - - [6784, 2944, 1, 1280, 6784, 6784, 1280, 2944] + - [7, 25195.0] + - - [3584, 448, 1, 1280, 3584, 3584, 1280, 448] + - [20, 18728.0] + - - [2944, 6784, 1, 3328, 2944, 2944, 3328, 6784] + - [21, 25269.0] + - - [448, 5056, 1, 1280, 448, 448, 1280, 5056] + - [0, 19995.0] + - - [5888, 704, 1, 256, 5888, 5888, 256, 704] + - [17, 18748.0] + - - [256, 5888, 1, 3328, 256, 256, 3328, 5888] + - [7, 21287.0] + - - [6784, 4288, 1, 256, 6784, 6784, 256, 4288] + - [27, 23933.0] + - - [5888, 256, 1, 256, 5888, 5888, 256, 256] + - [14, 18463.0] + - - [6784, 1024, 1, 1280, 6784, 6784, 1280, 1024] + - [24, 24103.0] + - - [2944, 704, 1, 1280, 2944, 2944, 1280, 704] + - [13, 20748.0] + - - [6784, 3584, 1, 1280, 6784, 6784, 1280, 3584] + - [21, 25047.0] + - - [1408, 2944, 1, 1280, 1408, 1408, 1280, 2944] + - [13, 23552.0] + - - [1408, 2368, 1, 3328, 1408, 1408, 3328, 2368] + - [2, 22448.0] + - - [1024, 3584, 1, 256, 1024, 1024, 256, 3584] + - [7, 19830.0] + - - [2368, 2944, 1, 256, 2368, 2368, 256, 2944] + - [7, 21261.0] + - - [2944, 5888, 1, 256, 2944, 2944, 256, 5888] + - [7, 24420.0] + - - [3584, 1856, 1, 256, 3584, 3584, 256, 1856] + - [16, 21732.0] + - - [704, 4288, 1, 3328, 704, 704, 3328, 4288] + - [2, 20500.0] + - - [4288, 2944, 1, 1280, 4288, 4288, 1280, 2944] + - [13, 24097.0] + - - [4288, 5056, 1, 3328, 4288, 4288, 3328, 5056] + - [15, 24800.0] + - - [256, 5056, 1, 3328, 256, 256, 3328, 5056] + - [0, 21516.0] + - - [5056, 2368, 1, 256, 5056, 5056, 256, 2368] + - [27, 22731.0] + - - [4288, 704, 1, 3328, 4288, 4288, 3328, 704] + - [2, 20357.0] + - - [448, 3584, 1, 256, 448, 448, 256, 3584] + - [17, 14893.0] + - - [2944, 5888, 1, 1280, 2944, 2944, 1280, 5888] + - [16, 25090.0] + - - [5888, 3584, 1, 256, 5888, 5888, 256, 3584] + - [13, 24339.0] + - - [1408, 1856, 1, 3328, 1408, 1408, 3328, 1856] + - [2, 21277.0] + - - [6784, 1408, 1, 1280, 6784, 6784, 1280, 1408] + - [21, 24240.0] + - - [704, 2944, 1, 256, 704, 704, 256, 2944] + - [17, 18436.0] + - - [2944, 5888, 1, 3328, 2944, 2944, 3328, 5888] + - [21, 25212.0] + - - [1408, 6784, 1, 3328, 1408, 1408, 3328, 6784] + - [24, 24561.0] + - - [448, 4288, 1, 3328, 448, 448, 3328, 4288] + - [2, 19463.0] + - - [704, 2368, 1, 256, 704, 704, 256, 2368] + - [23, 15429.0] + - - [5888, 2368, 1, 3328, 5888, 5888, 3328, 2368] + - [13, 24378.0] + - - [4288, 5056, 1, 256, 4288, 4288, 256, 5056] + - [16, 23527.0] + - - [4288, 448, 1, 1280, 4288, 4288, 1280, 448] + - [20, 19213.0] + - - [5888, 704, 1, 3328, 5888, 5888, 3328, 704] + - [15, 22162.0] + - - [4288, 3584, 1, 3328, 4288, 4288, 3328, 3584] + - [4, 24867.0] + - - [6784, 6784, 1, 3328, 6784, 6784, 3328, 6784] + - [13, 25314.0] + - - [704, 5056, 1, 3328, 704, 704, 3328, 5056] + - [4, 22241.0] + - - [2368, 2944, 1, 3328, 2368, 2368, 3328, 2944] + - [2, 24036.0] + - - [2368, 3584, 1, 256, 2368, 2368, 256, 3584] + - [7, 21831.0] + - - [3584, 2368, 1, 1280, 3584, 3584, 1280, 2368] + - [21, 23742.0] + - - [1856, 1856, 1, 256, 1856, 1856, 256, 1856] + - [11, 19288.0] + - - [4288, 1408, 1, 3328, 4288, 4288, 3328, 1408] + - [7, 24014.0] + - - [4288, 5056, 1, 1280, 4288, 4288, 1280, 5056] + - [15, 24657.0] + - - [5888, 6784, 1, 1280, 5888, 5888, 1280, 6784] + - [21, 25317.0] + - - [5888, 1408, 1, 3328, 5888, 5888, 3328, 1408] + - [2, 24328.0] + - - [256, 5056, 1, 256, 256, 256, 256, 5056] + - [12, 16975.0] + - - [1408, 1024, 1, 256, 1408, 1408, 256, 1024] + - [20, 18005.0] + - - [2368, 5056, 1, 256, 2368, 2368, 256, 5056] + - [21, 22761.0] + - - [1024, 5056, 1, 256, 1024, 1024, 256, 5056] + - [21, 21594.0] + - - [2368, 1408, 1, 3328, 2368, 2368, 3328, 1408] + - [2, 22528.0] + - - [5888, 448, 1, 256, 5888, 5888, 256, 448] + - [26, 18163.0] + - - [6784, 5056, 1, 1280, 6784, 6784, 1280, 5056] + - [24, 25097.0] + - - [4288, 6784, 1, 1280, 4288, 4288, 1280, 6784] + - [13, 24673.0] + - - [6784, 1408, 1, 256, 6784, 6784, 256, 1408] + - [7, 23328.0] + - - [5888, 4288, 1, 256, 5888, 5888, 256, 4288] + - [27, 23842.0] + - - [5056, 5888, 1, 256, 5056, 5056, 256, 5888] + - [4, 24345.0] + - - [2368, 1024, 1, 256, 2368, 2368, 256, 1024] + - [3, 18731.0] + - - [1856, 6784, 1, 1280, 1856, 1856, 1280, 6784] + - [13, 24042.0] + - - [4288, 3584, 1, 256, 4288, 4288, 256, 3584] + - [28, 23335.0] + - - [5056, 1856, 1, 1280, 5056, 5056, 1280, 1856] + - [13, 23632.0] + - - [1408, 1024, 1, 3328, 1408, 1408, 3328, 1024] + - [2, 20117.0] + - - [5888, 3584, 1, 1280, 5888, 5888, 1280, 3584] + - [2, 25016.0] + - - [1024, 2944, 1, 256, 1024, 1024, 256, 2944] + - [7, 20299.0] + - - [448, 6784, 1, 1280, 448, 448, 1280, 6784] + - [13, 19406.0] + - - [2944, 1856, 1, 1280, 2944, 2944, 1280, 1856] + - [21, 22288.0] + - - [2368, 3584, 1, 3328, 2368, 2368, 3328, 3584] + - [13, 24023.0] + - - [3584, 5888, 1, 3328, 3584, 3584, 3328, 5888] + - [13, 25091.0] + - - [2944, 3584, 1, 1280, 2944, 2944, 1280, 3584] + - [13, 24308.0] + - - [1856, 5888, 1, 1280, 1856, 1856, 1280, 5888] + - [21, 23804.0] + - - [5056, 448, 1, 3328, 5056, 5056, 3328, 448] + - [4, 20695.0] + - - [4288, 1408, 1, 256, 4288, 4288, 256, 1408] + - [7, 21080.0] + - - [4288, 2368, 1, 1280, 4288, 4288, 1280, 2368] + - [2, 23428.0] + - - [2944, 5056, 1, 256, 2944, 2944, 256, 5056] + - [16, 23647.0] + - - [6784, 2368, 1, 256, 6784, 6784, 256, 2368] + - [21, 23180.0] + - - [1856, 2944, 1, 256, 1856, 1856, 256, 2944] + - [16, 21085.0] + - - [1856, 1408, 1, 1280, 1856, 1856, 1280, 1408] + - [7, 20885.0] + - - [1024, 4288, 1, 1280, 1024, 1024, 1280, 4288] + - [9, 22626.0] + - - [2368, 5056, 1, 3328, 2368, 2368, 3328, 5056] + - [24, 24365.0] + - - [1024, 1856, 1, 3328, 1024, 1024, 3328, 1856] + - [2, 22374.0] + - - [704, 3584, 1, 1280, 704, 704, 1280, 3584] + - [21, 19702.0] + - - [4288, 6784, 1, 256, 4288, 4288, 256, 6784] + - [2, 23907.0] + - - [3584, 2944, 1, 3328, 3584, 3584, 3328, 2944] + - [2, 24572.0] + - - [5888, 2944, 1, 256, 5888, 5888, 256, 2944] + - [27, 24393.0] + - - [5056, 4288, 1, 256, 5056, 5056, 256, 4288] + - [27, 23575.0] + - - [6784, 1024, 1, 3328, 6784, 6784, 3328, 1024] + - [9, 24304.0] + - - [5888, 5888, 1, 1280, 5888, 5888, 1280, 5888] + - [15, 25438.0] + - - [448, 5888, 1, 1280, 448, 448, 1280, 5888] + - [5, 19123.0] + - - [2944, 1408, 1, 256, 2944, 2944, 256, 1408] + - [7, 20767.0] + - - [1024, 2944, 1, 1280, 1024, 1024, 1280, 2944] + - [2, 21601.0] + - - [2368, 5888, 1, 3328, 2368, 2368, 3328, 5888] + - [24, 24595.0] + - - [2368, 1856, 1, 1280, 2368, 2368, 1280, 1856] + - [2, 21690.0] + - - [5888, 4288, 1, 3328, 5888, 5888, 3328, 4288] + - [21, 24728.0] + - - [6784, 704, 1, 1280, 6784, 6784, 1280, 704] + - [15, 21924.0] + - - [5056, 448, 1, 256, 5056, 5056, 256, 448] + - [7, 15324.0] + - - [1856, 5056, 1, 1280, 1856, 1856, 1280, 5056] + - [15, 23850.0] + - - [2944, 1024, 1, 1280, 2944, 2944, 1280, 1024] + - [4, 22380.0] + - - [2368, 4288, 1, 256, 2368, 2368, 256, 4288] + - [7, 22244.0] + - - [1024, 2368, 1, 3328, 1024, 1024, 3328, 2368] + - [4, 22425.0] + - - [4288, 5888, 1, 3328, 4288, 4288, 3328, 5888] + - [13, 24732.0] + - - [2944, 6784, 1, 1280, 2944, 2944, 1280, 6784] + - [21, 25140.0] + - - [256, 6784, 1, 1280, 256, 256, 1280, 6784] + - [2, 20409.0] + - - [1856, 3584, 1, 256, 1856, 1856, 256, 3584] + - [5, 20706.0] + - - [256, 5888, 1, 1280, 256, 256, 1280, 5888] + - [7, 20595.0] + - - [2944, 2368, 1, 256, 2944, 2944, 256, 2368] + - [2, 21903.0] + - - [1024, 1856, 1, 256, 1024, 1024, 256, 1856] + - [1, 16471.0] + - - [6784, 3584, 1, 3328, 6784, 6784, 3328, 3584] + - [13, 25094.0] + - - [1024, 5888, 1, 3328, 1024, 1024, 3328, 5888] + - [2, 24024.0] + - - [1408, 2368, 1, 1280, 1408, 1408, 1280, 2368] + - [21, 21349.0] + - - [2944, 2944, 1, 256, 2944, 2944, 256, 2944] + - [2, 22673.0] + - - [6784, 256, 1, 256, 6784, 6784, 256, 256] + - [0, 17727.0] + - - [5888, 1408, 1, 256, 5888, 5888, 256, 1408] + - [27, 22383.0] + - - [5888, 6784, 1, 3328, 5888, 5888, 3328, 6784] + - [22, 25420.0] + - - [704, 4288, 1, 1280, 704, 704, 1280, 4288] + - [2, 19704.0] + - - [6784, 448, 1, 3328, 6784, 6784, 3328, 448] + - [21, 20354.0] + - - [1024, 1024, 1, 3328, 1024, 1024, 3328, 1024] + - [30, 18149.0] + - - [64, 6784, 1, 256, 64, 64, 256, 6784] + - [36, 10506.0] + - - [128, 6784, 1, 3328, 128, 128, 3328, 6784] + - [31, 19105.0] + - - [256, 4288, 1, 3328, 256, 256, 3328, 4288] + - [30, 18733.0] + - - [704, 1856, 1, 3328, 704, 704, 3328, 1856] + - [44, 17953.0] + - - [448, 1024, 1, 1280, 448, 448, 1280, 1024] + - [29, 14600.0] + - - [2368, 128, 1, 256, 2368, 2368, 256, 128] + - [47, 7592.0] + - - [256, 1856, 1, 1280, 256, 256, 1280, 1856] + - [45, 16209.0] + - - [448, 704, 1, 1280, 448, 448, 1280, 704] + - [36, 13039.0] + - - [128, 3584, 1, 1280, 128, 128, 1280, 3584] + - [33, 15292.0] + - - [4288, 256, 1, 256, 4288, 4288, 256, 256] + - [36, 13748.0] + - - [5888, 64, 1, 3328, 5888, 5888, 3328, 64] + - [41, 13424.0] + - - [2944, 256, 1, 3328, 2944, 2944, 3328, 256] + - [45, 17353.0] + - - [256, 4288, 1, 1280, 256, 256, 1280, 4288] + - [49, 18319.0] + - - [1408, 448, 1, 1280, 1408, 1408, 1280, 448] + - [37, 16357.0] + - - [6784, 128, 1, 1280, 6784, 6784, 1280, 128] + - [46, 17699.0] + - - [2368, 128, 1, 3328, 2368, 2368, 3328, 128] + - [42, 15562.0] + - - [2944, 128, 1, 256, 2944, 2944, 256, 128] + - [49, 9067.0] + - - [448, 1408, 1, 256, 448, 448, 256, 1408] + - [29, 12836.0] + - - [64, 5056, 1, 3328, 64, 64, 3328, 5056] + - [40, 15095.0] + - - [2368, 256, 1, 1280, 2368, 2368, 1280, 256] + - [31, 16920.0] + - - [256, 3584, 1, 3328, 256, 256, 3328, 3584] + - [46, 19805.0] + - - [5056, 64, 1, 1280, 5056, 5056, 1280, 64] + - [34, 13241.0] + - - [1024, 704, 1, 256, 1024, 1024, 256, 704] + - [45, 11171.0] + - - [4288, 128, 1, 1280, 4288, 4288, 1280, 128] + - [35, 15413.0] + - - [5888, 64, 1, 256, 5888, 5888, 256, 64] + - [34, 8932.0] + - - [1856, 256, 1, 1280, 1856, 1856, 1280, 256] + - [33, 15562.0] + - - [64, 5888, 1, 3328, 64, 64, 3328, 5888] + - [44, 14495.0] + - - [256, 1408, 1, 3328, 256, 256, 3328, 1408] + - [40, 13252.0] + - - [6784, 128, 1, 3328, 6784, 6784, 3328, 128] + - [42, 18746.0] + - - [704, 704, 1, 3328, 704, 704, 3328, 704] + - [31, 15304.0] + - - [3584, 256, 1, 3328, 3584, 3584, 3328, 256] + - [38, 19766.0] + - - [128, 3584, 1, 3328, 128, 128, 3328, 3584] + - [30, 16796.0] + - - [128, 2944, 1, 1280, 128, 128, 1280, 2944] + - [33, 12980.0] + - - [448, 1856, 1, 1280, 448, 448, 1280, 1856] + - [44, 17804.0] + - - [3584, 128, 1, 256, 3584, 3584, 256, 128] + - [36, 10618.0] + - - [448, 1408, 1, 3328, 448, 448, 3328, 1408] + - [36, 16965.0] + - - [256, 3584, 1, 256, 256, 256, 256, 3584] + - [46, 15173.0] + - - [256, 2944, 1, 3328, 256, 256, 3328, 2944] + - [45, 17428.0] + - - [1408, 704, 1, 256, 1408, 1408, 256, 704] + - [48, 13440.0] + - - [448, 2944, 1, 3328, 448, 448, 3328, 2944] + - [29, 18566.0] + - - [64, 5888, 1, 256, 64, 64, 256, 5888] + - [36, 7455.0] + - - [448, 2368, 1, 1280, 448, 448, 1280, 2368] + - [29, 16625.0] + - - [128, 4288, 1, 3328, 128, 128, 3328, 4288] + - [38, 16459.0] + - - [256, 2368, 1, 256, 256, 256, 256, 2368] + - [37, 10572.0] + - - [1024, 448, 1, 3328, 1024, 1024, 3328, 448] + - [37, 16187.0] + - - [1856, 704, 1, 1280, 1856, 1856, 1280, 704] + - [44, 17758.0] + - - [1024, 1024, 1, 1280, 1024, 1024, 1280, 1024] + - [37, 17702.0] + - - [256, 2944, 1, 256, 256, 256, 256, 2944] + - [48, 13361.0] + - - [128, 6784, 1, 1280, 128, 128, 1280, 6784] + - [31, 18257.0] + - - [1408, 704, 1, 3328, 1408, 1408, 3328, 704] + - [45, 17001.0] + - - [128, 5888, 1, 1280, 128, 128, 1280, 5888] + - [49, 16948.0] + - - [704, 1408, 1, 3328, 704, 704, 3328, 1408] + - [44, 18298.0] + - - [6784, 128, 1, 256, 6784, 6784, 256, 128] + - [34, 13773.0] + - - [704, 448, 1, 256, 704, 704, 256, 448] + - [40, 8464.0] + - - [256, 1856, 1, 3328, 256, 256, 3328, 1856] + - [34, 17147.0] + - - [128, 4288, 1, 256, 128, 128, 256, 4288] + - [49, 9624.0] + - - [64, 6784, 1, 3328, 64, 64, 3328, 6784] + - [44, 15457.0] + - - [2944, 256, 1, 1280, 2944, 2944, 1280, 256] + - [45, 16208.0] + - - [1856, 704, 1, 256, 1856, 1856, 256, 704] + - [36, 14775.0] + - - [1408, 448, 1, 3328, 1408, 1408, 3328, 448] + - [45, 17596.0] + - - [2368, 256, 1, 256, 2368, 2368, 256, 256] + - [33, 10305.0] + - - [704, 1856, 1, 256, 704, 704, 256, 1856] + - [29, 14801.0] + - - [5888, 64, 1, 1280, 5888, 5888, 1280, 64] + - [34, 12029.0] + - - [256, 2368, 1, 1280, 256, 256, 1280, 2368] + - [48, 16377.0] + - - [2944, 448, 1, 256, 2944, 2944, 256, 448] + - [36, 14888.0] + - - [2368, 128, 1, 1280, 2368, 2368, 1280, 128] + - [44, 12872.0] + - - [64, 5056, 1, 1280, 64, 64, 1280, 5056] + - [44, 13301.0] + - - [704, 448, 1, 3328, 704, 704, 3328, 448] + - [36, 14901.0] + - - [5056, 64, 1, 3328, 5056, 5056, 3328, 64] + - [49, 14656.0] + - - [2368, 448, 1, 1280, 2368, 2368, 1280, 448] + - [30, 16877.0] + - - [1408, 256, 1, 1280, 1408, 1408, 1280, 256] + - [29, 11752.0] + - - [1856, 448, 1, 3328, 1856, 1856, 3328, 448] + - [36, 17834.0] + - - [128, 5056, 1, 256, 128, 128, 256, 5056] + - [36, 13107.0] + - - [4288, 256, 1, 1280, 4288, 4288, 1280, 256] + - [35, 17385.0] + - - [704, 704, 1, 256, 704, 704, 256, 704] + - [36, 9037.0] + - - [4288, 128, 1, 3328, 4288, 4288, 3328, 128] + - [42, 16600.0] + - - [256, 1408, 1, 1280, 256, 256, 1280, 1408] + - [37, 11800.0] + - - [6784, 64, 1, 3328, 6784, 6784, 3328, 64] + - [49, 15346.0] + - - [128, 2944, 1, 3328, 128, 128, 3328, 2944] + - [37, 13561.0] + - - [2944, 448, 1, 3328, 2944, 2944, 3328, 448] + - [42, 17695.0] + - - [2368, 448, 1, 3328, 2368, 2368, 3328, 448] + - [45, 17828.0] + - - [5056, 64, 1, 256, 5056, 5056, 256, 64] + - [40, 6858.0] + - - [128, 5056, 1, 3328, 128, 128, 3328, 5056] + - [46, 19254.0] + - - [6784, 64, 1, 256, 6784, 6784, 256, 64] + - [49, 8420.0] + - - [128, 2368, 1, 256, 128, 128, 256, 2368] + - [33, 6521.0] + - - [3584, 256, 1, 256, 3584, 3584, 256, 256] + - [50, 13530.0] + - - [128, 2944, 1, 256, 128, 128, 256, 2944] + - [39, 8691.0] + - - [3584, 128, 1, 3328, 3584, 3584, 3328, 128] + - [37, 16173.0] + - - [1024, 448, 1, 1280, 1024, 1024, 1280, 448] + - [37, 15709.0] + - - [5888, 128, 1, 3328, 5888, 5888, 3328, 128] + - [37, 17547.0] + - - [1408, 704, 1, 1280, 1408, 1408, 1280, 704] + - [36, 18043.0] + - - [448, 1408, 1, 1280, 448, 448, 1280, 1408] + - [40, 16272.0] + - - [704, 1408, 1, 1280, 704, 704, 1280, 1408] + - [44, 17428.0] + - - [448, 2944, 1, 256, 448, 448, 256, 2944] + - [29, 15073.0] + - - [448, 2368, 1, 256, 448, 448, 256, 2368] + - [29, 13261.0] + - - [64, 5056, 1, 256, 64, 64, 256, 5056] + - [33, 6801.0] + - - [5056, 128, 1, 3328, 5056, 5056, 3328, 128] + - [35, 19344.0] + - - [448, 704, 1, 256, 448, 448, 256, 704] + - [48, 6842.0] + - - [1856, 256, 1, 3328, 1856, 1856, 3328, 256] + - [44, 16471.0] + - - [2944, 128, 1, 3328, 2944, 2944, 3328, 128] + - [44, 13502.0] + - - [64, 6784, 1, 1280, 64, 64, 1280, 6784] + - [44, 14915.0] + - - [704, 1024, 1, 1280, 704, 704, 1280, 1024] + - [44, 15849.0] + - - [256, 4288, 1, 256, 256, 256, 256, 4288] + - [37, 16022.0] + - - [256, 2368, 1, 3328, 256, 256, 3328, 2368] + - [38, 18485.0] + - - [128, 3584, 1, 256, 128, 128, 256, 3584] + - [32, 10230.0] + - - [704, 448, 1, 1280, 704, 704, 1280, 448] + - [33, 13816.0] + - - [1024, 704, 1, 1280, 1024, 1024, 1280, 704] + - [49, 16200.0] + - - [256, 1856, 1, 256, 256, 256, 256, 1856] + - [44, 9608.0] + - - [704, 1856, 1, 1280, 704, 704, 1280, 1856] + - [36, 17796.0] + - - [1408, 256, 1, 3328, 1408, 1408, 3328, 256] + - [36, 13839.0] + - - [5888, 128, 1, 256, 5888, 5888, 256, 128] + - [33, 13089.0] + - - [2368, 448, 1, 256, 2368, 2368, 256, 448] + - [50, 13326.0] + - - [4288, 256, 1, 3328, 4288, 4288, 3328, 256] + - [45, 18157.0] + - - [2944, 256, 1, 256, 2944, 2944, 256, 256] + - [37, 13841.0] + - - [1408, 448, 1, 256, 1408, 1408, 256, 448] + - [37, 13065.0] + - - [6784, 64, 1, 1280, 6784, 6784, 1280, 64] + - [34, 14587.0] + - - [448, 1024, 1, 3328, 448, 448, 3328, 1024] + - [29, 16339.0] + - - [2944, 448, 1, 1280, 2944, 2944, 1280, 448] + - [44, 17933.0] + - - [5056, 128, 1, 256, 5056, 5056, 256, 128] + - [36, 13066.0] + - - [448, 1024, 1, 256, 448, 448, 256, 1024] + - [43, 10915.0] + - - [128, 5056, 1, 1280, 128, 128, 1280, 5056] + - [35, 18047.0] + - - [1408, 256, 1, 256, 1408, 1408, 256, 256] + - [39, 7639.0] + - - [128, 5888, 1, 3328, 128, 128, 3328, 5888] + - [37, 17454.0] + - - [3584, 128, 1, 1280, 3584, 3584, 1280, 128] + - [44, 14702.0] + - - [4288, 128, 1, 256, 4288, 4288, 256, 128] + - [40, 9507.0] + - - [2368, 256, 1, 3328, 2368, 2368, 3328, 256] + - [46, 18274.0] + - - [5888, 128, 1, 1280, 5888, 5888, 1280, 128] + - [49, 16186.0] + - - [256, 3584, 1, 1280, 256, 256, 1280, 3584] + - [31, 19297.0] + - - [128, 5888, 1, 256, 128, 128, 256, 5888] + - [41, 13549.0] + - - [1024, 1024, 1, 256, 1024, 1024, 256, 1024] + - [30, 15322.0] + - - [1024, 1024, 1, 1024, 1024, 1024, 1024, 1024] + - [34, 17330.0] + - - [64, 5888, 1, 1280, 64, 64, 1280, 5888] + - [29, 13022.0] + - - [704, 1024, 1, 256, 704, 704, 256, 1024] + - [29, 12996.0] + - - [704, 704, 1, 1280, 704, 704, 1280, 704] + - [33, 14564.0] + - - [128, 2368, 1, 1280, 128, 128, 1280, 2368] + - [37, 13729.0] + - - [3584, 256, 1, 1280, 3584, 3584, 1280, 256] + - [50, 19573.0] + - - [5056, 128, 1, 1280, 5056, 5056, 1280, 128] + - [42, 18214.0] + - - [448, 1856, 1, 3328, 448, 448, 3328, 1856] + - [38, 17666.0] + - - [1024, 448, 1, 256, 1024, 1024, 256, 448] + - [41, 10874.0] + - - [2944, 128, 1, 1280, 2944, 2944, 1280, 128] + - [29, 12640.0] + - - [256, 2944, 1, 1280, 256, 256, 1280, 2944] + - [49, 16296.0] + - - [704, 1024, 1, 3328, 704, 704, 3328, 1024] + - [37, 16798.0] + - - [1856, 448, 1, 1280, 1856, 1856, 1280, 448] + - [46, 16299.0] + - - [128, 6784, 1, 256, 128, 128, 256, 6784] + - [48, 13555.0] + - - [704, 1408, 1, 256, 704, 704, 256, 1408] + - [30, 14129.0] + - - [256, 1408, 1, 256, 256, 256, 256, 1408] + - [32, 9358.0] + - - [448, 2944, 1, 1280, 448, 448, 1280, 2944] + - [44, 17937.0] + - - [1856, 256, 1, 256, 1856, 1856, 256, 256] + - [43, 10119.0] + - - [128, 2368, 1, 3328, 128, 128, 3328, 2368] + - [35, 15132.0] + - - [448, 2368, 1, 3328, 448, 448, 3328, 2368] + - [38, 17409.0] + - - [1856, 448, 1, 256, 1856, 1856, 256, 448] + - [35, 12685.0] + - - [1024, 704, 1, 3328, 1024, 1024, 3328, 704] + - [37, 16663.0] + - - [128, 4288, 1, 1280, 128, 128, 1280, 4288] + - [42, 15326.0] + - - [448, 704, 1, 3328, 448, 448, 3328, 704] + - [29, 15468.0] + - - [448, 1856, 1, 256, 448, 448, 256, 1856] + - [29, 14620.0] + - - [1856, 704, 1, 3328, 1856, 1856, 3328, 704] + - [44, 18193.0] + - - [2368, 64, 1, 3328, 2368, 2368, 3328, 64] + - [61, 8241.0] + - - [1408, 64, 1, 1280, 1408, 1408, 1280, 64] + - [84, 4685.0] + - - [2944, 64, 1, 256, 2944, 2944, 256, 64] + - [74, 4862.0] + - - [1024, 256, 1, 3328, 1024, 1024, 3328, 256] + - [79, 8487.0] + - - [1856, 64, 1, 1280, 1856, 1856, 1280, 64] + - [61, 6091.0] + - - [704, 128, 1, 1280, 704, 704, 1280, 128] + - [72, 4955.0] + - - [4288, 64, 1, 3328, 4288, 4288, 3328, 64] + - [85, 8654.0] + - - [1856, 128, 1, 256, 1856, 1856, 256, 128] + - [61, 6270.0] + - - [2944, 64, 1, 1280, 2944, 2944, 1280, 64] + - [85, 7269.0] + - - [64, 3584, 1, 3328, 64, 64, 3328, 3584] + - [65, 9298.0] + - - [1024, 256, 1, 256, 1024, 1024, 256, 256] + - [77, 6404.0] + - - [448, 448, 1, 256, 448, 448, 256, 448] + - [65, 5286.0] + - - [128, 1024, 1, 3328, 128, 128, 3328, 1024] + - [58, 7137.0] + - - [64, 1856, 1, 1280, 64, 64, 1280, 1856] + - [64, 5623.0] + - - [1024, 128, 1, 1280, 1024, 1024, 1280, 128] + - [61, 6738.0] + - - [448, 256, 1, 3328, 448, 448, 3328, 256] + - [79, 6274.0] + - - [128, 704, 1, 1280, 128, 128, 1280, 704] + - [55, 4570.0] + - - [1856, 128, 1, 3328, 1856, 1856, 3328, 128] + - [85, 9473.0] + - - [256, 448, 1, 256, 256, 256, 256, 448] + - [61, 3572.0] + - - [448, 448, 1, 3328, 448, 448, 3328, 448] + - [65, 8524.0] + - - [1408, 128, 1, 1280, 1408, 1408, 1280, 128] + - [74, 6974.0] + - - [128, 1856, 1, 1280, 128, 128, 1280, 1856] + - [74, 8763.0] + - - [64, 1408, 1, 3328, 64, 64, 3328, 1408] + - [76, 5153.0] + - - [256, 448, 1, 3328, 256, 256, 3328, 448] + - [65, 6200.0] + - - [64, 2368, 1, 1280, 64, 64, 1280, 2368] + - [58, 7148.0] + - - [2368, 64, 1, 256, 2368, 2368, 256, 64] + - [61, 4459.0] + - - [4288, 64, 1, 1280, 4288, 4288, 1280, 64] + - [74, 8285.0] + - - [128, 1024, 1, 1280, 128, 128, 1280, 1024] + - [85, 6284.0] + - - [1856, 64, 1, 256, 1856, 1856, 256, 64] + - [61, 3603.0] + - - [704, 128, 1, 256, 704, 704, 256, 128] + - [84, 2876.0] + - - [448, 256, 1, 1280, 448, 448, 1280, 256] + - [74, 5798.0] + - - [256, 1024, 1, 256, 256, 256, 256, 1024] + - [65, 5846.0] + - - [1856, 128, 1, 1280, 1856, 1856, 1280, 128] + - [61, 9013.0] + - - [64, 3584, 1, 256, 64, 64, 256, 3584] + - [61, 5780.0] + - - [64, 1856, 1, 256, 64, 64, 256, 1856] + - [58, 3503.0] + - - [256, 1024, 1, 1280, 256, 256, 1280, 1024] + - [65, 8054.0] + - - [3584, 64, 1, 1280, 3584, 3584, 1280, 64] + - [85, 8676.0] + - - [1408, 128, 1, 3328, 1408, 1408, 3328, 128] + - [74, 7430.0] + - - [64, 4288, 1, 3328, 64, 64, 3328, 4288] + - [68, 7616.0] + - - [256, 704, 1, 256, 256, 256, 256, 704] + - [76, 5454.0] + - - [128, 1024, 1, 256, 128, 128, 256, 1024] + - [74, 3976.0] + - - [64, 2944, 1, 256, 64, 64, 256, 2944] + - [61, 4550.0] + - - [64, 1408, 1, 1280, 64, 64, 1280, 1408] + - [81, 4339.0] + - - [704, 128, 1, 3328, 704, 704, 3328, 128] + - [71, 4910.0] + - - [1408, 128, 1, 256, 1408, 1408, 256, 128] + - [84, 4776.0] + - - [64, 2944, 1, 1280, 64, 64, 1280, 2944] + - [79, 6681.0] + - - [704, 256, 1, 1280, 704, 704, 1280, 256] + - [65, 7081.0] + - - [256, 448, 1, 1280, 256, 256, 1280, 448] + - [71, 5793.0] + - - [64, 2368, 1, 3328, 64, 64, 3328, 2368] + - [58, 7896.0] + - - [256, 704, 1, 3328, 256, 256, 3328, 704] + - [65, 7676.0] + - - [64, 2944, 1, 3328, 64, 64, 3328, 2944] + - [53, 7349.0] + - - [128, 1408, 1, 256, 128, 128, 256, 1408] + - [79, 4747.0] + - - [1408, 64, 1, 256, 1408, 1408, 256, 64] + - [56, 2862.0] + - - [64, 2368, 1, 256, 64, 64, 256, 2368] + - [58, 4379.0] + - - [1024, 128, 1, 3328, 1024, 1024, 3328, 128] + - [61, 6984.0] + - - [2368, 64, 1, 1280, 2368, 2368, 1280, 64] + - [85, 7410.0] + - - [4288, 64, 1, 256, 4288, 4288, 256, 64] + - [85, 5944.0] + - - [64, 4288, 1, 1280, 64, 64, 1280, 4288] + - [58, 7234.0] + - - [1408, 64, 1, 3328, 1408, 1408, 3328, 64] + - [76, 5095.0] + - - [448, 448, 1, 1280, 448, 448, 1280, 448] + - [79, 7837.0] + - - [1024, 256, 1, 1280, 1024, 1024, 1280, 256] + - [61, 8490.0] + - - [3584, 64, 1, 3328, 3584, 3584, 3328, 64] + - [74, 9415.0] + - - [256, 1024, 1, 3328, 256, 256, 3328, 1024] + - [53, 8291.0] + - - [1856, 64, 1, 3328, 1856, 1856, 3328, 64] + - [85, 6349.0] + - - [448, 256, 1, 256, 448, 448, 256, 256] + - [84, 3625.0] + - - [128, 704, 1, 256, 128, 128, 256, 704] + - [84, 2779.0] + - - [1024, 128, 1, 256, 1024, 1024, 256, 128] + - [61, 4082.0] + - - [64, 3584, 1, 1280, 64, 64, 1280, 3584] + - [65, 8461.0] + - - [3584, 64, 1, 256, 3584, 3584, 256, 64] + - [85, 5849.0] + - - [64, 1856, 1, 3328, 64, 64, 3328, 1856] + - [64, 6468.0] + - - [2944, 64, 1, 3328, 2944, 2944, 3328, 64] + - [61, 7819.0] + - - [128, 1408, 1, 3328, 128, 128, 3328, 1408] + - [79, 7631.0] + - - [128, 704, 1, 3328, 128, 128, 3328, 704] + - [79, 4868.0] + - - [128, 1856, 1, 256, 128, 128, 256, 1856] + - [85, 5905.0] + - - [64, 4288, 1, 256, 64, 64, 256, 4288] + - [76, 6217.0] + - - [704, 256, 1, 3328, 704, 704, 3328, 256] + - [79, 7674.0] + - - [256, 704, 1, 1280, 256, 256, 1280, 704] + - [79, 7085.0] + - - [64, 1408, 1, 256, 64, 64, 256, 1408] + - [52, 2786.0] + - - [128, 1408, 1, 1280, 128, 128, 1280, 1408] + - [79, 7103.0] + - - [128, 1856, 1, 3328, 128, 128, 3328, 1856] + - [74, 9574.0] + - - [704, 256, 1, 256, 704, 704, 256, 256] + - [61, 4766.0] + - - [704, 64, 1, 3328, 704, 704, 3328, 64] + - [54, 3794.0] + - - [448, 64, 1, 1280, 448, 448, 1280, 64] + - [62, 2421.0] + - - [64, 1024, 1, 1280, 64, 64, 1280, 1024] + - [58, 4140.0] + - - [64, 704, 1, 1280, 64, 64, 1280, 704] + - [66, 3145.0] + - - [128, 448, 1, 256, 128, 128, 256, 448] + - [73, 2198.0] + - - [256, 256, 1, 3328, 256, 256, 3328, 256] + - [58, 4823.0] + - - [64, 448, 1, 1280, 64, 64, 1280, 448] + - [75, 2457.0] + - - [64, 64, 1, 3328, 64, 64, 3328, 64] + - [69, 493.0] + - - [256, 64, 1, 1280, 256, 256, 1280, 64] + - [62, 1475.0] + - - [128, 448, 1, 1280, 128, 128, 1280, 448] + - [70, 3645.0] + - - [704, 64, 1, 1280, 704, 704, 1280, 64] + - [80, 3055.0] + - - [448, 64, 1, 3328, 448, 448, 3328, 64] + - [62, 2876.0] + - - [64, 128, 1, 3328, 64, 64, 3328, 128] + - [59, 982.0] + - - [128, 128, 1, 3328, 128, 128, 3328, 128] + - [83, 1809.0] + - - [256, 128, 1, 256, 256, 256, 256, 128] + - [51, 1520.0] + - - [64, 448, 1, 3328, 64, 64, 3328, 448] + - [54, 2742.0] + - - [256, 64, 1, 256, 256, 256, 256, 64] + - [62, 668.0] + - - [256, 128, 1, 1280, 256, 256, 1280, 128] + - [86, 2580.0] + - - [128, 64, 1, 1280, 128, 128, 1280, 64] + - [87, 761.0] + - - [64, 1024, 1, 256, 64, 64, 256, 1024] + - [70, 2202.0] + - - [64, 704, 1, 256, 64, 64, 256, 704] + - [54, 1677.0] + - - [448, 128, 1, 256, 448, 448, 256, 128] + - [57, 2079.0] + - - [256, 256, 1, 256, 256, 256, 256, 256] + - [60, 2190.0] + - - [448, 128, 1, 3328, 448, 448, 3328, 128] + - [58, 4211.0] + - - [128, 256, 1, 1280, 128, 128, 1280, 256] + - [62, 2731.0] + - - [64, 256, 1, 1280, 64, 64, 1280, 256] + - [75, 1483.0] + - - [64, 448, 1, 256, 64, 64, 256, 448] + - [63, 1180.0] + - - [64, 64, 1, 256, 64, 64, 256, 64] + - [75, 272.0] + - - [128, 256, 1, 3328, 128, 128, 3328, 256] + - [54, 3157.0] + - - [64, 128, 1, 1280, 64, 64, 1280, 128] + - [69, 762.0] + - - [128, 128, 1, 1280, 128, 128, 1280, 128] + - [75, 1462.0] + - - [128, 256, 1, 256, 128, 128, 256, 256] + - [54, 1295.0] + - - [64, 128, 1, 256, 64, 64, 256, 128] + - [75, 343.0] + - - [704, 64, 1, 256, 704, 704, 256, 64] + - [54, 1625.0] + - - [64, 64, 1, 1280, 64, 64, 1280, 64] + - [87, 394.0] + - - [128, 64, 1, 3328, 128, 128, 3328, 64] + - [69, 974.0] + - - [448, 64, 1, 256, 448, 448, 256, 64] + - [66, 1739.0] + - - [1024, 64, 1, 256, 1024, 1024, 256, 64] + - [66, 3007.0] + - - [128, 64, 1, 256, 128, 128, 256, 64] + - [83, 490.0] + - - [1024, 64, 1, 1280, 1024, 1024, 1280, 64] + - [74, 4328.0] + - - [64, 1024, 1, 3328, 64, 64, 3328, 1024] + - [58, 4806.0] + - - [448, 128, 1, 1280, 448, 448, 1280, 128] + - [60, 3645.0] + - - [1024, 64, 1, 3328, 1024, 1024, 3328, 64] + - [64, 4733.0] + - - [64, 256, 1, 3328, 64, 64, 3328, 256] + - [67, 1760.0] + - - [256, 256, 1, 1280, 256, 256, 1280, 256] + - [82, 4068.0] + - - [256, 128, 1, 3328, 256, 256, 3328, 128] + - [67, 3219.0] + - - [64, 256, 1, 256, 64, 64, 256, 256] + - [62, 670.0] + - - [64, 704, 1, 3328, 64, 64, 3328, 704] + - [80, 3572.0] + - - [128, 448, 1, 3328, 128, 128, 3328, 448] + - [81, 4222.0] + - - [256, 64, 1, 3328, 256, 256, 3328, 64] + - [78, 1802.0] + - - [128, 128, 1, 256, 128, 128, 256, 128] + - [67, 701.0] +- null +- null +- DeviceEfficiency +... diff --git a/library/src/blas3/Tensile/Logic/asm_full/navi22_Cijk_Alik_Bjlk_HBH.yaml b/library/src/blas3/Tensile/Logic/asm_full/navi22_Cijk_Alik_Bjlk_HBH.yaml new file mode 100644 index 000000000..a76a141e3 --- /dev/null +++ b/library/src/blas3/Tensile/Logic/asm_full/navi22_Cijk_Alik_Bjlk_HBH.yaml @@ -0,0 +1,10305 @@ +--- +- {MinimumRequiredVersion: 4.28.0} +- navi22 +- gfx1031 +- [Device 73df] +- AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] +- - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 64 + LSPA: 32 + LSPB: 8 + LVCA: 2 + LVCB: 8 + LVPA: 4 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 0 + SolutionNameMin: Cijk_Alik_Bjlk_HBH_MT64x64x16_SN_SU0_SUM0_TT8_8_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 64 + LSPA: 32 + LSPB: 8 + LVCA: 2 + LVCB: 8 + LVPA: 4 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 1 + SolutionNameMin: Cijk_Alik_Bjlk_HBH_MT64x64x16_SN_SU32_SUM3_TT8_8_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 64 + LSPA: 16 + LSPB: 8 + LVCA: 4 + LVCB: 8 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 2 + SolutionNameMin: Cijk_Alik_Bjlk_HBH_MT64x64x32_SN_SU32_SUM3_TT8_8_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 64 + LSPA: 32 + LSPB: 8 + LVCA: 2 + LVCB: 8 + LVPA: 4 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 3 + SolutionNameMin: Cijk_Alik_Bjlk_HBH_MT64x64x16_SN_SU0_SUM0_TT8_8_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 64 + LSPA: 16 + LSPB: 8 + LVCA: 4 + LVCB: 8 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 4 + SolutionNameMin: Cijk_Alik_Bjlk_HBH_MT64x64x32_SN_SU0_SUM0_TT8_8_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 64 + LSPA: 32 + LSPB: 8 + LVCA: 2 + LVCB: 8 + LVPA: 4 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 5 + SolutionNameMin: Cijk_Alik_Bjlk_HBH_MT64x64x16_SN_SU32_SUM3_TT8_8_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 128 + LSPB: 16 + LVCA: 2 + LVCB: 16 + LVPA: 16 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 6 + SolutionNameMin: Cijk_Alik_Bjlk_HBH_MT128x128x16_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 64 + LSPA: 32 + LSPB: 8 + LVCA: 2 + LVCB: 8 + LVPA: 4 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 7 + SolutionNameMin: Cijk_Alik_Bjlk_HBH_MT64x64x16_SN_SU0_SUM0_TT8_8_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 16 + LSPA: 16 + LSPB: 8 + LVCA: 4 + LVCB: 8 + LVPA: 8 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 819 + LdsNumElementsAlignedA: 128 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 256 + LdsOffsetB: 128 + LdsOffsetB_Blk: 384 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 8 + SolutionNameMin: Cijk_Alik_Bjlk_HBH_MT16x16x8_SN_SU0_SUM0_TT2_2_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 16 + LSPA: 32 + LSPB: 8 + LVCA: 4 + LVCB: 16 + LVPA: 16 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 896 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 9 + SolutionNameMin: Cijk_Alik_Bjlk_HBH_MT32x16x8_SN_SU0_SUM0_TT2_2_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 32 + LSPA: 32 + LSPB: 8 + LVCA: 8 + LVCB: 32 + LVPA: 32 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 10 + SolutionNameMin: Cijk_Alik_Bjlk_HBH_MT32x32x8_SN_SU0_SUM0_TT2_2_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 64 + LSPA: 64 + LSPB: 8 + LVCA: 4 + LVCB: 32 + LVPA: 32 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 11 + SolutionNameMin: Cijk_Alik_Bjlk_HBH_MT64x64x8_SN_SU0_SUM0_TT4_4_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 12 + SolutionNameMin: Cijk_Alik_Bjlk_HBH_MT16x16x16_SN_SU0_SUM0_TT2_2_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 32 + LSPA: 8 + LSPB: 4 + LVCA: 8 + LVCB: 16 + LVPA: 4 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 13 + SolutionNameMin: Cijk_Alik_Bjlk_HBH_MT32x32x16_SN_SU0_SUM0_TT4_4_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 14 + SolutionNameMin: Cijk_Alik_Bjlk_HBH_MT32x16x16_SN_SU0_SUM0_TT2_2_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 4 + LSPB: 8 + LVCA: 16 + LVCB: 8 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 15 + SolutionNameMin: Cijk_Alik_Bjlk_HBH_MT16x16x32_SN_SU0_SUM0_TT2_2_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 16 + SolutionNameMin: Cijk_Alik_Bjlk_HBH_MT32x32x32_SN_SU0_SUM0_TT2_2_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 16 + LSPA: 32 + LSPB: 8 + LVCA: 4 + LVCB: 16 + LVPA: 16 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 896 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 17 + SolutionNameMin: Cijk_Alik_Bjlk_HBH_MT32x16x8_SN_SU32_SUM3_TT2_2_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 64 + LSPA: 64 + LSPB: 8 + LVCA: 4 + LVCB: 32 + LVPA: 32 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 18 + SolutionNameMin: Cijk_Alik_Bjlk_HBH_MT64x64x8_SN_SU32_SUM3_TT4_4_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 19 + SolutionNameMin: Cijk_Alik_Bjlk_HBH_MT16x16x16_SN_SU32_SUM3_TT2_2_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 32 + LSPA: 8 + LSPB: 4 + LVCA: 8 + LVCB: 16 + LVPA: 4 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 20 + SolutionNameMin: Cijk_Alik_Bjlk_HBH_MT32x32x16_SN_SU32_SUM3_TT4_4_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 4 + LSPB: 8 + LVCA: 16 + LVCB: 8 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 21 + SolutionNameMin: Cijk_Alik_Bjlk_HBH_MT16x16x32_SN_SU32_SUM3_TT2_2_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 32 + LSPA: 32 + LSPB: 8 + LVCA: 8 + LVCB: 32 + LVPA: 32 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 22 + SolutionNameMin: Cijk_Alik_Bjlk_HBH_MT32x32x8_SN_SU0_SUM0_TT2_2_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 64 + LSPA: 64 + LSPB: 8 + LVCA: 4 + LVCB: 32 + LVPA: 32 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 23 + SolutionNameMin: Cijk_Alik_Bjlk_HBH_MT64x64x8_SN_SU0_SUM0_TT4_4_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 24 + SolutionNameMin: Cijk_Alik_Bjlk_HBH_MT16x16x16_SN_SU0_SUM0_TT2_2_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 32 + LSPA: 8 + LSPB: 4 + LVCA: 8 + LVCB: 16 + LVPA: 4 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 25 + SolutionNameMin: Cijk_Alik_Bjlk_HBH_MT32x32x16_SN_SU0_SUM0_TT4_4_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 4 + LSPB: 8 + LVCA: 16 + LVCB: 8 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 26 + SolutionNameMin: Cijk_Alik_Bjlk_HBH_MT16x16x32_SN_SU0_SUM0_TT2_2_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 64 + LSPA: 64 + LSPB: 8 + LVCA: 4 + LVCB: 32 + LVPA: 32 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 27 + SolutionNameMin: Cijk_Alik_Bjlk_HBH_MT64x64x8_SN_SU32_SUM3_TT4_4_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 28 + SolutionNameMin: Cijk_Alik_Bjlk_HBH_MT16x16x16_SN_SU32_SUM3_TT2_2_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 32 + LSPA: 8 + LSPB: 4 + LVCA: 8 + LVCB: 16 + LVPA: 4 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 29 + SolutionNameMin: Cijk_Alik_Bjlk_HBH_MT32x32x16_SN_SU32_SUM3_TT4_4_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 32 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 16 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 30 + SolutionNameMin: Cijk_Alik_Bjlk_HBH_MT32x32x16_SN_SU32_SUM3_TT2_2_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 4 + LSPB: 8 + LVCA: 16 + LVCB: 8 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 31 + SolutionNameMin: Cijk_Alik_Bjlk_HBH_MT16x16x32_SN_SU32_SUM3_TT2_2_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 64 + LSPA: 64 + LSPB: 8 + LVCA: 4 + LVCB: 32 + LVPA: 32 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 32 + SolutionNameMin: Cijk_Alik_Bjlk_HBH_MT64x64x8_SN_SU0_SUM0_TT4_4_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 33 + SolutionNameMin: Cijk_Alik_Bjlk_HBH_MT16x16x16_SN_SU0_SUM0_TT2_2_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 32 + LSPA: 8 + LSPB: 4 + LVCA: 8 + LVCB: 16 + LVPA: 4 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 34 + SolutionNameMin: Cijk_Alik_Bjlk_HBH_MT32x32x16_SN_SU0_SUM0_TT4_4_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 32 + LSPA: 16 + LSPB: 4 + LVCA: 4 + LVCB: 16 + LVPA: 8 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 35 + SolutionNameMin: Cijk_Alik_Bjlk_HBH_MT32x32x8_SN_SU32_SUM3_TT4_4_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 36 + SolutionNameMin: Cijk_Alik_Bjlk_HBH_MT16x16x16_SN_SU32_SUM3_TT2_2_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 4 + LSPB: 8 + LVCA: 16 + LVCB: 8 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 37 + SolutionNameMin: Cijk_Alik_Bjlk_HBH_MT16x16x32_SN_SU32_SUM3_TT2_2_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 32 + LSPA: 8 + LSPB: 4 + LVCA: 8 + LVCB: 16 + LVPA: 4 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 256 + LdsOffsetB_Blk: 1280 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 32 + MacroTileA: 16 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 38 + SolutionNameMin: Cijk_Alik_Bjlk_HBH_MT16x32x16_SN_SU0_SUM0_TT2_4_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 32 + MacroTileA: 16 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 8 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 39 + SolutionNameMin: Cijk_Alik_Bjlk_HBH_MT16x32x32_SN_SU0_SUM0_TT2_4_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 64 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 + LVPA: 8 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 40 + SolutionNameMin: Cijk_Alik_Bjlk_HBH_MT32x64x32_SN_SU0_SUM0_TT2_4_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 32 + LSPA: 8 + LSPB: 4 + LVCA: 8 + LVCB: 16 + LVPA: 4 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 256 + LdsOffsetB_Blk: 1280 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 32 + MacroTileA: 16 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 41 + SolutionNameMin: Cijk_Alik_Bjlk_HBH_MT16x32x16_SN_SU0_SUM0_TT2_4_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 32 + MacroTileA: 16 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 8 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 42 + SolutionNameMin: Cijk_Alik_Bjlk_HBH_MT16x32x32_SN_SU0_SUM0_TT2_4_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 32 + LSPA: 8 + LSPB: 4 + LVCA: 8 + LVCB: 16 + LVPA: 4 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 256 + LdsOffsetB_Blk: 1280 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 32 + MacroTileA: 16 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 43 + SolutionNameMin: Cijk_Alik_Bjlk_HBH_MT16x32x16_SN_SU32_SUM3_TT2_4_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 32 + MacroTileA: 16 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 8 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 44 + SolutionNameMin: Cijk_Alik_Bjlk_HBH_MT16x32x32_SN_SU32_SUM3_TT2_4_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 +- [2, 3, 0, 1] +- - - [64, 5056, 1, 256, 64, 64, 256, 5056] + - [3, 9500.0] + - - [64, 6784, 1, 3328, 64, 64, 3328, 6784] + - [0, 12571.0] + - - [64, 5056, 1, 3328, 64, 64, 3328, 5056] + - [0, 13448.0] + - - [64, 6784, 1, 1280, 64, 64, 1280, 6784] + - [0, 11564.0] + - - [64, 6784, 1, 256, 64, 64, 256, 6784] + - [5, 8485.0] + - - [64, 5056, 1, 1280, 64, 64, 1280, 5056] + - [1, 11861.0] + - - [64, 5888, 1, 3328, 64, 64, 3328, 5888] + - [4, 11990.0] + - - [64, 5888, 1, 1280, 64, 64, 1280, 5888] + - [2, 11094.0] + - - [64, 5888, 1, 256, 64, 64, 256, 5888] + - [7, 7608.0] + - - [1024, 1024, 1, 1024, 1024, 1024, 1024, 1024] + - [6, 14506.0] + - - [64, 1408, 1, 3328, 64, 64, 3328, 1408] + - [13, 4978.0] + - - [64, 1856, 1, 256, 64, 64, 256, 1856] + - [35, 3479.0] + - - [64, 2368, 1, 3328, 64, 64, 3328, 2368] + - [32, 7062.0] + - - [64, 3584, 1, 1280, 64, 64, 1280, 3584] + - [27, 7217.0] + - - [64, 2944, 1, 256, 64, 64, 256, 2944] + - [18, 4458.0] + - - [64, 1408, 1, 1280, 64, 64, 1280, 1408] + - [34, 4333.0] + - - [64, 2368, 1, 1280, 64, 64, 1280, 2368] + - [11, 6458.0] + - - [64, 3584, 1, 3328, 64, 64, 3328, 3584] + - [27, 7670.0] + - - [64, 1856, 1, 3328, 64, 64, 3328, 1856] + - [29, 5718.0] + - - [64, 1856, 1, 1280, 64, 64, 1280, 1856] + - [20, 5606.0] + - - [64, 4288, 1, 256, 64, 64, 256, 4288] + - [23, 5128.0] + - - [64, 2944, 1, 3328, 64, 64, 3328, 2944] + - [11, 6379.0] + - - [64, 4288, 1, 1280, 64, 64, 1280, 4288] + - [23, 6704.0] + - - [64, 2944, 1, 1280, 64, 64, 1280, 2944] + - [11, 5984.0] + - - [64, 1408, 1, 256, 64, 64, 256, 1408] + - [34, 2891.0] + - - [64, 2368, 1, 256, 64, 64, 256, 2368] + - [32, 4330.0] + - - [64, 3584, 1, 256, 64, 64, 256, 3584] + - [23, 5271.0] + - - [64, 4288, 1, 3328, 64, 64, 3328, 4288] + - [23, 7105.0] + - - [1, 2944, 1, 3328, 1, 1, 3328, 2944] + - [41, 281.0] + - - [1, 2368, 1, 1280, 1, 1, 1280, 2368] + - [42, 222.0] + - - [1, 1408, 1, 1280, 1, 1, 1280, 1408] + - [39, 129.0] + - - [1, 2368, 1, 3328, 1, 1, 3328, 2368] + - [42, 229.0] + - - [1, 3584, 1, 1280, 1, 1, 1280, 3584] + - [41, 250.0] + - - [1, 2944, 1, 1, 1, 1, 1, 2944] + - [8, 1.0] + - - [1, 1408, 1, 3328, 1, 1, 3328, 1408] + - [39, 166.0] + - - [1, 2944, 1, 256, 1, 1, 256, 2944] + - [41, 111.0] + - - [1, 5056, 1, 256, 1, 1, 256, 5056] + - [41, 166.0] + - - [1, 1856, 1, 256, 1, 1, 256, 1856] + - [28, 76.0] + - - [1, 1856, 1, 1280, 1, 1, 1280, 1856] + - [44, 167.0] + - - [1, 4288, 1, 1280, 1, 1, 1280, 4288] + - [38, 251.0] + - - [1, 1408, 1, 1, 1, 1, 1, 1408] + - [9, 0.35] + - - [1, 1408, 1, 256, 1, 1, 256, 1408] + - [28, 60.0] + - - [1, 2368, 1, 256, 1, 1, 256, 2368] + - [33, 93.0] + - - [1, 4288, 1, 1, 1, 1, 1, 4288] + - [8, 1.0] + - - [1, 1856, 1, 3328, 1, 1, 3328, 1856] + - [42, 202.0] + - - [1, 4288, 1, 3328, 1, 1, 3328, 4288] + - [43, 300.0] + - - [1, 2368, 1, 1, 1, 1, 1, 2368] + - [8, 1.0] + - - [1, 3584, 1, 3328, 1, 1, 3328, 3584] + - [41, 301.0] + - - [1, 5056, 1, 1, 1, 1, 1, 5056] + - [8, 1.0] + - - [1, 3584, 1, 256, 1, 1, 256, 3584] + - [21, 140.0] + - - [1, 5056, 1, 1280, 1, 1, 1280, 5056] + - [38, 280.0] + - - [1, 3584, 1, 1, 1, 1, 1, 3584] + - [8, 1.0] + - - [1, 2944, 1, 1280, 1, 1, 1280, 2944] + - [42, 232.0] + - - [1, 1856, 1, 1, 1, 1, 1, 1856] + - [40, 1.0] + - - [1, 4288, 1, 256, 1, 1, 256, 4288] + - [12, 178.0] + - - [64, 448, 1, 3328, 64, 64, 3328, 448] + - [33, 3126.0] + - - [1, 64, 1, 1280, 1, 1, 1280, 64] + - [15, 7.0] + - - [64, 128, 1, 256, 64, 64, 256, 128] + - [22, 405.0] + - - [128, 64, 1, 1280, 128, 128, 1280, 64] + - [21, 971.0] + - - [1, 128, 1, 1, 1, 1, 1, 128] + - [8, 0.03] + - - [64, 1024, 1, 3328, 64, 64, 3328, 1024] + - [25, 4497.0] + - - [1, 64, 1, 3328, 1, 1, 3328, 64] + - [15, 9.0] + - - [64, 448, 1, 1280, 64, 64, 1280, 448] + - [36, 2603.0] + - - [1, 256, 1, 1280, 1, 1, 1280, 256] + - [21, 28.0] + - - [64, 64, 1, 1280, 64, 64, 1280, 64] + - [31, 489.0] + - - [1, 1, 1, 3328, 1, 1, 3328, 1] + - [9, 0.11] + - - [1, 64, 1, 1, 1, 1, 1, 64] + - [9, 0.02] + - - [1, 128, 1, 1280, 1, 1, 1280, 128] + - [15, 14.0] + - - [64, 1024, 1, 1280, 64, 64, 1280, 1024] + - [29, 3946.0] + - - [64, 256, 1, 1280, 64, 64, 1280, 256] + - [15, 1730.0] + - - [1, 1024, 1, 256, 1, 1, 256, 1024] + - [19, 45.0] + - - [1, 704, 1, 3328, 1, 1, 3328, 704] + - [26, 96.0] + - - [1, 256, 1, 1, 1, 1, 1, 256] + - [17, 0.07] + - - [128, 64, 1, 256, 128, 128, 256, 64] + - [28, 369.0] + - - [64, 128, 1, 3328, 64, 64, 3328, 128] + - [31, 1192.0] + - - [64, 256, 1, 3328, 64, 64, 3328, 256] + - [16, 1912.0] + - - [1, 1, 1, 256, 1, 1, 256, 1] + - [9, 0.04] + - - [1, 704, 1, 1, 1, 1, 1, 704] + - [10, 0.18] + - - [1, 704, 1, 1280, 1, 1, 1280, 704] + - [37, 74.0] + - - [64, 448, 1, 256, 64, 64, 256, 448] + - [14, 1215.0] + - - [1, 448, 1, 1280, 1, 1, 1280, 448] + - [31, 50.0] + - - [64, 128, 1, 1280, 64, 64, 1280, 128] + - [31, 975.0] + - - [1, 1, 1, 1, 1, 1, 1, 1] + - [22, 0.0002659574473513254] + - - [256, 64, 1, 256, 256, 256, 256, 64] + - [19, 1271.0] + - - [1, 448, 1, 3328, 1, 1, 3328, 448] + - [21, 63.0] + - - [1, 128, 1, 256, 1, 1, 256, 128] + - [19, 6.0] + - - [1, 1024, 1, 3328, 1, 1, 3328, 1024] + - [15, 121.0] + - - [64, 1024, 1, 256, 64, 64, 256, 1024] + - [25, 2237.0] + - - [64, 256, 1, 256, 64, 64, 256, 256] + - [30, 744.0] + - - [1, 1024, 1, 1280, 1, 1, 1280, 1024] + - [15, 96.0] + - - [1, 1, 1, 1280, 1, 1, 1280, 1] + - [9, 0.09] + - - [64, 704, 1, 256, 64, 64, 256, 704] + - [24, 1758.0] + - - [64, 64, 1, 256, 64, 64, 256, 64] + - [30, 316.0] + - - [128, 64, 1, 3328, 128, 128, 3328, 64] + - [37, 1363.0] + - - [1, 448, 1, 256, 1, 1, 256, 448] + - [28, 35.0] + - - [1, 1024, 1, 1, 1, 1, 1, 1024] + - [12, 1.0] + - - [64, 704, 1, 3328, 64, 64, 3328, 704] + - [12, 4188.0] + - - [1, 256, 1, 256, 1, 1, 256, 256] + - [19, 19.0] + - - [1, 704, 1, 256, 1, 1, 256, 704] + - [28, 54.0] + - - [1, 128, 1, 3328, 1, 1, 3328, 128] + - [26, 20.0] + - - [64, 64, 1, 3328, 64, 64, 3328, 64] + - [21, 681.0] + - - [1, 448, 1, 1, 1, 1, 1, 448] + - [9, 0.27] + - - [1, 64, 1, 256, 1, 1, 256, 64] + - [19, 5.0] + - - [64, 704, 1, 1280, 64, 64, 1280, 704] + - [29, 3255.0] + - - [1, 256, 1, 3328, 1, 1, 3328, 256] + - [21, 36.0] +- null +- null +- DeviceEfficiency +... diff --git a/library/src/blas3/Tensile/Logic/asm_full/navi22_Cijk_Alik_Bjlk_HBH_GB.yaml b/library/src/blas3/Tensile/Logic/asm_full/navi22_Cijk_Alik_Bjlk_HBH_GB.yaml new file mode 100644 index 000000000..8a5339cb4 --- /dev/null +++ b/library/src/blas3/Tensile/Logic/asm_full/navi22_Cijk_Alik_Bjlk_HBH_GB.yaml @@ -0,0 +1,12089 @@ +--- +- {MinimumRequiredVersion: 4.28.0} +- navi22 +- gfx1031 +- [Device 73df] +- AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] +- - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 64 + LSPA: 32 + LSPB: 8 + LVCA: 2 + LVCB: 8 + LVPA: 4 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 0 + SolutionNameMin: Cijk_Alik_Bjlk_HBH_GB_MT64x64x16_SN_SU0_SUM0_TT8_8_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 64 + LSPA: 32 + LSPB: 8 + LVCA: 2 + LVCB: 8 + LVPA: 4 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 1 + SolutionNameMin: Cijk_Alik_Bjlk_HBH_GB_MT64x64x16_SN_SU32_SUM3_TT8_8_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 128 + LSPB: 16 + LVCA: 2 + LVCB: 16 + LVPA: 16 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 2 + SolutionNameMin: Cijk_Alik_Bjlk_HBH_GB_MT128x128x16_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 64 + LSPA: 16 + LSPB: 8 + LVCA: 4 + LVCB: 8 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 3 + SolutionNameMin: Cijk_Alik_Bjlk_HBH_GB_MT64x64x32_SN_SU32_SUM3_TT8_8_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 64 + LSPA: 32 + LSPB: 8 + LVCA: 2 + LVCB: 8 + LVPA: 4 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 4 + SolutionNameMin: Cijk_Alik_Bjlk_HBH_GB_MT64x64x16_SN_SU0_SUM0_TT8_8_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 64 + LSPA: 32 + LSPB: 8 + LVCA: 2 + LVCB: 8 + LVPA: 4 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 5 + SolutionNameMin: Cijk_Alik_Bjlk_HBH_GB_MT64x64x16_SN_SU0_SUM0_TT8_8_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 16 + LSPA: 16 + LSPB: 8 + LVCA: 4 + LVCB: 8 + LVPA: 8 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 819 + LdsNumElementsAlignedA: 128 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 256 + LdsOffsetB: 128 + LdsOffsetB_Blk: 384 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 6 + SolutionNameMin: Cijk_Alik_Bjlk_HBH_GB_MT16x16x8_SN_SU0_SUM0_TT2_2_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 16 + LSPA: 32 + LSPB: 8 + LVCA: 4 + LVCB: 16 + LVPA: 16 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 896 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 7 + SolutionNameMin: Cijk_Alik_Bjlk_HBH_GB_MT32x16x8_SN_SU0_SUM0_TT2_2_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 32 + LSPA: 32 + LSPB: 8 + LVCA: 8 + LVCB: 32 + LVPA: 32 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 8 + SolutionNameMin: Cijk_Alik_Bjlk_HBH_GB_MT32x32x8_SN_SU0_SUM0_TT2_2_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 64 + LSPA: 64 + LSPB: 8 + LVCA: 4 + LVCB: 32 + LVPA: 32 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 9 + SolutionNameMin: Cijk_Alik_Bjlk_HBH_GB_MT64x64x8_SN_SU0_SUM0_TT4_4_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 10 + SolutionNameMin: Cijk_Alik_Bjlk_HBH_GB_MT16x16x16_SN_SU0_SUM0_TT2_2_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 32 + LSPA: 8 + LSPB: 4 + LVCA: 8 + LVCB: 16 + LVPA: 4 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 11 + SolutionNameMin: Cijk_Alik_Bjlk_HBH_GB_MT32x32x16_SN_SU0_SUM0_TT4_4_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 4 + LSPB: 8 + LVCA: 16 + LVCB: 8 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 12 + SolutionNameMin: Cijk_Alik_Bjlk_HBH_GB_MT16x16x32_SN_SU0_SUM0_TT2_2_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 13 + SolutionNameMin: Cijk_Alik_Bjlk_HBH_GB_MT32x32x32_SN_SU0_SUM0_TT2_2_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 32 + LSPA: 16 + LSPB: 4 + LVCA: 4 + LVCB: 16 + LVPA: 8 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 14 + SolutionNameMin: Cijk_Alik_Bjlk_HBH_GB_MT32x32x8_SN_SU32_SUM3_TT4_4_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 16 + LSPA: 32 + LSPB: 8 + LVCA: 4 + LVCB: 16 + LVPA: 16 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 896 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 15 + SolutionNameMin: Cijk_Alik_Bjlk_HBH_GB_MT32x16x8_SN_SU32_SUM3_TT2_2_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 64 + LSPA: 64 + LSPB: 8 + LVCA: 4 + LVCB: 32 + LVPA: 32 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 16 + SolutionNameMin: Cijk_Alik_Bjlk_HBH_GB_MT64x64x8_SN_SU32_SUM3_TT4_4_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 17 + SolutionNameMin: Cijk_Alik_Bjlk_HBH_GB_MT16x16x16_SN_SU32_SUM3_TT2_2_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 32 + LSPA: 8 + LSPB: 4 + LVCA: 8 + LVCB: 16 + LVPA: 4 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 18 + SolutionNameMin: Cijk_Alik_Bjlk_HBH_GB_MT32x32x16_SN_SU32_SUM3_TT4_4_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 19 + SolutionNameMin: Cijk_Alik_Bjlk_HBH_GB_MT32x16x16_SN_SU32_SUM3_TT2_2_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 64 + LSPA: 32 + LSPB: 8 + LVCA: 8 + LVCB: 32 + LVPA: 16 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 20 + SolutionNameMin: Cijk_Alik_Bjlk_HBH_GB_MT64x64x16_SN_SU32_SUM3_TT4_4_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 4 + LSPB: 8 + LVCA: 16 + LVCB: 8 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 21 + SolutionNameMin: Cijk_Alik_Bjlk_HBH_GB_MT16x16x32_SN_SU32_SUM3_TT2_2_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 16 + LSPA: 32 + LSPB: 8 + LVCA: 4 + LVCB: 16 + LVPA: 16 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 896 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 22 + SolutionNameMin: Cijk_Alik_Bjlk_HBH_GB_MT32x16x8_SN_SU0_SUM0_TT2_2_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 64 + LSPA: 64 + LSPB: 8 + LVCA: 4 + LVCB: 32 + LVPA: 32 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 23 + SolutionNameMin: Cijk_Alik_Bjlk_HBH_GB_MT64x64x8_SN_SU0_SUM0_TT4_4_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 24 + SolutionNameMin: Cijk_Alik_Bjlk_HBH_GB_MT16x16x16_SN_SU0_SUM0_TT2_2_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 32 + LSPA: 8 + LSPB: 4 + LVCA: 8 + LVCB: 16 + LVPA: 4 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 25 + SolutionNameMin: Cijk_Alik_Bjlk_HBH_GB_MT32x32x16_SN_SU0_SUM0_TT4_4_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 4 + LSPB: 8 + LVCA: 16 + LVCB: 8 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 26 + SolutionNameMin: Cijk_Alik_Bjlk_HBH_GB_MT16x16x32_SN_SU0_SUM0_TT2_2_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 32 + LSPA: 32 + LSPB: 8 + LVCA: 8 + LVCB: 32 + LVPA: 32 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 27 + SolutionNameMin: Cijk_Alik_Bjlk_HBH_GB_MT32x32x8_SN_SU32_SUM3_TT2_2_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 28 + SolutionNameMin: Cijk_Alik_Bjlk_HBH_GB_MT16x16x16_SN_SU32_SUM3_TT2_2_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 4 + LSPB: 8 + LVCA: 16 + LVCB: 8 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 29 + SolutionNameMin: Cijk_Alik_Bjlk_HBH_GB_MT16x16x32_SN_SU32_SUM3_TT2_2_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 8 + LSPB: 16 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 30 + SolutionNameMin: Cijk_Alik_Bjlk_HBH_GB_MT32x16x32_SN_SU32_SUM3_TT2_2_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 16 + LSPA: 16 + LSPB: 8 + LVCA: 4 + LVCB: 8 + LVPA: 8 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 819 + LdsNumElementsAlignedA: 128 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 256 + LdsOffsetB: 128 + LdsOffsetB_Blk: 384 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 31 + SolutionNameMin: Cijk_Alik_Bjlk_HBH_GB_MT16x16x8_SN_SU0_SUM0_TT2_2_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 64 + LSPA: 64 + LSPB: 8 + LVCA: 4 + LVCB: 32 + LVPA: 32 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 32 + SolutionNameMin: Cijk_Alik_Bjlk_HBH_GB_MT64x64x8_SN_SU0_SUM0_TT4_4_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 33 + SolutionNameMin: Cijk_Alik_Bjlk_HBH_GB_MT32x16x16_SN_SU0_SUM0_TT2_2_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 4 + LSPB: 8 + LVCA: 16 + LVCB: 8 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 34 + SolutionNameMin: Cijk_Alik_Bjlk_HBH_GB_MT16x16x32_SN_SU0_SUM0_TT2_2_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 32 + LSPA: 16 + LSPB: 4 + LVCA: 4 + LVCB: 16 + LVPA: 8 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 35 + SolutionNameMin: Cijk_Alik_Bjlk_HBH_GB_MT32x32x8_SN_SU32_SUM3_TT4_4_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 16 + LSPA: 32 + LSPB: 8 + LVCA: 4 + LVCB: 16 + LVPA: 16 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 896 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 36 + SolutionNameMin: Cijk_Alik_Bjlk_HBH_GB_MT32x16x8_SN_SU32_SUM3_TT2_2_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 64 + LSPA: 64 + LSPB: 8 + LVCA: 4 + LVCB: 32 + LVPA: 32 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 37 + SolutionNameMin: Cijk_Alik_Bjlk_HBH_GB_MT64x64x8_SN_SU32_SUM3_TT4_4_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 38 + SolutionNameMin: Cijk_Alik_Bjlk_HBH_GB_MT16x16x16_SN_SU32_SUM3_TT2_2_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 32 + LSPA: 8 + LSPB: 4 + LVCA: 8 + LVCB: 16 + LVPA: 4 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 39 + SolutionNameMin: Cijk_Alik_Bjlk_HBH_GB_MT32x32x16_SN_SU32_SUM3_TT4_4_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 32 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 16 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 40 + SolutionNameMin: Cijk_Alik_Bjlk_HBH_GB_MT32x32x16_SN_SU32_SUM3_TT2_2_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 4 + LSPB: 8 + LVCA: 16 + LVCB: 8 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 41 + SolutionNameMin: Cijk_Alik_Bjlk_HBH_GB_MT16x16x32_SN_SU32_SUM3_TT2_2_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 42 + SolutionNameMin: Cijk_Alik_Bjlk_HBH_GB_MT32x32x32_SN_SU32_SUM3_TT4_4_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 32 + LSPA: 16 + LSPB: 4 + LVCA: 4 + LVCB: 16 + LVPA: 8 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 896 + LdsNumElementsAlignedA: 128 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 128 + LdsOffsetB_Blk: 640 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 32 + MacroTileA: 16 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 43 + SolutionNameMin: Cijk_Alik_Bjlk_HBH_GB_MT16x32x8_SN_SU0_SUM0_TT2_4_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 32 + LSPA: 8 + LSPB: 4 + LVCA: 8 + LVCB: 16 + LVPA: 4 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 256 + LdsOffsetB_Blk: 1280 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 32 + MacroTileA: 16 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 44 + SolutionNameMin: Cijk_Alik_Bjlk_HBH_GB_MT16x32x16_SN_SU0_SUM0_TT2_4_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 32 + MacroTileA: 16 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 8 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 45 + SolutionNameMin: Cijk_Alik_Bjlk_HBH_GB_MT16x32x32_SN_SU0_SUM0_TT2_4_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 32 + LSPA: 8 + LSPB: 4 + LVCA: 8 + LVCB: 16 + LVPA: 4 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 256 + LdsOffsetB_Blk: 1280 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 32 + MacroTileA: 16 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 46 + SolutionNameMin: Cijk_Alik_Bjlk_HBH_GB_MT16x32x16_SN_SU32_SUM3_TT2_4_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 32 + MacroTileA: 16 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 8 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 47 + SolutionNameMin: Cijk_Alik_Bjlk_HBH_GB_MT16x32x32_SN_SU32_SUM3_TT2_4_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 32 + LSPA: 8 + LSPB: 4 + LVCA: 8 + LVCB: 16 + LVPA: 4 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 256 + LdsOffsetB_Blk: 1280 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 32 + MacroTileA: 16 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 48 + SolutionNameMin: Cijk_Alik_Bjlk_HBH_GB_MT16x32x16_SN_SU0_SUM0_TT2_4_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 32 + LSPA: 8 + LSPB: 4 + LVCA: 8 + LVCB: 16 + LVPA: 4 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 256 + LdsOffsetB_Blk: 1280 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 32 + MacroTileA: 16 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 49 + SolutionNameMin: Cijk_Alik_Bjlk_HBH_GB_MT16x32x16_SN_SU32_SUM3_TT2_4_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 32 + MacroTileA: 16 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 8 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 50 + SolutionNameMin: Cijk_Alik_Bjlk_HBH_GB_MT16x32x32_SN_SU32_SUM3_TT2_4_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 32 + LSPA: 8 + LSPB: 4 + LVCA: 8 + LVCB: 16 + LVPA: 4 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 256 + LdsOffsetB_Blk: 1280 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 32 + MacroTileA: 16 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 51 + SolutionNameMin: Cijk_Alik_Bjlk_HBH_GB_MT16x32x16_SN_SU0_SUM0_TT2_4_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 32 + MacroTileA: 16 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 8 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 52 + SolutionNameMin: Cijk_Alik_Bjlk_HBH_GB_MT16x32x32_SN_SU0_SUM0_TT2_4_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 +- [2, 3, 0, 1] +- - - [64, 5056, 1, 256, 64, 64, 256, 5056] + - [5, 9392.0] + - - [64, 6784, 1, 3328, 64, 64, 3328, 6784] + - [5, 12549.0] + - - [64, 5056, 1, 3328, 64, 64, 3328, 5056] + - [4, 13401.0] + - - [64, 6784, 1, 1280, 64, 64, 1280, 6784] + - [0, 12087.0] + - - [64, 6784, 1, 256, 64, 64, 256, 6784] + - [5, 8472.0] + - - [64, 5056, 1, 1280, 64, 64, 1280, 5056] + - [0, 11943.0] + - - [64, 5888, 1, 3328, 64, 64, 3328, 5888] + - [0, 11262.0] + - - [64, 5888, 1, 1280, 64, 64, 1280, 5888] + - [3, 11140.0] + - - [64, 5888, 1, 256, 64, 64, 256, 5888] + - [1, 7490.0] + - - [1024, 1024, 1, 1024, 1024, 1024, 1024, 1024] + - [2, 14546.0] + - - [64, 1408, 1, 3328, 64, 64, 3328, 1408] + - [25, 4993.0] + - - [64, 1856, 1, 256, 64, 64, 256, 1856] + - [14, 4707.0] + - - [64, 2368, 1, 3328, 64, 64, 3328, 2368] + - [9, 7032.0] + - - [64, 3584, 1, 1280, 64, 64, 1280, 3584] + - [37, 7217.0] + - - [64, 2944, 1, 256, 64, 64, 256, 2944] + - [37, 4393.0] + - - [64, 1408, 1, 1280, 64, 64, 1280, 1408] + - [25, 4353.0] + - - [64, 2368, 1, 1280, 64, 64, 1280, 2368] + - [9, 6441.0] + - - [64, 3584, 1, 3328, 64, 64, 3328, 3584] + - [23, 7849.0] + - - [64, 1856, 1, 3328, 64, 64, 3328, 1856] + - [11, 5558.0] + - - [64, 1856, 1, 1280, 64, 64, 1280, 1856] + - [39, 5193.0] + - - [64, 4288, 1, 256, 64, 64, 256, 4288] + - [16, 5121.0] + - - [64, 2944, 1, 3328, 64, 64, 3328, 2944] + - [32, 6385.0] + - - [64, 4288, 1, 1280, 64, 64, 1280, 4288] + - [32, 6694.0] + - - [64, 2944, 1, 1280, 64, 64, 1280, 2944] + - [32, 5973.0] + - - [64, 1408, 1, 256, 64, 64, 256, 1408] + - [35, 2841.0] + - - [64, 2368, 1, 256, 64, 64, 256, 2368] + - [16, 4226.0] + - - [64, 3584, 1, 256, 64, 64, 256, 3584] + - [23, 5252.0] + - - [64, 4288, 1, 3328, 64, 64, 3328, 4288] + - [32, 7055.0] + - - [1, 2944, 1, 3328, 1, 1, 3328, 2944] + - [46, 274.0] + - - [1, 2368, 1, 1280, 1, 1, 1280, 2368] + - [45, 223.0] + - - [1, 1408, 1, 1280, 1, 1, 1280, 1408] + - [52, 129.0] + - - [1, 2368, 1, 3328, 1, 1, 3328, 2368] + - [46, 235.0] + - - [1, 3584, 1, 1280, 1, 1, 1280, 3584] + - [44, 250.0] + - - [1, 2944, 1, 1, 1, 1, 1, 2944] + - [6, 1.0] + - - [1, 1408, 1, 3328, 1, 1, 3328, 1408] + - [50, 164.0] + - - [1, 2944, 1, 256, 1, 1, 256, 2944] + - [10, 108.0] + - - [1, 5056, 1, 256, 1, 1, 256, 5056] + - [44, 170.0] + - - [1, 1856, 1, 256, 1, 1, 256, 1856] + - [52, 99.0] + - - [1, 1856, 1, 1280, 1, 1, 1280, 1856] + - [45, 161.0] + - - [1, 4288, 1, 1280, 1, 1, 1280, 4288] + - [48, 255.0] + - - [1, 1408, 1, 1, 1, 1, 1, 1408] + - [27, 1.0] + - - [1, 1408, 1, 256, 1, 1, 256, 1408] + - [17, 63.0] + - - [1, 2368, 1, 256, 1, 1, 256, 2368] + - [24, 94.0] + - - [1, 4288, 1, 1, 1, 1, 1, 4288] + - [6, 1.0] + - - [1, 1856, 1, 3328, 1, 1, 3328, 1856] + - [47, 200.0] + - - [1, 4288, 1, 3328, 1, 1, 3328, 4288] + - [51, 302.0] + - - [1, 2368, 1, 1, 1, 1, 1, 2368] + - [6, 1.0] + - - [1, 3584, 1, 3328, 1, 1, 3328, 3584] + - [49, 316.0] + - - [1, 5056, 1, 1, 1, 1, 1, 5056] + - [8, 3.0] + - - [1, 3584, 1, 256, 1, 1, 256, 3584] + - [44, 189.0] + - - [1, 5056, 1, 1280, 1, 1, 1280, 5056] + - [44, 313.0] + - - [1, 3584, 1, 1, 1, 1, 1, 3584] + - [43, 2.0] + - - [1, 2944, 1, 1280, 1, 1, 1280, 2944] + - [51, 250.0] + - - [1, 1856, 1, 1, 1, 1, 1, 1856] + - [43, 1.0] + - - [1, 4288, 1, 256, 1, 1, 256, 4288] + - [48, 211.0] + - - [64, 448, 1, 3328, 64, 64, 3328, 448] + - [10, 3155.0] + - - [1, 64, 1, 1280, 1, 1, 1280, 64] + - [29, 9.0] + - - [64, 128, 1, 256, 64, 64, 256, 128] + - [19, 659.0] + - - [128, 64, 1, 1280, 128, 128, 1280, 64] + - [21, 950.0] + - - [1, 128, 1, 1, 1, 1, 1, 128] + - [42, 0.05] + - - [64, 1024, 1, 3328, 64, 64, 3328, 1024] + - [18, 4627.0] + - - [1, 64, 1, 3328, 1, 1, 3328, 64] + - [29, 10.0] + - - [64, 448, 1, 1280, 64, 64, 1280, 448] + - [40, 2760.0] + - - [1, 256, 1, 1280, 1, 1, 1280, 256] + - [21, 28.0] + - - [64, 64, 1, 1280, 64, 64, 1280, 64] + - [41, 492.0] + - - [1, 1, 1, 3328, 1, 1, 3328, 1] + - [7, 0.12] + - - [1, 64, 1, 1, 1, 1, 1, 64] + - [31, 0.03] + - - [1, 128, 1, 1280, 1, 1, 1280, 128] + - [30, 15.0] + - - [64, 1024, 1, 1280, 64, 64, 1280, 1024] + - [20, 4080.0] + - - [64, 256, 1, 1280, 64, 64, 1280, 256] + - [28, 1689.0] + - - [1, 1024, 1, 256, 1, 1, 256, 1024] + - [6, 50.0] + - - [1, 704, 1, 3328, 1, 1, 3328, 704] + - [29, 97.0] + - - [1, 256, 1, 1, 1, 1, 1, 256] + - [22, 0.07] + - - [128, 64, 1, 256, 128, 128, 256, 64] + - [17, 369.0] + - - [64, 128, 1, 3328, 64, 64, 3328, 128] + - [29, 1208.0] + - - [64, 256, 1, 3328, 64, 64, 3328, 256] + - [13, 2097.0] + - - [1, 1, 1, 256, 1, 1, 256, 1] + - [7, 0.04] + - - [1, 704, 1, 1, 1, 1, 1, 704] + - [7, 0.18] + - - [1, 704, 1, 1280, 1, 1, 1280, 704] + - [21, 74.0] + - - [64, 448, 1, 256, 64, 64, 256, 448] + - [28, 1219.0] + - - [1, 448, 1, 1280, 1, 1, 1280, 448] + - [21, 49.0] + - - [64, 128, 1, 1280, 64, 64, 1280, 128] + - [29, 925.0] + - - [1, 1, 1, 1, 1, 1, 1, 1] + - [7, 0.00025125627902704875] + - - [256, 64, 1, 256, 256, 256, 256, 64] + - [33, 1178.0] + - - [1, 448, 1, 3328, 1, 1, 3328, 448] + - [41, 70.0] + - - [1, 128, 1, 256, 1, 1, 256, 128] + - [17, 6.0] + - - [1, 1024, 1, 3328, 1, 1, 3328, 1024] + - [26, 120.0] + - - [64, 1024, 1, 256, 64, 64, 256, 1024] + - [35, 2249.0] + - - [64, 256, 1, 256, 64, 64, 256, 256] + - [17, 731.0] + - - [1, 1024, 1, 1280, 1, 1, 1280, 1024] + - [34, 109.0] + - - [1, 1, 1, 1280, 1, 1, 1280, 1] + - [15, 0.11] + - - [64, 704, 1, 256, 64, 64, 256, 704] + - [15, 2586.0] + - - [64, 64, 1, 256, 64, 64, 256, 64] + - [17, 192.0] + - - [128, 64, 1, 3328, 128, 128, 3328, 64] + - [21, 1256.0] + - - [1, 448, 1, 256, 1, 1, 256, 448] + - [36, 27.0] + - - [1, 1024, 1, 1, 1, 1, 1, 1024] + - [8, 0.26] + - - [64, 704, 1, 3328, 64, 64, 3328, 704] + - [38, 3855.0] + - - [1, 256, 1, 256, 1, 1, 256, 256] + - [38, 12.0] + - - [1, 704, 1, 256, 1, 1, 256, 704] + - [17, 32.0] + - - [1, 128, 1, 3328, 1, 1, 3328, 128] + - [12, 18.0] + - - [64, 64, 1, 3328, 64, 64, 3328, 64] + - [29, 613.0] + - - [1, 448, 1, 1, 1, 1, 1, 448] + - [17, 0.24] + - - [1, 64, 1, 256, 1, 1, 256, 64] + - [17, 5.0] + - - [64, 704, 1, 1280, 64, 64, 1280, 704] + - [39, 3809.0] + - - [1, 256, 1, 3328, 1, 1, 3328, 256] + - [29, 40.0] +- null +- null +- DeviceEfficiency +... diff --git a/library/src/blas3/Tensile/Logic/asm_full/navi22_Cijk_Alik_Bjlk_HB_GB.yaml b/library/src/blas3/Tensile/Logic/asm_full/navi22_Cijk_Alik_Bjlk_HB_GB.yaml new file mode 100644 index 000000000..5343f84aa --- /dev/null +++ b/library/src/blas3/Tensile/Logic/asm_full/navi22_Cijk_Alik_Bjlk_HB_GB.yaml @@ -0,0 +1,19698 @@ +--- +- {MinimumRequiredVersion: 4.28.0} +- navi22 +- gfx1031 +- [Device 73df] +- AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] +- - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 256 + LSPA: 128 + LSPB: 8 + LVCA: 2 + LVCB: 32 + LVPA: 32 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 256 + MacroTileA: 128 + MacroTileB: 256 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 0 + SolutionNameMin: Cijk_Alik_Bjlk_HB_GB_MT128x256x8_SN_SU0_SUM0_TT8_16_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 64 + LSPA: 64 + LSPB: 16 + LVCA: 2 + LVCB: 8 + LVPA: 8 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 1 + SolutionNameMin: Cijk_Alik_Bjlk_HB_GB_MT128x64x16_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 64 + LSPB: 8 + LVCA: 2 + LVCB: 16 + LVPA: 8 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 2 + SolutionNameMin: Cijk_Alik_Bjlk_HB_GB_MT128x128x16_SN_SU0_SUM0_TT8_16_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 128 + LSPB: 16 + LVCA: 2 + LVCB: 16 + LVPA: 16 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 3 + SolutionNameMin: Cijk_Alik_Bjlk_HB_GB_MT128x128x16_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 256 + LSPA: 128 + LSPB: 8 + LVCA: 2 + LVCB: 32 + LVPA: 16 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 256 + MacroTileA: 128 + MacroTileB: 256 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 4 + SolutionNameMin: Cijk_Alik_Bjlk_HB_GB_MT128x256x16_SN_SU0_SUM0_TT8_16_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 256 + LSPA: 128 + LSPB: 8 + LVCA: 2 + LVCB: 32 + LVPA: 32 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 256 + MacroTileA: 128 + MacroTileB: 256 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 5 + SolutionNameMin: Cijk_Alik_Bjlk_HB_GB_MT128x256x8_SN_SU32_SUM3_TT8_16_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 64 + LSPB: 8 + LVCA: 2 + LVCB: 16 + LVPA: 8 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 6 + SolutionNameMin: Cijk_Alik_Bjlk_HB_GB_MT128x128x16_SN_SU32_SUM3_TT8_16_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 128 + LSPB: 16 + LVCA: 2 + LVCB: 16 + LVPA: 16 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 7 + SolutionNameMin: Cijk_Alik_Bjlk_HB_GB_MT128x128x16_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 256 + LSPA: 128 + LSPB: 8 + LVCA: 2 + LVCB: 32 + LVPA: 16 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 256 + MacroTileA: 128 + MacroTileB: 256 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 8 + SolutionNameMin: Cijk_Alik_Bjlk_HB_GB_MT128x256x16_SN_SU32_SUM3_TT8_16_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 256 + LSPA: 128 + LSPB: 8 + LVCA: 2 + LVCB: 32 + LVPA: 32 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 256 + MacroTileA: 128 + MacroTileB: 256 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 9 + SolutionNameMin: Cijk_Alik_Bjlk_HB_GB_MT128x256x8_SN_SU0_SUM0_TT8_16_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 64 + LSPA: 64 + LSPB: 16 + LVCA: 2 + LVCB: 8 + LVPA: 8 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 10 + SolutionNameMin: Cijk_Alik_Bjlk_HB_GB_MT128x64x16_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 64 + LSPB: 8 + LVCA: 2 + LVCB: 16 + LVPA: 8 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 11 + SolutionNameMin: Cijk_Alik_Bjlk_HB_GB_MT128x128x16_SN_SU0_SUM0_TT8_16_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 64 + LSPB: 8 + LVCA: 2 + LVCB: 16 + LVPA: 8 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 256 + MacroTile1: 128 + MacroTileA: 256 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 256 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 12 + SolutionNameMin: Cijk_Alik_Bjlk_HB_GB_MT256x128x16_SN_SU0_SUM0_TT16_16_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [16, 16] + ThreadTile0: 16 + ThreadTile1: 16 + ThreadTileA: 16 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 128 + LSPB: 16 + LVCA: 2 + LVCB: 16 + LVPA: 16 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 13 + SolutionNameMin: Cijk_Alik_Bjlk_HB_GB_MT128x128x16_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 256 + LSPA: 128 + LSPB: 8 + LVCA: 2 + LVCB: 32 + LVPA: 16 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 256 + MacroTileA: 128 + MacroTileB: 256 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 14 + SolutionNameMin: Cijk_Alik_Bjlk_HB_GB_MT128x256x16_SN_SU0_SUM0_TT8_16_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 256 + LSPA: 128 + LSPB: 8 + LVCA: 2 + LVCB: 32 + LVPA: 32 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 256 + MacroTileA: 128 + MacroTileB: 256 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 15 + SolutionNameMin: Cijk_Alik_Bjlk_HB_GB_MT128x256x8_SN_SU32_SUM3_TT8_16_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 64 + LSPA: 64 + LSPB: 16 + LVCA: 2 + LVCB: 8 + LVPA: 8 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 16 + SolutionNameMin: Cijk_Alik_Bjlk_HB_GB_MT128x64x16_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 64 + LSPB: 8 + LVCA: 2 + LVCB: 16 + LVPA: 8 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 17 + SolutionNameMin: Cijk_Alik_Bjlk_HB_GB_MT128x128x16_SN_SU32_SUM3_TT8_16_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 128 + LSPB: 16 + LVCA: 2 + LVCB: 16 + LVPA: 16 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 18 + SolutionNameMin: Cijk_Alik_Bjlk_HB_GB_MT128x128x16_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 256 + LSPA: 128 + LSPB: 8 + LVCA: 2 + LVCB: 32 + LVPA: 16 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 256 + MacroTileA: 128 + MacroTileB: 256 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 19 + SolutionNameMin: Cijk_Alik_Bjlk_HB_GB_MT128x256x16_SN_SU32_SUM3_TT8_16_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 128 + LSPA: 128 + LSPB: 8 + LVCA: 1 + LVCB: 16 + LVPA: 16 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 20 + SolutionNameMin: Cijk_Alik_Bjlk_HB_GB_MT128x128x8_SN_SU0_SUM0_TT8_16_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 256 + LSPA: 128 + LSPB: 8 + LVCA: 2 + LVCB: 32 + LVPA: 32 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 256 + MacroTileA: 128 + MacroTileB: 256 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 21 + SolutionNameMin: Cijk_Alik_Bjlk_HB_GB_MT128x256x8_SN_SU0_SUM0_TT8_16_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 64 + LSPA: 64 + LSPB: 16 + LVCA: 2 + LVCB: 8 + LVPA: 8 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 22 + SolutionNameMin: Cijk_Alik_Bjlk_HB_GB_MT128x64x16_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 64 + LSPB: 8 + LVCA: 2 + LVCB: 16 + LVPA: 8 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 23 + SolutionNameMin: Cijk_Alik_Bjlk_HB_GB_MT128x128x16_SN_SU0_SUM0_TT8_16_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 64 + LSPB: 8 + LVCA: 2 + LVCB: 16 + LVPA: 8 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 256 + MacroTile1: 128 + MacroTileA: 256 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 256 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 24 + SolutionNameMin: Cijk_Alik_Bjlk_HB_GB_MT256x128x16_SN_SU0_SUM0_TT16_16_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [16, 16] + ThreadTile0: 16 + ThreadTile1: 16 + ThreadTileA: 16 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 128 + LSPB: 16 + LVCA: 2 + LVCB: 16 + LVPA: 16 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 25 + SolutionNameMin: Cijk_Alik_Bjlk_HB_GB_MT128x128x16_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 256 + LSPA: 128 + LSPB: 8 + LVCA: 2 + LVCB: 32 + LVPA: 16 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 256 + MacroTileA: 128 + MacroTileB: 256 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 26 + SolutionNameMin: Cijk_Alik_Bjlk_HB_GB_MT128x256x16_SN_SU0_SUM0_TT8_16_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 256 + LSPA: 128 + LSPB: 8 + LVCA: 2 + LVCB: 32 + LVPA: 32 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 256 + MacroTileA: 128 + MacroTileB: 256 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 27 + SolutionNameMin: Cijk_Alik_Bjlk_HB_GB_MT128x256x8_SN_SU32_SUM3_TT8_16_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 64 + LSPB: 8 + LVCA: 2 + LVCB: 16 + LVPA: 8 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 28 + SolutionNameMin: Cijk_Alik_Bjlk_HB_GB_MT128x128x16_SN_SU32_SUM3_TT8_16_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 128 + LSPB: 16 + LVCA: 2 + LVCB: 16 + LVPA: 16 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 29 + SolutionNameMin: Cijk_Alik_Bjlk_HB_GB_MT128x128x16_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 256 + LSPA: 128 + LSPB: 8 + LVCA: 2 + LVCB: 32 + LVPA: 16 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 256 + MacroTileA: 128 + MacroTileB: 256 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 30 + SolutionNameMin: Cijk_Alik_Bjlk_HB_GB_MT128x256x16_SN_SU32_SUM3_TT8_16_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 64 + LSPA: 32 + LSPB: 8 + LVCA: 2 + LVCB: 8 + LVPA: 4 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 31 + SolutionNameMin: Cijk_Alik_Bjlk_HB_GB_MT64x64x16_SN_SU0_SUM0_TT8_8_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 64 + LSPA: 64 + LSPB: 16 + LVCA: 2 + LVCB: 8 + LVPA: 8 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 32 + SolutionNameMin: Cijk_Alik_Bjlk_HB_GB_MT128x64x16_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 128 + LSPB: 16 + LVCA: 2 + LVCB: 16 + LVPA: 16 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 33 + SolutionNameMin: Cijk_Alik_Bjlk_HB_GB_MT128x128x16_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 64 + LSPA: 32 + LSPB: 8 + LVCA: 2 + LVCB: 8 + LVPA: 4 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 34 + SolutionNameMin: Cijk_Alik_Bjlk_HB_GB_MT64x64x16_SN_SU32_SUM3_TT8_8_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 64 + LSPA: 64 + LSPB: 16 + LVCA: 2 + LVCB: 8 + LVPA: 8 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 35 + SolutionNameMin: Cijk_Alik_Bjlk_HB_GB_MT128x64x16_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 128 + LSPB: 16 + LVCA: 2 + LVCB: 16 + LVPA: 16 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 36 + SolutionNameMin: Cijk_Alik_Bjlk_HB_GB_MT128x128x16_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 64 + LSPA: 32 + LSPB: 8 + LVCA: 2 + LVCB: 8 + LVPA: 4 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 37 + SolutionNameMin: Cijk_Alik_Bjlk_HB_GB_MT64x64x16_SN_SU0_SUM0_TT8_8_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 64 + LSPA: 64 + LSPB: 16 + LVCA: 2 + LVCB: 8 + LVPA: 8 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 38 + SolutionNameMin: Cijk_Alik_Bjlk_HB_GB_MT128x64x16_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 128 + LSPB: 16 + LVCA: 2 + LVCB: 16 + LVPA: 16 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 39 + SolutionNameMin: Cijk_Alik_Bjlk_HB_GB_MT128x128x16_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 64 + LSPA: 64 + LSPB: 8 + LVCA: 1 + LVCB: 8 + LVPA: 8 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 40 + SolutionNameMin: Cijk_Alik_Bjlk_HB_GB_MT64x64x8_SN_SU32_SUM3_TT8_8_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 64 + LSPA: 32 + LSPB: 8 + LVCA: 2 + LVCB: 8 + LVPA: 4 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 41 + SolutionNameMin: Cijk_Alik_Bjlk_HB_GB_MT64x64x16_SN_SU32_SUM3_TT8_8_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 64 + LSPA: 64 + LSPB: 16 + LVCA: 2 + LVCB: 8 + LVPA: 8 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 42 + SolutionNameMin: Cijk_Alik_Bjlk_HB_GB_MT128x64x16_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 128 + LSPB: 16 + LVCA: 2 + LVCB: 16 + LVPA: 16 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 43 + SolutionNameMin: Cijk_Alik_Bjlk_HB_GB_MT128x128x16_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 64 + LSPA: 32 + LSPB: 8 + LVCA: 2 + LVCB: 8 + LVPA: 4 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 44 + SolutionNameMin: Cijk_Alik_Bjlk_HB_GB_MT64x64x16_SN_SU0_SUM0_TT8_8_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 64 + LSPA: 64 + LSPB: 16 + LVCA: 2 + LVCB: 8 + LVPA: 8 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 45 + SolutionNameMin: Cijk_Alik_Bjlk_HB_GB_MT128x64x16_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 128 + LSPB: 16 + LVCA: 2 + LVCB: 16 + LVPA: 16 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 46 + SolutionNameMin: Cijk_Alik_Bjlk_HB_GB_MT128x128x16_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 64 + LSPA: 64 + LSPB: 8 + LVCA: 1 + LVCB: 8 + LVPA: 8 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 47 + SolutionNameMin: Cijk_Alik_Bjlk_HB_GB_MT64x64x8_SN_SU32_SUM3_TT8_8_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 64 + LSPA: 32 + LSPB: 8 + LVCA: 2 + LVCB: 8 + LVPA: 4 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 48 + SolutionNameMin: Cijk_Alik_Bjlk_HB_GB_MT64x64x16_SN_SU32_SUM3_TT8_8_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 64 + LSPA: 64 + LSPB: 16 + LVCA: 2 + LVCB: 8 + LVPA: 8 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 49 + SolutionNameMin: Cijk_Alik_Bjlk_HB_GB_MT128x64x16_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 50 + SolutionNameMin: Cijk_Alik_Bjlk_HB_GB_MT16x16x16_SN_SU0_SUM0_TT2_2_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 32 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 16 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 51 + SolutionNameMin: Cijk_Alik_Bjlk_HB_GB_MT32x32x16_SN_SU0_SUM0_TT2_2_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 64 + LSPA: 32 + LSPB: 8 + LVCA: 8 + LVCB: 32 + LVPA: 16 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 52 + SolutionNameMin: Cijk_Alik_Bjlk_HB_GB_MT64x64x16_SN_SU0_SUM0_TT4_4_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 4 + LSPB: 8 + LVCA: 16 + LVCB: 8 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 53 + SolutionNameMin: Cijk_Alik_Bjlk_HB_GB_MT16x16x32_SN_SU0_SUM0_TT2_2_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 16 + LSPA: 16 + LSPB: 8 + LVCA: 4 + LVCB: 8 + LVPA: 8 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 819 + LdsNumElementsAlignedA: 128 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 256 + LdsOffsetB: 128 + LdsOffsetB_Blk: 384 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 54 + SolutionNameMin: Cijk_Alik_Bjlk_HB_GB_MT16x16x8_SN_SU32_SUM3_TT2_2_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 32 + LSPA: 16 + LSPB: 4 + LVCA: 4 + LVCB: 16 + LVPA: 8 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 55 + SolutionNameMin: Cijk_Alik_Bjlk_HB_GB_MT32x32x8_SN_SU32_SUM3_TT4_4_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 32 + LSPA: 32 + LSPB: 8 + LVCA: 4 + LVCB: 16 + LVPA: 16 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 56 + SolutionNameMin: Cijk_Alik_Bjlk_HB_GB_MT64x32x8_SN_SU32_SUM3_TT4_4_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 64 + LSPA: 64 + LSPB: 8 + LVCA: 4 + LVCB: 32 + LVPA: 32 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 57 + SolutionNameMin: Cijk_Alik_Bjlk_HB_GB_MT64x64x8_SN_SU32_SUM3_TT4_4_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 58 + SolutionNameMin: Cijk_Alik_Bjlk_HB_GB_MT16x16x16_SN_SU32_SUM3_TT2_2_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 32 + LSPA: 8 + LSPB: 4 + LVCA: 8 + LVCB: 16 + LVPA: 4 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 59 + SolutionNameMin: Cijk_Alik_Bjlk_HB_GB_MT32x32x16_SN_SU32_SUM3_TT4_4_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 60 + SolutionNameMin: Cijk_Alik_Bjlk_HB_GB_MT32x16x16_SN_SU32_SUM3_TT2_2_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 4 + LSPB: 8 + LVCA: 16 + LVCB: 8 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 61 + SolutionNameMin: Cijk_Alik_Bjlk_HB_GB_MT16x16x32_SN_SU32_SUM3_TT2_2_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 64 + LSPA: 64 + LSPB: 8 + LVCA: 4 + LVCB: 32 + LVPA: 32 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 62 + SolutionNameMin: Cijk_Alik_Bjlk_HB_GB_MT64x64x8_SN_SU0_SUM0_TT4_4_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 32 + LSPA: 8 + LSPB: 4 + LVCA: 8 + LVCB: 16 + LVPA: 4 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 63 + SolutionNameMin: Cijk_Alik_Bjlk_HB_GB_MT32x32x16_SN_SU0_SUM0_TT4_4_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 16 + LSPA: 16 + LSPB: 8 + LVCA: 4 + LVCB: 8 + LVPA: 8 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 819 + LdsNumElementsAlignedA: 128 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 256 + LdsOffsetB: 128 + LdsOffsetB_Blk: 384 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 64 + SolutionNameMin: Cijk_Alik_Bjlk_HB_GB_MT16x16x8_SN_SU32_SUM3_TT2_2_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 32 + LSPA: 32 + LSPB: 8 + LVCA: 4 + LVCB: 16 + LVPA: 16 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 65 + SolutionNameMin: Cijk_Alik_Bjlk_HB_GB_MT64x32x8_SN_SU32_SUM3_TT4_4_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 32 + LSPA: 32 + LSPB: 8 + LVCA: 8 + LVCB: 32 + LVPA: 32 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 66 + SolutionNameMin: Cijk_Alik_Bjlk_HB_GB_MT32x32x8_SN_SU32_SUM3_TT2_2_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 64 + LSPA: 64 + LSPB: 8 + LVCA: 4 + LVCB: 32 + LVPA: 32 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 67 + SolutionNameMin: Cijk_Alik_Bjlk_HB_GB_MT64x64x8_SN_SU32_SUM3_TT4_4_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 68 + SolutionNameMin: Cijk_Alik_Bjlk_HB_GB_MT16x16x16_SN_SU32_SUM3_TT2_2_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 32 + LSPA: 8 + LSPB: 4 + LVCA: 8 + LVCB: 16 + LVPA: 4 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 69 + SolutionNameMin: Cijk_Alik_Bjlk_HB_GB_MT32x32x16_SN_SU32_SUM3_TT4_4_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 32 + LSPA: 32 + LSPB: 8 + LVCA: 4 + LVCB: 16 + LVPA: 16 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 70 + SolutionNameMin: Cijk_Alik_Bjlk_HB_GB_MT64x32x8_SN_SU0_SUM0_TT4_4_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 64 + LSPA: 64 + LSPB: 8 + LVCA: 4 + LVCB: 32 + LVPA: 32 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 71 + SolutionNameMin: Cijk_Alik_Bjlk_HB_GB_MT64x64x8_SN_SU0_SUM0_TT4_4_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 72 + SolutionNameMin: Cijk_Alik_Bjlk_HB_GB_MT16x16x16_SN_SU0_SUM0_TT2_2_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 73 + SolutionNameMin: Cijk_Alik_Bjlk_HB_GB_MT32x16x16_SN_SU0_SUM0_TT2_2_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 64 + LSPA: 32 + LSPB: 8 + LVCA: 8 + LVCB: 32 + LVPA: 16 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 74 + SolutionNameMin: Cijk_Alik_Bjlk_HB_GB_MT64x64x16_SN_SU0_SUM0_TT4_4_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 4 + LSPB: 8 + LVCA: 16 + LVCB: 8 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 75 + SolutionNameMin: Cijk_Alik_Bjlk_HB_GB_MT16x16x32_SN_SU0_SUM0_TT2_2_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 16 + LSPA: 16 + LSPB: 8 + LVCA: 4 + LVCB: 8 + LVPA: 8 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 819 + LdsNumElementsAlignedA: 128 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 256 + LdsOffsetB: 128 + LdsOffsetB_Blk: 384 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 76 + SolutionNameMin: Cijk_Alik_Bjlk_HB_GB_MT16x16x8_SN_SU32_SUM3_TT2_2_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 32 + LSPA: 16 + LSPB: 4 + LVCA: 4 + LVCB: 16 + LVPA: 8 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 77 + SolutionNameMin: Cijk_Alik_Bjlk_HB_GB_MT32x32x8_SN_SU32_SUM3_TT4_4_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 32 + LSPA: 32 + LSPB: 8 + LVCA: 4 + LVCB: 16 + LVPA: 16 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 78 + SolutionNameMin: Cijk_Alik_Bjlk_HB_GB_MT64x32x8_SN_SU32_SUM3_TT4_4_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 32 + LSPA: 32 + LSPB: 8 + LVCA: 8 + LVCB: 32 + LVPA: 32 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 79 + SolutionNameMin: Cijk_Alik_Bjlk_HB_GB_MT32x32x8_SN_SU32_SUM3_TT2_2_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 64 + LSPA: 64 + LSPB: 8 + LVCA: 4 + LVCB: 32 + LVPA: 32 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 80 + SolutionNameMin: Cijk_Alik_Bjlk_HB_GB_MT64x64x8_SN_SU32_SUM3_TT4_4_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 81 + SolutionNameMin: Cijk_Alik_Bjlk_HB_GB_MT16x16x16_SN_SU32_SUM3_TT2_2_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 +- [2, 3, 0, 1] +- - - [2944, 4288, 1, 1280, 2944, 2944, 1280, 4288] + - [26, 24113.0] + - - [2368, 5888, 1, 256, 2368, 2368, 256, 5888] + - [11, 23265.0] + - - [1024, 5056, 1, 3328, 1024, 1024, 3328, 5056] + - [4, 24511.0] + - - [5888, 1024, 1, 1280, 5888, 5888, 1280, 1024] + - [11, 23755.0] + - - [5888, 1856, 1, 3328, 5888, 5888, 3328, 1856] + - [2, 23916.0] + - - [5056, 704, 1, 256, 5056, 5056, 256, 704] + - [17, 17916.0] + - - [5888, 2944, 1, 3328, 5888, 5888, 3328, 2944] + - [2, 25144.0] + - - [1856, 4288, 1, 256, 1856, 1856, 256, 4288] + - [23, 21546.0] + - - [5056, 5056, 1, 3328, 5056, 5056, 3328, 5056] + - [26, 24844.0] + - - [1408, 5888, 1, 1280, 1408, 1408, 1280, 5888] + - [14, 24264.0] + - - [448, 3584, 1, 3328, 448, 448, 3328, 3584] + - [4, 19546.0] + - - [5888, 1408, 1, 1280, 5888, 5888, 1280, 1408] + - [11, 24152.0] + - - [3584, 1856, 1, 3328, 3584, 3584, 3328, 1856] + - [2, 23965.0] + - - [5056, 6784, 1, 1280, 5056, 5056, 1280, 6784] + - [11, 24926.0] + - - [5056, 5056, 1, 1280, 5056, 5056, 1280, 5056] + - [19, 24722.0] + - - [448, 5056, 1, 256, 448, 448, 256, 5056] + - [17, 17487.0] + - - [6784, 448, 1, 256, 6784, 6784, 256, 448] + - [22, 18958.0] + - - [5888, 704, 1, 1280, 5888, 5888, 1280, 704] + - [26, 21749.0] + - - [3584, 1024, 1, 256, 3584, 3584, 256, 1024] + - [6, 21132.0] + - - [6784, 4288, 1, 3328, 6784, 6784, 3328, 4288] + - [11, 24707.0] + - - [1856, 2368, 1, 3328, 1856, 1856, 3328, 2368] + - [2, 21897.0] + - - [5888, 2944, 1, 1280, 5888, 5888, 1280, 2944] + - [23, 25098.0] + - - [5888, 1024, 1, 256, 5888, 5888, 256, 1024] + - [6, 22908.0] + - - [1408, 2944, 1, 256, 1408, 1408, 256, 2944] + - [2, 22052.0] + - - [6784, 5056, 1, 3328, 6784, 6784, 3328, 5056] + - [14, 25202.0] + - - [5056, 5056, 1, 256, 5056, 5056, 256, 5056] + - [11, 23297.0] + - - [1024, 3584, 1, 1280, 1024, 1024, 1280, 3584] + - [14, 22824.0] + - - [2368, 2944, 1, 1280, 2368, 2368, 1280, 2944] + - [11, 23697.0] + - - [1408, 4288, 1, 1280, 1408, 1408, 1280, 4288] + - [23, 23636.0] + - - [3584, 4288, 1, 1280, 3584, 3584, 1280, 4288] + - [14, 24480.0] + - - [2368, 704, 1, 1280, 2368, 2368, 1280, 704] + - [9, 19290.0] + - - [5056, 4288, 1, 3328, 5056, 5056, 3328, 4288] + - [26, 24763.0] + - - [3584, 2368, 1, 3328, 3584, 3584, 3328, 2368] + - [23, 23910.0] + - - [6784, 448, 1, 1280, 6784, 6784, 1280, 448] + - [2, 19447.0] + - - [1024, 1408, 1, 3328, 1024, 1024, 3328, 1408] + - [2, 20051.0] + - - [4288, 2944, 1, 256, 4288, 4288, 256, 2944] + - [17, 22449.0] + - - [5056, 2368, 1, 1280, 5056, 5056, 1280, 2368] + - [23, 23927.0] + - - [448, 3584, 1, 1280, 448, 448, 1280, 3584] + - [4, 18914.0] + - - [6784, 5888, 1, 256, 6784, 6784, 256, 5888] + - [14, 24909.0] + - - [1024, 1408, 1, 256, 1024, 1024, 256, 1408] + - [29, 17559.0] + - - [2368, 2368, 1, 3328, 2368, 2368, 3328, 2368] + - [2, 22360.0] + - - [5056, 704, 1, 3328, 5056, 5056, 3328, 704] + - [26, 22270.0] + - - [1408, 1856, 1, 256, 1408, 1408, 256, 1856] + - [22, 19302.0] + - - [5888, 1856, 1, 256, 5888, 5888, 256, 1856] + - [17, 22374.0] + - - [704, 5888, 1, 256, 704, 704, 256, 5888] + - [5, 20014.0] + - - [4288, 6784, 1, 3328, 4288, 4288, 3328, 6784] + - [11, 24746.0] + - - [3584, 704, 1, 3328, 3584, 3584, 3328, 704] + - [2, 20461.0] + - - [1408, 1408, 1, 256, 1408, 1408, 256, 1408] + - [22, 17745.0] + - - [448, 4288, 1, 256, 448, 448, 256, 4288] + - [25, 15359.0] + - - [704, 2368, 1, 1280, 704, 704, 1280, 2368] + - [0, 18804.0] + - - [1856, 2368, 1, 1280, 1856, 1856, 1280, 2368] + - [2, 21361.0] + - - [1408, 1024, 1, 1280, 1408, 1408, 1280, 1024] + - [2, 19276.0] + - - [6784, 704, 1, 256, 6784, 6784, 256, 704] + - [22, 19358.0] + - - [1408, 3584, 1, 256, 1408, 1408, 256, 3584] + - [17, 22121.0] + - - [3584, 4288, 1, 3328, 3584, 3584, 3328, 4288] + - [14, 24778.0] + - - [5888, 1856, 1, 1280, 5888, 5888, 1280, 1856] + - [2, 23736.0] + - - [5056, 1024, 1, 3328, 5056, 5056, 3328, 1024] + - [14, 24630.0] + - - [2368, 3584, 1, 1280, 2368, 2368, 1280, 3584] + - [2, 23750.0] + - - [2944, 3584, 1, 3328, 2944, 2944, 3328, 3584] + - [11, 24584.0] + - - [6784, 2944, 1, 256, 6784, 6784, 256, 2944] + - [23, 24433.0] + - - [1024, 2368, 1, 256, 1024, 1024, 256, 2368] + - [1, 18879.0] + - - [4288, 2368, 1, 3328, 4288, 4288, 3328, 2368] + - [2, 23614.0] + - - [1856, 2368, 1, 256, 1856, 1856, 256, 2368] + - [23, 19878.0] + - - [3584, 6784, 1, 3328, 3584, 3584, 3328, 6784] + - [11, 25060.0] + - - [6784, 1856, 1, 3328, 6784, 6784, 3328, 1856] + - [2, 24138.0] + - - [5056, 4288, 1, 1280, 5056, 5056, 1280, 4288] + - [26, 24607.0] + - - [1408, 5056, 1, 1280, 1408, 1408, 1280, 5056] + - [14, 24332.0] + - - [6784, 5888, 1, 3328, 6784, 6784, 3328, 5888] + - [14, 25529.0] + - - [2368, 5056, 1, 1280, 2368, 2368, 1280, 5056] + - [30, 23999.0] + - - [1024, 5056, 1, 1280, 1024, 1024, 1280, 5056] + - [14, 23907.0] + - - [4288, 1024, 1, 256, 4288, 4288, 256, 1024] + - [5, 21289.0] + - - [2368, 1408, 1, 256, 2368, 2368, 256, 1408] + - [2, 19804.0] + - - [5888, 448, 1, 1280, 5888, 5888, 1280, 448] + - [22, 19405.0] + - - [704, 5888, 1, 3328, 704, 704, 3328, 5888] + - [14, 22506.0] + - - [1024, 6784, 1, 1280, 1024, 1024, 1280, 6784] + - [2, 23820.0] + - - [3584, 2944, 1, 1280, 3584, 3584, 1280, 2944] + - [28, 24366.0] + - - [2368, 1024, 1, 3328, 2368, 2368, 3328, 1024] + - [0, 22493.0] + - - [1408, 5056, 1, 3328, 1408, 1408, 3328, 5056] + - [30, 24791.0] + - - [1856, 1856, 1, 3328, 1856, 1856, 3328, 1856] + - [4, 21623.0] + - - [2368, 2368, 1, 256, 2368, 2368, 256, 2368] + - [28, 20554.0] + - - [4288, 4288, 1, 1280, 4288, 4288, 1280, 4288] + - [4, 24487.0] + - - [704, 6784, 1, 3328, 704, 704, 3328, 6784] + - [23, 22451.0] + - - [5888, 5888, 1, 3328, 5888, 5888, 3328, 5888] + - [14, 25566.0] + - - [5056, 1024, 1, 1280, 5056, 5056, 1280, 1024] + - [5, 24197.0] + - - [448, 5888, 1, 3328, 448, 448, 3328, 5888] + - [19, 19931.0] + - - [5056, 5888, 1, 1280, 5056, 5056, 1280, 5888] + - [14, 25165.0] + - - [448, 6784, 1, 256, 448, 448, 256, 6784] + - [25, 17878.0] + - - [3584, 5888, 1, 256, 3584, 3584, 256, 5888] + - [6, 24390.0] + - - [2944, 3584, 1, 256, 2944, 2944, 256, 3584] + - [6, 23661.0] + - - [6784, 2944, 1, 3328, 6784, 6784, 3328, 2944] + - [2, 25299.0] + - - [2944, 5056, 1, 3328, 2944, 2944, 3328, 5056] + - [14, 25103.0] + - - [6784, 2368, 1, 1280, 6784, 6784, 1280, 2368] + - [23, 24232.0] + - - [4288, 5888, 1, 1280, 4288, 4288, 1280, 5888] + - [23, 24649.0] + - - [4288, 4288, 1, 256, 4288, 4288, 256, 4288] + - [23, 23514.0] + - - [4288, 1856, 1, 1280, 4288, 4288, 1280, 1856] + - [2, 23223.0] + - - [1856, 2944, 1, 3328, 1856, 1856, 3328, 2944] + - [14, 23270.0] + - - [256, 6784, 1, 3328, 256, 256, 3328, 6784] + - [27, 20994.0] + - - [5056, 1024, 1, 256, 5056, 5056, 256, 1024] + - [21, 21915.0] + - - [1408, 1408, 1, 1280, 1408, 1408, 1280, 1408] + - [11, 20471.0] + - - [5056, 1856, 1, 3328, 5056, 5056, 3328, 1856] + - [6, 23944.0] + - - [1856, 1408, 1, 256, 1856, 1856, 256, 1408] + - [7, 18450.0] + - - [5056, 256, 1, 3328, 5056, 5056, 3328, 256] + - [27, 22609.0] + - - [5056, 3584, 1, 256, 5056, 5056, 256, 3584] + - [0, 23939.0] + - - [1856, 1024, 1, 1280, 1856, 1856, 1280, 1024] + - [5, 22144.0] + - - [1856, 1856, 1, 1280, 1856, 1856, 1280, 1856] + - [4, 21207.0] + - - [6784, 6784, 1, 1280, 6784, 6784, 1280, 6784] + - [23, 25325.0] + - - [1856, 1024, 1, 3328, 1856, 1856, 3328, 1024] + - [5, 22980.0] + - - [6784, 1024, 1, 256, 6784, 6784, 256, 1024] + - [26, 23192.0] + - - [5056, 5888, 1, 3328, 5056, 5056, 3328, 5888] + - [14, 25264.0] + - - [1856, 1024, 1, 256, 1856, 1856, 256, 1024] + - [3, 19140.0] + - - [5056, 1408, 1, 3328, 5056, 5056, 3328, 1408] + - [2, 24623.0] + - - [4288, 1024, 1, 3328, 4288, 4288, 3328, 1024] + - [4, 23884.0] + - - [2944, 1408, 1, 3328, 2944, 2944, 3328, 1408] + - [2, 24007.0] + - - [2944, 4288, 1, 3328, 2944, 2944, 3328, 4288] + - [14, 24503.0] + - - [5056, 2944, 1, 256, 5056, 5056, 256, 2944] + - [6, 23644.0] + - - [2368, 1856, 1, 256, 2368, 2368, 256, 1856] + - [23, 20084.0] + - - [1408, 3584, 1, 3328, 1408, 1408, 3328, 3584] + - [26, 24265.0] + - - [2368, 6784, 1, 256, 2368, 2368, 256, 6784] + - [17, 23423.0] + - - [5056, 1408, 1, 1280, 5056, 5056, 1280, 1408] + - [2, 24308.0] + - - [1408, 5888, 1, 3328, 1408, 1408, 3328, 5888] + - [14, 24699.0] + - - [1856, 5056, 1, 256, 1856, 1856, 256, 5056] + - [14, 22072.0] + - - [6784, 6784, 1, 256, 6784, 6784, 256, 6784] + - [23, 24845.0] + - - [2368, 4288, 1, 1280, 2368, 2368, 1280, 4288] + - [23, 23428.0] + - - [3584, 1856, 1, 1280, 3584, 3584, 1280, 1856] + - [11, 23778.0] + - - [5888, 5056, 1, 256, 5888, 5888, 256, 5056] + - [6, 24291.0] + - - [3584, 448, 1, 256, 3584, 3584, 256, 448] + - [22, 16929.0] + - - [3584, 3584, 1, 1280, 3584, 3584, 1280, 3584] + - [26, 24877.0] + - - [256, 6784, 1, 256, 256, 256, 256, 6784] + - [25, 18618.0] + - - [1856, 3584, 1, 3328, 1856, 1856, 3328, 3584] + - [23, 24089.0] + - - [5056, 256, 1, 1280, 5056, 5056, 1280, 256] + - [7, 21361.0] + - - [3584, 3584, 1, 256, 3584, 3584, 256, 3584] + - [6, 23905.0] + - - [6784, 4288, 1, 1280, 6784, 6784, 1280, 4288] + - [6, 24725.0] + - - [704, 5056, 1, 256, 704, 704, 256, 5056] + - [21, 19224.0] + - - [2944, 2368, 1, 1280, 2944, 2944, 1280, 2368] + - [11, 23876.0] + - - [6784, 3584, 1, 256, 6784, 6784, 256, 3584] + - [23, 24540.0] + - - [704, 6784, 1, 256, 704, 704, 256, 6784] + - [2, 20122.0] + - - [1024, 3584, 1, 3328, 1024, 1024, 3328, 3584] + - [4, 23331.0] + - - [2944, 2944, 1, 3328, 2944, 2944, 3328, 2944] + - [17, 24609.0] + - - [5056, 6784, 1, 256, 5056, 5056, 256, 6784] + - [28, 24155.0] + - - [1408, 4288, 1, 3328, 1408, 1408, 3328, 4288] + - [11, 24065.0] + - - [6784, 256, 1, 1280, 6784, 6784, 1280, 256] + - [19, 20960.0] + - - [2368, 704, 1, 3328, 2368, 2368, 3328, 704] + - [14, 20244.0] + - - [3584, 6784, 1, 256, 3584, 3584, 256, 6784] + - [6, 24492.0] + - - [5056, 1856, 1, 256, 5056, 5056, 256, 1856] + - [17, 22468.0] + - - [704, 4288, 1, 256, 704, 704, 256, 4288] + - [23, 18261.0] + - - [1408, 6784, 1, 1280, 1408, 1408, 1280, 6784] + - [14, 24439.0] + - - [5056, 2368, 1, 3328, 5056, 5056, 3328, 2368] + - [11, 24207.0] + - - [2944, 4288, 1, 256, 2944, 2944, 256, 4288] + - [11, 23230.0] + - - [1408, 3584, 1, 1280, 1408, 1408, 1280, 3584] + - [26, 24007.0] + - - [1024, 1408, 1, 1280, 1024, 1024, 1280, 1408] + - [11, 19982.0] + - - [2368, 6784, 1, 3328, 2368, 2368, 3328, 6784] + - [23, 24334.0] + - - [5056, 704, 1, 1280, 5056, 5056, 1280, 704] + - [26, 21683.0] + - - [1856, 4288, 1, 3328, 1856, 1856, 3328, 4288] + - [14, 23625.0] + - - [1408, 5888, 1, 256, 1408, 1408, 256, 5888] + - [14, 23602.0] + - - [3584, 704, 1, 1280, 3584, 3584, 1280, 704] + - [23, 20221.0] + - - [3584, 448, 1, 3328, 3584, 3584, 3328, 448] + - [30, 19378.0] + - - [704, 2368, 1, 3328, 704, 704, 3328, 2368] + - [4, 20268.0] + - - [448, 5056, 1, 3328, 448, 448, 3328, 5056] + - [14, 21080.0] + - - [4288, 448, 1, 256, 4288, 4288, 256, 448] + - [16, 17159.0] + - - [448, 5888, 1, 256, 448, 448, 256, 5888] + - [3, 17540.0] + - - [5888, 2368, 1, 256, 5888, 5888, 256, 2368] + - [23, 23253.0] + - - [6784, 704, 1, 3328, 6784, 6784, 3328, 704] + - [14, 22699.0] + - - [1408, 2944, 1, 3328, 1408, 1408, 3328, 2944] + - [2, 23942.0] + - - [2368, 704, 1, 256, 2368, 2368, 256, 704] + - [16, 16935.0] + - - [3584, 2368, 1, 256, 3584, 3584, 256, 2368] + - [17, 22380.0] + - - [5888, 5056, 1, 1280, 5888, 5888, 1280, 5056] + - [14, 25159.0] + - - [3584, 3584, 1, 3328, 3584, 3584, 3328, 3584] + - [26, 24979.0] + - - [5888, 6784, 1, 256, 5888, 5888, 256, 6784] + - [17, 24839.0] + - - [4288, 2944, 1, 3328, 4288, 4288, 3328, 2944] + - [23, 24320.0] + - - [4288, 704, 1, 1280, 4288, 4288, 1280, 704] + - [6, 20127.0] + - - [256, 5056, 1, 1280, 256, 256, 1280, 5056] + - [27, 21262.0] + - - [6784, 5888, 1, 1280, 6784, 6784, 1280, 5888] + - [14, 25549.0] + - - [5888, 4288, 1, 1280, 5888, 5888, 1280, 4288] + - [23, 24687.0] + - - [3584, 1024, 1, 3328, 3584, 3584, 3328, 1024] + - [4, 23290.0] + - - [1408, 1856, 1, 1280, 1408, 1408, 1280, 1856] + - [11, 20841.0] + - - [5888, 448, 1, 3328, 5888, 5888, 3328, 448] + - [19, 19754.0] + - - [704, 5888, 1, 1280, 704, 704, 1280, 5888] + - [0, 22180.0] + - - [1024, 6784, 1, 3328, 1024, 1024, 3328, 6784] + - [14, 24290.0] + - - [704, 2944, 1, 1280, 704, 704, 1280, 2944] + - [2, 20830.0] + - - [5056, 2944, 1, 3328, 5056, 5056, 3328, 2944] + - [23, 24935.0] + - - [1408, 1408, 1, 3328, 1408, 1408, 3328, 1408] + - [2, 20750.0] + - - [448, 4288, 1, 1280, 448, 448, 1280, 4288] + - [11, 19417.0] + - - [3584, 704, 1, 256, 3584, 3584, 256, 704] + - [22, 18810.0] + - - [3584, 1408, 1, 3328, 3584, 3584, 3328, 1408] + - [2, 23940.0] + - - [2368, 1024, 1, 1280, 2368, 2368, 1280, 1024] + - [15, 21926.0] + - - [1856, 6784, 1, 256, 1856, 1856, 256, 6784] + - [17, 23017.0] + - - [4288, 448, 1, 3328, 4288, 4288, 3328, 448] + - [2, 19866.0] + - - [4288, 3584, 1, 1280, 4288, 4288, 1280, 3584] + - [8, 24821.0] + - - [5888, 1024, 1, 3328, 5888, 5888, 3328, 1024] + - [23, 24203.0] + - - [704, 6784, 1, 1280, 704, 704, 1280, 6784] + - [12, 21317.0] + - - [1024, 2944, 1, 3328, 1024, 1024, 3328, 2944] + - [26, 22636.0] + - - [704, 5056, 1, 1280, 704, 704, 1280, 5056] + - [14, 21925.0] + - - [1024, 5888, 1, 1280, 1024, 1024, 1280, 5888] + - [2, 23975.0] + - - [2944, 1856, 1, 256, 2944, 2944, 256, 1856] + - [2, 21181.0] + - - [3584, 5056, 1, 256, 3584, 3584, 256, 5056] + - [17, 23919.0] + - - [5888, 5056, 1, 3328, 5888, 5888, 3328, 5056] + - [26, 25260.0] + - - [3584, 6784, 1, 1280, 3584, 3584, 1280, 6784] + - [23, 25053.0] + - - [4288, 1856, 1, 256, 4288, 4288, 256, 1856] + - [6, 21860.0] + - - [1856, 5888, 1, 256, 1856, 1856, 256, 5888] + - [11, 23003.0] + - - [4288, 4288, 1, 3328, 4288, 4288, 3328, 4288] + - [4, 24676.0] + - - [4288, 1408, 1, 1280, 4288, 4288, 1280, 1408] + - [23, 23649.0] + - - [4288, 2368, 1, 256, 4288, 4288, 256, 2368] + - [2, 22405.0] + - - [2944, 5056, 1, 1280, 2944, 2944, 1280, 5056] + - [14, 24973.0] + - - [6784, 2368, 1, 3328, 6784, 6784, 3328, 2368] + - [2, 24329.0] + - - [4288, 1856, 1, 3328, 4288, 4288, 3328, 1856] + - [2, 23419.0] + - - [1856, 2944, 1, 1280, 1856, 1856, 1280, 2944] + - [14, 22883.0] + - - [3584, 1024, 1, 1280, 3584, 3584, 1280, 1024] + - [26, 23016.0] + - - [1024, 4288, 1, 256, 1024, 1024, 256, 4288] + - [17, 21035.0] + - - [5888, 3584, 1, 3328, 5888, 5888, 3328, 3584] + - [23, 25142.0] + - - [5056, 3584, 1, 3328, 5056, 5056, 3328, 3584] + - [26, 25206.0] + - - [2368, 1408, 1, 1280, 2368, 2368, 1280, 1408] + - [6, 22101.0] + - - [5056, 2944, 1, 1280, 5056, 5056, 1280, 2944] + - [11, 24804.0] + - - [1024, 6784, 1, 256, 1024, 1024, 256, 6784] + - [2, 22989.0] + - - [3584, 2944, 1, 256, 3584, 3584, 256, 2944] + - [28, 23678.0] + - - [3584, 1408, 1, 1280, 3584, 3584, 1280, 1408] + - [6, 23685.0] + - - [5056, 6784, 1, 3328, 5056, 5056, 3328, 6784] + - [12, 25136.0] + - - [3584, 4288, 1, 256, 3584, 3584, 256, 4288] + - [17, 23173.0] + - - [1856, 6784, 1, 3328, 1856, 1856, 3328, 6784] + - [11, 24248.0] + - - [5056, 1408, 1, 256, 5056, 5056, 256, 1408] + - [2, 21989.0] + - - [5888, 5888, 1, 256, 5888, 5888, 256, 5888] + - [26, 24885.0] + - - [4288, 1024, 1, 1280, 4288, 4288, 1280, 1024] + - [0, 23337.0] + - - [448, 6784, 1, 3328, 448, 448, 3328, 6784] + - [23, 20628.0] + - - [2944, 1408, 1, 1280, 2944, 2944, 1280, 1408] + - [2, 23721.0] + - - [2944, 1856, 1, 3328, 2944, 2944, 3328, 1856] + - [2, 23081.0] + - - [3584, 5888, 1, 1280, 3584, 3584, 1280, 5888] + - [23, 25085.0] + - - [6784, 1856, 1, 1280, 6784, 6784, 1280, 1856] + - [2, 24091.0] + - - [5888, 256, 1, 3328, 5888, 5888, 3328, 256] + - [6, 21328.0] + - - [1856, 5888, 1, 3328, 1856, 1856, 3328, 5888] + - [2, 24028.0] + - - [3584, 1408, 1, 256, 3584, 3584, 256, 1408] + - [6, 22381.0] + - - [704, 3584, 1, 3328, 704, 704, 3328, 3584] + - [23, 20738.0] + - - [5056, 448, 1, 1280, 5056, 5056, 1280, 448] + - [2, 20051.0] + - - [4288, 704, 1, 256, 4288, 4288, 256, 704] + - [10, 18488.0] + - - [2944, 1024, 1, 256, 2944, 2944, 256, 1024] + - [21, 20994.0] + - - [2368, 4288, 1, 3328, 2368, 2368, 3328, 4288] + - [2, 23675.0] + - - [6784, 5056, 1, 256, 6784, 6784, 256, 5056] + - [2, 24219.0] + - - [3584, 5056, 1, 3328, 3584, 3584, 3328, 5056] + - [14, 25167.0] + - - [4288, 5888, 1, 256, 4288, 4288, 256, 5888] + - [6, 24037.0] + - - [2944, 6784, 1, 256, 2944, 2944, 256, 6784] + - [2, 24645.0] + - - [2368, 2368, 1, 1280, 2368, 2368, 1280, 2368] + - [2, 22275.0] + - - [1856, 3584, 1, 1280, 1856, 1856, 1280, 3584] + - [23, 23799.0] + - - [5056, 3584, 1, 1280, 5056, 5056, 1280, 3584] + - [14, 25092.0] + - - [256, 5888, 1, 256, 256, 256, 256, 5888] + - [13, 19028.0] + - - [1856, 1408, 1, 3328, 1856, 1856, 3328, 1408] + - [11, 21418.0] + - - [1024, 4288, 1, 3328, 1024, 1024, 3328, 4288] + - [4, 23755.0] + - - [2944, 2368, 1, 3328, 2944, 2944, 3328, 2368] + - [23, 24147.0] + - - [1024, 1856, 1, 1280, 1024, 1024, 1280, 1856] + - [23, 21842.0] + - - [6784, 1856, 1, 256, 6784, 6784, 256, 1856] + - [11, 22851.0] + - - [1024, 5888, 1, 256, 1024, 1024, 256, 5888] + - [2, 22853.0] + - - [1408, 2368, 1, 256, 1408, 1408, 256, 2368] + - [28, 19878.0] + - - [2944, 704, 1, 3328, 2944, 2944, 3328, 704] + - [23, 21461.0] + - - [2944, 2944, 1, 1280, 2944, 2944, 1280, 2944] + - [2, 24506.0] + - - [6784, 256, 1, 3328, 6784, 6784, 3328, 256] + - [19, 21410.0] + - - [1408, 5056, 1, 256, 1408, 1408, 256, 5056] + - [23, 22872.0] + - - [5056, 256, 1, 256, 5056, 5056, 256, 256] + - [7, 18956.0] + - - [1408, 4288, 1, 256, 1408, 1408, 256, 4288] + - [17, 22023.0] + - - [5888, 2368, 1, 1280, 5888, 5888, 1280, 2368] + - [23, 24316.0] + - - [2368, 5888, 1, 1280, 2368, 2368, 1280, 5888] + - [14, 24511.0] + - - [5888, 256, 1, 1280, 5888, 5888, 1280, 256] + - [6, 20809.0] + - - [2368, 1856, 1, 3328, 2368, 2368, 3328, 1856] + - [2, 22158.0] + - - [2944, 704, 1, 256, 2944, 2944, 256, 704] + - [22, 18735.0] + - - [2368, 6784, 1, 1280, 2368, 2368, 1280, 6784] + - [11, 24284.0] + - - [1856, 4288, 1, 1280, 1856, 1856, 1280, 4288] + - [4, 23358.0] + - - [704, 3584, 1, 256, 704, 704, 256, 3584] + - [20, 18257.0] + - - [704, 2944, 1, 3328, 704, 704, 3328, 2944] + - [2, 21529.0] + - - [1856, 5056, 1, 3328, 1856, 1856, 3328, 5056] + - [26, 24178.0] + - - [3584, 5056, 1, 1280, 3584, 3584, 1280, 5056] + - [26, 24990.0] + - - [2944, 1024, 1, 3328, 2944, 2944, 3328, 1024] + - [26, 22879.0] + - - [1408, 6784, 1, 256, 1408, 1408, 256, 6784] + - [2, 23499.0] + - - [6784, 1408, 1, 3328, 6784, 6784, 3328, 1408] + - [23, 24505.0] + - - [1024, 2368, 1, 1280, 1024, 1024, 1280, 2368] + - [8, 21672.0] + - - [6784, 2944, 1, 1280, 6784, 6784, 1280, 2944] + - [23, 25255.0] + - - [3584, 448, 1, 1280, 3584, 3584, 1280, 448] + - [22, 19414.0] + - - [2944, 6784, 1, 3328, 2944, 2944, 3328, 6784] + - [23, 25327.0] + - - [448, 5056, 1, 1280, 448, 448, 1280, 5056] + - [0, 20264.0] + - - [5888, 704, 1, 256, 5888, 5888, 256, 704] + - [29, 19608.0] + - - [256, 5888, 1, 3328, 256, 256, 3328, 5888] + - [2, 21372.0] + - - [6784, 4288, 1, 256, 6784, 6784, 256, 4288] + - [11, 24049.0] + - - [5888, 256, 1, 256, 5888, 5888, 256, 256] + - [13, 18768.0] + - - [6784, 1024, 1, 1280, 6784, 6784, 1280, 1024] + - [26, 24250.0] + - - [2944, 704, 1, 1280, 2944, 2944, 1280, 704] + - [23, 20912.0] + - - [6784, 3584, 1, 1280, 6784, 6784, 1280, 3584] + - [23, 25089.0] + - - [1408, 2944, 1, 1280, 1408, 1408, 1280, 2944] + - [2, 23701.0] + - - [1408, 2368, 1, 3328, 1408, 1408, 3328, 2368] + - [23, 22567.0] + - - [1024, 3584, 1, 256, 1024, 1024, 256, 3584] + - [6, 21411.0] + - - [2368, 2944, 1, 256, 2368, 2368, 256, 2944] + - [17, 22364.0] + - - [2944, 5888, 1, 256, 2944, 2944, 256, 5888] + - [6, 24574.0] + - - [3584, 1856, 1, 256, 3584, 3584, 256, 1856] + - [17, 21944.0] + - - [704, 4288, 1, 3328, 704, 704, 3328, 4288] + - [23, 20589.0] + - - [4288, 2944, 1, 1280, 4288, 4288, 1280, 2944] + - [11, 24176.0] + - - [4288, 5056, 1, 3328, 4288, 4288, 3328, 5056] + - [26, 24852.0] + - - [256, 5056, 1, 3328, 256, 256, 3328, 5056] + - [27, 22056.0] + - - [5056, 2368, 1, 256, 5056, 5056, 256, 2368] + - [28, 22907.0] + - - [4288, 704, 1, 3328, 4288, 4288, 3328, 704] + - [2, 20547.0] + - - [448, 3584, 1, 256, 448, 448, 256, 3584] + - [18, 16548.0] + - - [2944, 5888, 1, 1280, 2944, 2944, 1280, 5888] + - [23, 25195.0] + - - [5888, 3584, 1, 256, 5888, 5888, 256, 3584] + - [28, 24511.0] + - - [1408, 1856, 1, 3328, 1408, 1408, 3328, 1856] + - [23, 21508.0] + - - [6784, 1408, 1, 1280, 6784, 6784, 1280, 1408] + - [6, 24360.0] + - - [704, 2944, 1, 256, 704, 704, 256, 2944] + - [18, 18449.0] + - - [2944, 5888, 1, 3328, 2944, 2944, 3328, 5888] + - [11, 25285.0] + - - [1408, 6784, 1, 3328, 1408, 1408, 3328, 6784] + - [26, 24616.0] + - - [448, 4288, 1, 3328, 448, 448, 3328, 4288] + - [23, 20000.0] + - - [704, 2368, 1, 256, 704, 704, 256, 2368] + - [13, 17071.0] + - - [5888, 2368, 1, 3328, 5888, 5888, 3328, 2368] + - [11, 24438.0] + - - [4288, 5056, 1, 256, 4288, 4288, 256, 5056] + - [17, 23549.0] + - - [4288, 448, 1, 1280, 4288, 4288, 1280, 448] + - [22, 19337.0] + - - [5888, 704, 1, 3328, 5888, 5888, 3328, 704] + - [26, 22479.0] + - - [4288, 3584, 1, 3328, 4288, 4288, 3328, 3584] + - [4, 24917.0] + - - [6784, 6784, 1, 3328, 6784, 6784, 3328, 6784] + - [23, 25387.0] + - - [704, 5056, 1, 3328, 704, 704, 3328, 5056] + - [26, 22380.0] + - - [2368, 2944, 1, 3328, 2368, 2368, 3328, 2944] + - [23, 24163.0] + - - [2368, 3584, 1, 256, 2368, 2368, 256, 3584] + - [6, 22613.0] + - - [3584, 2368, 1, 1280, 3584, 3584, 1280, 2368] + - [23, 23851.0] + - - [1856, 1856, 1, 256, 1856, 1856, 256, 1856] + - [15, 19356.0] + - - [4288, 1408, 1, 3328, 4288, 4288, 3328, 1408] + - [2, 24174.0] + - - [4288, 5056, 1, 1280, 4288, 4288, 1280, 5056] + - [26, 24717.0] + - - [5888, 6784, 1, 1280, 5888, 5888, 1280, 6784] + - [24, 25384.0] + - - [5888, 1408, 1, 3328, 5888, 5888, 3328, 1408] + - [23, 24438.0] + - - [256, 5056, 1, 256, 256, 256, 256, 5056] + - [22, 17969.0] + - - [1408, 1024, 1, 256, 1408, 1408, 256, 1024] + - [25, 18345.0] + - - [2368, 5056, 1, 256, 2368, 2368, 256, 5056] + - [28, 22791.0] + - - [1024, 5056, 1, 256, 1024, 1024, 256, 5056] + - [17, 21907.0] + - - [2368, 1408, 1, 3328, 2368, 2368, 3328, 1408] + - [2, 22555.0] + - - [5888, 448, 1, 256, 5888, 5888, 256, 448] + - [22, 18613.0] + - - [6784, 5056, 1, 1280, 6784, 6784, 1280, 5056] + - [26, 25165.0] + - - [4288, 6784, 1, 1280, 4288, 4288, 1280, 6784] + - [23, 24717.0] + - - [6784, 1408, 1, 256, 6784, 6784, 256, 1408] + - [23, 23373.0] + - - [5888, 4288, 1, 256, 5888, 5888, 256, 4288] + - [23, 23967.0] + - - [5056, 5888, 1, 256, 5056, 5056, 256, 5888] + - [30, 24423.0] + - - [2368, 1024, 1, 256, 2368, 2368, 256, 1024] + - [5, 19472.0] + - - [1856, 6784, 1, 1280, 1856, 1856, 1280, 6784] + - [23, 24095.0] + - - [4288, 3584, 1, 256, 4288, 4288, 256, 3584] + - [8, 24004.0] + - - [5056, 1856, 1, 1280, 5056, 5056, 1280, 1856] + - [11, 23747.0] + - - [1408, 1024, 1, 3328, 1408, 1408, 3328, 1024] + - [23, 20595.0] + - - [5888, 3584, 1, 1280, 5888, 5888, 1280, 3584] + - [11, 25099.0] + - - [1024, 2944, 1, 256, 1024, 1024, 256, 2944] + - [2, 20624.0] + - - [448, 6784, 1, 1280, 448, 448, 1280, 6784] + - [23, 20251.0] + - - [2944, 1856, 1, 1280, 2944, 2944, 1280, 1856] + - [6, 22801.0] + - - [2368, 3584, 1, 3328, 2368, 2368, 3328, 3584] + - [23, 24115.0] + - - [3584, 5888, 1, 3328, 3584, 3584, 3328, 5888] + - [23, 25151.0] + - - [2944, 3584, 1, 1280, 2944, 2944, 1280, 3584] + - [11, 24432.0] + - - [1856, 5888, 1, 1280, 1856, 1856, 1280, 5888] + - [6, 23902.0] + - - [5056, 448, 1, 3328, 5056, 5056, 3328, 448] + - [14, 20809.0] + - - [4288, 1408, 1, 256, 4288, 4288, 256, 1408] + - [17, 21899.0] + - - [4288, 2368, 1, 1280, 4288, 4288, 1280, 2368] + - [6, 23511.0] + - - [2944, 5056, 1, 256, 2944, 2944, 256, 5056] + - [11, 23320.0] + - - [6784, 2368, 1, 256, 6784, 6784, 256, 2368] + - [23, 23372.0] + - - [1856, 2944, 1, 256, 1856, 1856, 256, 2944] + - [2, 21870.0] + - - [1856, 1408, 1, 1280, 1856, 1856, 1280, 1408] + - [2, 20787.0] + - - [1024, 4288, 1, 1280, 1024, 1024, 1280, 4288] + - [23, 23000.0] + - - [2368, 5056, 1, 3328, 2368, 2368, 3328, 5056] + - [26, 24459.0] + - - [1024, 1856, 1, 3328, 1024, 1024, 3328, 1856] + - [17, 22525.0] + - - [704, 3584, 1, 1280, 704, 704, 1280, 3584] + - [2, 20312.0] + - - [4288, 6784, 1, 256, 4288, 4288, 256, 6784] + - [11, 24177.0] + - - [3584, 2944, 1, 3328, 3584, 3584, 3328, 2944] + - [2, 24637.0] + - - [5888, 2944, 1, 256, 5888, 5888, 256, 2944] + - [23, 24525.0] + - - [5056, 4288, 1, 256, 5056, 5056, 256, 4288] + - [28, 23708.0] + - - [6784, 1024, 1, 3328, 6784, 6784, 3328, 1024] + - [26, 24440.0] + - - [5888, 5888, 1, 1280, 5888, 5888, 1280, 5888] + - [26, 25512.0] + - - [448, 5888, 1, 1280, 448, 448, 1280, 5888] + - [9, 19427.0] + - - [2944, 1408, 1, 256, 2944, 2944, 256, 1408] + - [6, 22265.0] + - - [1024, 2944, 1, 1280, 1024, 1024, 1280, 2944] + - [6, 22207.0] + - - [2368, 5888, 1, 3328, 2368, 2368, 3328, 5888] + - [14, 24650.0] + - - [2368, 1856, 1, 1280, 2368, 2368, 1280, 1856] + - [6, 21754.0] + - - [5888, 4288, 1, 3328, 5888, 5888, 3328, 4288] + - [23, 24783.0] + - - [6784, 704, 1, 1280, 6784, 6784, 1280, 704] + - [26, 22169.0] + - - [5056, 448, 1, 256, 5056, 5056, 256, 448] + - [16, 17864.0] + - - [1856, 5056, 1, 1280, 1856, 1856, 1280, 5056] + - [14, 23934.0] + - - [2944, 1024, 1, 1280, 2944, 2944, 1280, 1024] + - [14, 22534.0] + - - [2368, 4288, 1, 256, 2368, 2368, 256, 4288] + - [2, 22393.0] + - - [1024, 2368, 1, 3328, 1024, 1024, 3328, 2368] + - [26, 22519.0] + - - [4288, 5888, 1, 3328, 4288, 4288, 3328, 5888] + - [23, 24793.0] + - - [2944, 6784, 1, 1280, 2944, 2944, 1280, 6784] + - [11, 25226.0] + - - [256, 6784, 1, 1280, 256, 256, 1280, 6784] + - [2, 20549.0] + - - [1856, 3584, 1, 256, 1856, 1856, 256, 3584] + - [0, 21826.0] + - - [256, 5888, 1, 1280, 256, 256, 1280, 5888] + - [2, 20944.0] + - - [2944, 2368, 1, 256, 2944, 2944, 256, 2368] + - [2, 22203.0] + - - [1024, 1856, 1, 256, 1024, 1024, 256, 1856] + - [29, 18713.0] + - - [6784, 3584, 1, 3328, 6784, 6784, 3328, 3584] + - [23, 25152.0] + - - [1024, 5888, 1, 3328, 1024, 1024, 3328, 5888] + - [2, 24125.0] + - - [1408, 2368, 1, 1280, 1408, 1408, 1280, 2368] + - [2, 22062.0] + - - [2944, 2944, 1, 256, 2944, 2944, 256, 2944] + - [2, 23579.0] + - - [6784, 256, 1, 256, 6784, 6784, 256, 256] + - [15, 18634.0] + - - [5888, 1408, 1, 256, 5888, 5888, 256, 1408] + - [6, 23374.0] + - - [5888, 6784, 1, 3328, 5888, 5888, 3328, 6784] + - [24, 25475.0] + - - [704, 4288, 1, 1280, 704, 704, 1280, 4288] + - [23, 19967.0] + - - [6784, 448, 1, 3328, 6784, 6784, 3328, 448] + - [2, 20571.0] + - - [1024, 1024, 1, 3328, 1024, 1024, 3328, 1024] + - [31, 19448.0] + - - [64, 6784, 1, 256, 64, 64, 256, 6784] + - [31, 10876.0] + - - [128, 6784, 1, 3328, 128, 128, 3328, 6784] + - [33, 19312.0] + - - [256, 4288, 1, 3328, 256, 256, 3328, 4288] + - [45, 19163.0] + - - [704, 1856, 1, 3328, 704, 704, 3328, 1856] + - [37, 18777.0] + - - [448, 1024, 1, 1280, 448, 448, 1280, 1024] + - [41, 15870.0] + - - [2368, 128, 1, 256, 2368, 2368, 256, 128] + - [34, 8838.0] + - - [256, 1856, 1, 1280, 256, 256, 1280, 1856] + - [38, 16599.0] + - - [448, 704, 1, 1280, 448, 448, 1280, 704] + - [44, 14408.0] + - - [128, 3584, 1, 1280, 128, 128, 1280, 3584] + - [45, 16079.0] + - - [4288, 256, 1, 256, 4288, 4288, 256, 256] + - [37, 16058.0] + - - [5888, 64, 1, 3328, 5888, 5888, 3328, 64] + - [34, 14003.0] + - - [2944, 256, 1, 3328, 2944, 2944, 3328, 256] + - [45, 17849.0] + - - [256, 4288, 1, 1280, 256, 256, 1280, 4288] + - [32, 18715.0] + - - [1408, 448, 1, 1280, 1408, 1408, 1280, 448] + - [45, 17484.0] + - - [6784, 128, 1, 1280, 6784, 6784, 1280, 128] + - [43, 18655.0] + - - [2368, 128, 1, 3328, 2368, 2368, 3328, 128] + - [33, 15901.0] + - - [2944, 128, 1, 256, 2944, 2944, 256, 128] + - [38, 9608.0] + - - [448, 1408, 1, 256, 448, 448, 256, 1408] + - [37, 13547.0] + - - [64, 5056, 1, 3328, 64, 64, 3328, 5056] + - [31, 15992.0] + - - [2368, 256, 1, 1280, 2368, 2368, 1280, 256] + - [39, 17508.0] + - - [256, 3584, 1, 3328, 256, 256, 3328, 3584] + - [33, 20403.0] + - - [5056, 64, 1, 1280, 5056, 5056, 1280, 64] + - [34, 14564.0] + - - [1024, 704, 1, 256, 1024, 1024, 256, 704] + - [38, 13630.0] + - - [4288, 128, 1, 1280, 4288, 4288, 1280, 128] + - [46, 15852.0] + - - [5888, 64, 1, 256, 5888, 5888, 256, 64] + - [34, 9804.0] + - - [1856, 256, 1, 1280, 1856, 1856, 1280, 256] + - [44, 16296.0] + - - [64, 5888, 1, 3328, 64, 64, 3328, 5888] + - [48, 15052.0] + - - [256, 1408, 1, 3328, 256, 256, 3328, 1408] + - [37, 14470.0] + - - [6784, 128, 1, 3328, 6784, 6784, 3328, 128] + - [36, 19209.0] + - - [704, 704, 1, 3328, 704, 704, 3328, 704] + - [33, 15558.0] + - - [3584, 256, 1, 3328, 3584, 3584, 3328, 256] + - [39, 20308.0] + - - [128, 3584, 1, 3328, 128, 128, 3328, 3584] + - [45, 17047.0] + - - [128, 2944, 1, 1280, 128, 128, 1280, 2944] + - [31, 13317.0] + - - [448, 1856, 1, 1280, 448, 448, 1280, 1856] + - [37, 18125.0] + - - [3584, 128, 1, 256, 3584, 3584, 256, 128] + - [42, 11271.0] + - - [448, 1408, 1, 3328, 448, 448, 3328, 1408] + - [37, 17482.0] + - - [256, 3584, 1, 256, 256, 256, 256, 3584] + - [46, 16448.0] + - - [256, 2944, 1, 3328, 256, 256, 3328, 2944] + - [45, 17928.0] + - - [1408, 704, 1, 256, 1408, 1408, 256, 704] + - [31, 15980.0] + - - [448, 2944, 1, 3328, 448, 448, 3328, 2944] + - [31, 18998.0] + - - [64, 5888, 1, 256, 64, 64, 256, 5888] + - [31, 9647.0] + - - [448, 2368, 1, 1280, 448, 448, 1280, 2368] + - [44, 17351.0] + - - [128, 4288, 1, 3328, 128, 128, 3328, 4288] + - [31, 17268.0] + - - [256, 2368, 1, 256, 256, 256, 256, 2368] + - [32, 13219.0] + - - [1024, 448, 1, 3328, 1024, 1024, 3328, 448] + - [38, 16964.0] + - - [1856, 704, 1, 1280, 1856, 1856, 1280, 704] + - [37, 18476.0] + - - [1024, 1024, 1, 1280, 1024, 1024, 1280, 1024] + - [38, 17944.0] + - - [256, 2944, 1, 256, 256, 256, 256, 2944] + - [32, 14420.0] + - - [128, 6784, 1, 1280, 128, 128, 1280, 6784] + - [33, 18687.0] + - - [1408, 704, 1, 3328, 1408, 1408, 3328, 704] + - [37, 18593.0] + - - [128, 5888, 1, 1280, 128, 128, 1280, 5888] + - [32, 17363.0] + - - [704, 1408, 1, 3328, 704, 704, 3328, 1408] + - [37, 18780.0] + - - [6784, 128, 1, 256, 6784, 6784, 256, 128] + - [43, 15352.0] + - - [704, 448, 1, 256, 704, 704, 256, 448] + - [34, 9072.0] + - - [256, 1856, 1, 3328, 256, 256, 3328, 1856] + - [32, 17554.0] + - - [128, 4288, 1, 256, 128, 128, 256, 4288] + - [38, 12030.0] + - - [64, 6784, 1, 3328, 64, 64, 3328, 6784] + - [34, 16080.0] + - - [2944, 256, 1, 1280, 2944, 2944, 1280, 256] + - [49, 17135.0] + - - [1856, 704, 1, 256, 1856, 1856, 256, 704] + - [37, 17101.0] + - - [1408, 448, 1, 3328, 1408, 1408, 3328, 448] + - [45, 18210.0] + - - [2368, 256, 1, 256, 2368, 2368, 256, 256] + - [37, 13152.0] + - - [704, 1856, 1, 256, 704, 704, 256, 1856] + - [44, 17119.0] + - - [5888, 64, 1, 1280, 5888, 5888, 1280, 64] + - [49, 13186.0] + - - [256, 2368, 1, 1280, 256, 256, 1280, 2368] + - [39, 17437.0] + - - [2944, 448, 1, 256, 2944, 2944, 256, 448] + - [37, 17280.0] + - - [2368, 128, 1, 1280, 2368, 2368, 1280, 128] + - [37, 13846.0] + - - [64, 5056, 1, 1280, 64, 64, 1280, 5056] + - [37, 14803.0] + - - [704, 448, 1, 3328, 704, 704, 3328, 448] + - [37, 15704.0] + - - [5056, 64, 1, 3328, 5056, 5056, 3328, 64] + - [48, 15671.0] + - - [2368, 448, 1, 1280, 2368, 2368, 1280, 448] + - [32, 17681.0] + - - [1408, 256, 1, 1280, 1408, 1408, 1280, 256] + - [38, 12738.0] + - - [1856, 448, 1, 3328, 1856, 1856, 3328, 448] + - [37, 18455.0] + - - [128, 5056, 1, 256, 128, 128, 256, 5056] + - [32, 14016.0] + - - [4288, 256, 1, 1280, 4288, 4288, 1280, 256] + - [35, 18187.0] + - - [704, 704, 1, 256, 704, 704, 256, 704] + - [37, 11662.0] + - - [4288, 128, 1, 3328, 4288, 4288, 3328, 128] + - [39, 17265.0] + - - [256, 1408, 1, 1280, 256, 256, 1280, 1408] + - [34, 13610.0] + - - [6784, 64, 1, 3328, 6784, 6784, 3328, 64] + - [42, 16030.0] + - - [128, 2944, 1, 3328, 128, 128, 3328, 2944] + - [44, 14113.0] + - - [2944, 448, 1, 3328, 2944, 2944, 3328, 448] + - [37, 18409.0] + - - [2368, 448, 1, 3328, 2368, 2368, 3328, 448] + - [45, 18251.0] + - - [5056, 64, 1, 256, 5056, 5056, 256, 64] + - [41, 9184.0] + - - [128, 5056, 1, 3328, 128, 128, 3328, 5056] + - [46, 20088.0] + - - [6784, 64, 1, 256, 6784, 6784, 256, 64] + - [42, 10897.0] + - - [128, 2368, 1, 256, 128, 128, 256, 2368] + - [34, 8778.0] + - - [3584, 256, 1, 256, 3584, 3584, 256, 256] + - [46, 16334.0] + - - [128, 2944, 1, 256, 128, 128, 256, 2944] + - [37, 9764.0] + - - [3584, 128, 1, 3328, 3584, 3584, 3328, 128] + - [42, 17001.0] + - - [1024, 448, 1, 1280, 1024, 1024, 1280, 448] + - [38, 16018.0] + - - [5888, 128, 1, 3328, 5888, 5888, 3328, 128] + - [45, 17854.0] + - - [1408, 704, 1, 1280, 1408, 1408, 1280, 704] + - [44, 18277.0] + - - [448, 1408, 1, 1280, 448, 448, 1280, 1408] + - [44, 16835.0] + - - [704, 1408, 1, 1280, 704, 704, 1280, 1408] + - [31, 18230.0] + - - [448, 2944, 1, 256, 448, 448, 256, 2944] + - [31, 17333.0] + - - [448, 2368, 1, 256, 448, 448, 256, 2368] + - [44, 15626.0] + - - [64, 5056, 1, 256, 64, 64, 256, 5056] + - [48, 9043.0] + - - [5056, 128, 1, 3328, 5056, 5056, 3328, 128] + - [43, 20197.0] + - - [448, 704, 1, 256, 448, 448, 256, 704] + - [48, 9175.0] + - - [1856, 256, 1, 3328, 1856, 1856, 3328, 256] + - [38, 17139.0] + - - [2944, 128, 1, 3328, 2944, 2944, 3328, 128] + - [44, 15037.0] + - - [64, 6784, 1, 1280, 64, 64, 1280, 6784] + - [31, 15176.0] + - - [704, 1024, 1, 1280, 704, 704, 1280, 1024] + - [32, 16338.0] + - - [256, 4288, 1, 256, 256, 256, 256, 4288] + - [38, 16668.0] + - - [256, 2368, 1, 3328, 256, 256, 3328, 2368] + - [46, 19000.0] + - - [128, 3584, 1, 256, 128, 128, 256, 3584] + - [34, 11674.0] + - - [704, 448, 1, 1280, 704, 704, 1280, 448] + - [44, 14449.0] + - - [1024, 704, 1, 1280, 1024, 1024, 1280, 704] + - [38, 16572.0] + - - [256, 1856, 1, 256, 256, 256, 256, 1856] + - [40, 12163.0] + - - [704, 1856, 1, 1280, 704, 704, 1280, 1856] + - [37, 18497.0] + - - [1408, 256, 1, 3328, 1408, 1408, 3328, 256] + - [37, 14446.0] + - - [5888, 128, 1, 256, 5888, 5888, 256, 128] + - [45, 13941.0] + - - [2368, 448, 1, 256, 2368, 2368, 256, 448] + - [37, 15573.0] + - - [4288, 256, 1, 3328, 4288, 4288, 3328, 256] + - [33, 18497.0] + - - [2944, 256, 1, 256, 2944, 2944, 256, 256] + - [38, 14228.0] + - - [1408, 448, 1, 256, 1408, 1408, 256, 448] + - [38, 13662.0] + - - [6784, 64, 1, 1280, 6784, 6784, 1280, 64] + - [42, 15069.0] + - - [448, 1024, 1, 3328, 448, 448, 3328, 1024] + - [37, 16598.0] + - - [2944, 448, 1, 1280, 2944, 2944, 1280, 448] + - [44, 18630.0] + - - [5056, 128, 1, 256, 5056, 5056, 256, 128] + - [37, 13602.0] + - - [448, 1024, 1, 256, 448, 448, 256, 1024] + - [37, 11628.0] + - - [128, 5056, 1, 1280, 128, 128, 1280, 5056] + - [33, 18482.0] + - - [1408, 256, 1, 256, 1408, 1408, 256, 256] + - [47, 9858.0] + - - [128, 5888, 1, 3328, 128, 128, 3328, 5888] + - [38, 17952.0] + - - [3584, 128, 1, 1280, 3584, 3584, 1280, 128] + - [44, 15965.0] + - - [4288, 128, 1, 256, 4288, 4288, 256, 128] + - [37, 11867.0] + - - [2368, 256, 1, 3328, 2368, 2368, 3328, 256] + - [46, 18983.0] + - - [5888, 128, 1, 1280, 5888, 5888, 1280, 128] + - [45, 17245.0] + - - [256, 3584, 1, 1280, 256, 256, 1280, 3584] + - [39, 19731.0] + - - [128, 5888, 1, 256, 128, 128, 256, 5888] + - [38, 14398.0] + - - [1024, 1024, 1, 256, 1024, 1024, 256, 1024] + - [42, 15884.0] + - - [1024, 1024, 1, 1024, 1024, 1024, 1024, 1024] + - [35, 17625.0] + - - [64, 5888, 1, 1280, 64, 64, 1280, 5888] + - [37, 14212.0] + - - [704, 1024, 1, 256, 704, 704, 256, 1024] + - [37, 13490.0] + - - [704, 704, 1, 1280, 704, 704, 1280, 704] + - [37, 15040.0] + - - [128, 2368, 1, 1280, 128, 128, 1280, 2368] + - [32, 14047.0] + - - [3584, 256, 1, 1280, 3584, 3584, 1280, 256] + - [46, 19573.0] + - - [5056, 128, 1, 1280, 5056, 5056, 1280, 128] + - [36, 18474.0] + - - [448, 1856, 1, 3328, 448, 448, 3328, 1856] + - [37, 18485.0] + - - [1024, 448, 1, 256, 1024, 1024, 256, 448] + - [38, 11358.0] + - - [2944, 128, 1, 1280, 2944, 2944, 1280, 128] + - [45, 13259.0] + - - [256, 2944, 1, 1280, 256, 256, 1280, 2944] + - [38, 17313.0] + - - [704, 1024, 1, 3328, 704, 704, 3328, 1024] + - [34, 18627.0] + - - [1856, 448, 1, 1280, 1856, 1856, 1280, 448] + - [44, 18107.0] + - - [128, 6784, 1, 256, 128, 128, 256, 6784] + - [33, 15545.0] + - - [704, 1408, 1, 256, 704, 704, 256, 1408] + - [44, 16101.0] + - - [256, 1408, 1, 256, 256, 256, 256, 1408] + - [47, 9858.0] + - - [448, 2944, 1, 1280, 448, 448, 1280, 2944] + - [37, 18687.0] + - - [1856, 256, 1, 256, 1856, 1856, 256, 256] + - [37, 11972.0] + - - [128, 2368, 1, 3328, 128, 128, 3328, 2368] + - [39, 15610.0] + - - [448, 2368, 1, 3328, 448, 448, 3328, 2368] + - [46, 17828.0] + - - [1856, 448, 1, 256, 1856, 1856, 256, 448] + - [31, 14660.0] + - - [1024, 704, 1, 3328, 1024, 1024, 3328, 704] + - [45, 17159.0] + - - [128, 4288, 1, 1280, 128, 128, 1280, 4288] + - [46, 15830.0] + - - [448, 704, 1, 3328, 448, 448, 3328, 704] + - [37, 15694.0] + - - [448, 1856, 1, 256, 448, 448, 256, 1856] + - [37, 15139.0] + - - [1856, 704, 1, 3328, 1856, 1856, 3328, 704] + - [44, 18633.0] + - - [2368, 64, 1, 3328, 2368, 2368, 3328, 64] + - [80, 8628.0] + - - [1408, 64, 1, 1280, 1408, 1408, 1280, 64] + - [57, 5325.0] + - - [2944, 64, 1, 256, 2944, 2944, 256, 64] + - [57, 6644.0] + - - [1024, 256, 1, 3328, 1024, 1024, 3328, 256] + - [62, 8818.0] + - - [1856, 64, 1, 1280, 1856, 1856, 1280, 64] + - [57, 6812.0] + - - [704, 128, 1, 1280, 704, 704, 1280, 128] + - [78, 5660.0] + - - [4288, 64, 1, 3328, 4288, 4288, 3328, 64] + - [67, 8979.0] + - - [1856, 128, 1, 256, 1856, 1856, 256, 128] + - [57, 8331.0] + - - [2944, 64, 1, 1280, 2944, 2944, 1280, 64] + - [57, 7970.0] + - - [64, 3584, 1, 3328, 64, 64, 3328, 3584] + - [62, 9794.0] + - - [1024, 256, 1, 256, 1024, 1024, 256, 256] + - [71, 7609.0] + - - [448, 448, 1, 256, 448, 448, 256, 448] + - [71, 7298.0] + - - [128, 1024, 1, 3328, 128, 128, 3328, 1024] + - [62, 7900.0] + - - [64, 1856, 1, 1280, 64, 64, 1280, 1856] + - [74, 6293.0] + - - [1024, 128, 1, 1280, 1024, 1024, 1280, 128] + - [57, 7739.0] + - - [448, 256, 1, 3328, 448, 448, 3328, 256] + - [62, 7079.0] + - - [128, 704, 1, 1280, 128, 128, 1280, 704] + - [80, 5248.0] + - - [1856, 128, 1, 3328, 1856, 1856, 3328, 128] + - [57, 9960.0] + - - [256, 448, 1, 256, 256, 256, 256, 448] + - [78, 5281.0] + - - [448, 448, 1, 3328, 448, 448, 3328, 448] + - [71, 8861.0] + - - [1408, 128, 1, 1280, 1408, 1408, 1280, 128] + - [71, 7639.0] + - - [128, 1856, 1, 1280, 128, 128, 1280, 1856] + - [62, 9822.0] + - - [64, 1408, 1, 3328, 64, 64, 3328, 1408] + - [69, 5030.0] + - - [256, 448, 1, 3328, 256, 256, 3328, 448] + - [62, 6887.0] + - - [64, 2368, 1, 1280, 64, 64, 1280, 2368] + - [74, 7950.0] + - - [2368, 64, 1, 256, 2368, 2368, 256, 64] + - [57, 6666.0] + - - [4288, 64, 1, 1280, 4288, 4288, 1280, 64] + - [57, 8875.0] + - - [128, 1024, 1, 1280, 128, 128, 1280, 1024] + - [67, 7339.0] + - - [1856, 64, 1, 256, 1856, 1856, 256, 64] + - [78, 5279.0] + - - [704, 128, 1, 256, 704, 704, 256, 128] + - [78, 4225.0] + - - [448, 256, 1, 1280, 448, 448, 1280, 256] + - [65, 6905.0] + - - [256, 1024, 1, 256, 256, 256, 256, 1024] + - [71, 7592.0] + - - [1856, 128, 1, 1280, 1856, 1856, 1280, 128] + - [57, 9867.0] + - - [64, 3584, 1, 256, 64, 64, 256, 3584] + - [57, 7788.0] + - - [64, 1856, 1, 256, 64, 64, 256, 1856] + - [74, 4985.0] + - - [256, 1024, 1, 1280, 256, 256, 1280, 1024] + - [63, 9598.0] + - - [3584, 64, 1, 1280, 3584, 3584, 1280, 64] + - [57, 9526.0] + - - [1408, 128, 1, 3328, 1408, 1408, 3328, 128] + - [67, 7842.0] + - - [64, 4288, 1, 3328, 64, 64, 3328, 4288] + - [71, 7975.0] + - - [256, 704, 1, 256, 256, 256, 256, 704] + - [71, 6498.0] + - - [128, 1024, 1, 256, 128, 128, 256, 1024] + - [71, 5949.0] + - - [64, 2944, 1, 256, 64, 64, 256, 2944] + - [57, 6248.0] + - - [64, 1408, 1, 1280, 64, 64, 1280, 1408] + - [63, 4826.0] + - - [704, 128, 1, 3328, 704, 704, 3328, 128] + - [57, 5505.0] + - - [1408, 128, 1, 256, 1408, 1408, 256, 128] + - [78, 6480.0] + - - [64, 2944, 1, 1280, 64, 64, 1280, 2944] + - [71, 7730.0] + - - [704, 256, 1, 1280, 704, 704, 1280, 256] + - [62, 7752.0] + - - [256, 448, 1, 1280, 256, 256, 1280, 448] + - [80, 6685.0] + - - [64, 2368, 1, 3328, 64, 64, 3328, 2368] + - [52, 8306.0] + - - [256, 704, 1, 3328, 256, 256, 3328, 704] + - [71, 7984.0] + - - [64, 2944, 1, 3328, 64, 64, 3328, 2944] + - [57, 8047.0] + - - [128, 1408, 1, 256, 128, 128, 256, 1408] + - [71, 6535.0] + - - [1408, 64, 1, 256, 1408, 1408, 256, 64] + - [78, 4164.0] + - - [64, 2368, 1, 256, 64, 64, 256, 2368] + - [74, 6319.0] + - - [1024, 128, 1, 3328, 1024, 1024, 3328, 128] + - [80, 7732.0] + - - [2368, 64, 1, 1280, 2368, 2368, 1280, 64] + - [57, 8531.0] + - - [4288, 64, 1, 256, 4288, 4288, 256, 64] + - [57, 7824.0] + - - [64, 4288, 1, 1280, 64, 64, 1280, 4288] + - [74, 7700.0] + - - [1408, 64, 1, 3328, 1408, 1408, 3328, 64] + - [67, 5453.0] + - - [448, 448, 1, 1280, 448, 448, 1280, 448] + - [62, 8609.0] + - - [1024, 256, 1, 1280, 1024, 1024, 1280, 256] + - [80, 8635.0] + - - [3584, 64, 1, 3328, 3584, 3584, 3328, 64] + - [80, 9880.0] + - - [256, 1024, 1, 3328, 256, 256, 3328, 1024] + - [57, 8669.0] + - - [1856, 64, 1, 3328, 1856, 1856, 3328, 64] + - [57, 6923.0] + - - [448, 256, 1, 256, 448, 448, 256, 256] + - [65, 5319.0] + - - [128, 704, 1, 256, 128, 128, 256, 704] + - [56, 4179.0] + - - [1024, 128, 1, 256, 1024, 1024, 256, 128] + - [57, 5866.0] + - - [64, 3584, 1, 1280, 64, 64, 1280, 3584] + - [71, 9526.0] + - - [3584, 64, 1, 256, 3584, 3584, 256, 64] + - [80, 7829.0] + - - [64, 1856, 1, 3328, 64, 64, 3328, 1856] + - [74, 6554.0] + - - [2944, 64, 1, 3328, 2944, 2944, 3328, 64] + - [57, 8199.0] + - - [128, 1408, 1, 3328, 128, 128, 3328, 1408] + - [77, 8450.0] + - - [128, 704, 1, 3328, 128, 128, 3328, 704] + - [59, 5474.0] + - - [128, 1856, 1, 256, 128, 128, 256, 1856] + - [62, 8286.0] + - - [64, 4288, 1, 256, 64, 64, 256, 4288] + - [74, 7011.0] + - - [704, 256, 1, 3328, 704, 704, 3328, 256] + - [70, 8132.0] + - - [256, 704, 1, 1280, 256, 256, 1280, 704] + - [62, 7746.0] + - - [64, 1408, 1, 256, 64, 64, 256, 1408] + - [55, 4105.0] + - - [128, 1408, 1, 1280, 128, 128, 1280, 1408] + - [71, 7767.0] + - - [128, 1856, 1, 3328, 128, 128, 3328, 1856] + - [80, 10178.0] + - - [704, 256, 1, 256, 704, 704, 256, 256] + - [78, 6554.0] + - - [704, 64, 1, 3328, 704, 704, 3328, 64] + - [58, 4282.0] + - - [448, 64, 1, 1280, 448, 448, 1280, 64] + - [68, 3048.0] + - - [64, 1024, 1, 1280, 64, 64, 1280, 1024] + - [74, 4772.0] + - - [64, 704, 1, 1280, 64, 64, 1280, 704] + - [81, 3918.0] + - - [128, 448, 1, 256, 128, 128, 256, 448] + - [66, 3352.0] + - - [256, 256, 1, 3328, 256, 256, 3328, 256] + - [74, 5176.0] + - - [64, 448, 1, 1280, 64, 64, 1280, 448] + - [68, 3038.0] + - - [64, 64, 1, 3328, 64, 64, 3328, 64] + - [53, 547.0] + - - [256, 64, 1, 1280, 256, 256, 1280, 64] + - [68, 1836.0] + - - [128, 448, 1, 1280, 128, 128, 1280, 448] + - [64, 4223.0] + - - [704, 64, 1, 1280, 704, 704, 1280, 64] + - [72, 3614.0] + - - [448, 64, 1, 3328, 448, 448, 3328, 64] + - [58, 3259.0] + - - [64, 128, 1, 3328, 64, 64, 3328, 128] + - [53, 1055.0] + - - [128, 128, 1, 3328, 128, 128, 3328, 128] + - [53, 1991.0] + - - [256, 128, 1, 256, 256, 256, 256, 128] + - [81, 2196.0] + - - [64, 448, 1, 3328, 64, 64, 3328, 448] + - [50, 3145.0] + - - [256, 64, 1, 256, 256, 256, 256, 64] + - [51, 1165.0] + - - [256, 128, 1, 1280, 256, 256, 1280, 128] + - [72, 3112.0] + - - [128, 64, 1, 1280, 128, 128, 1280, 64] + - [53, 962.0] + - - [64, 1024, 1, 256, 64, 64, 256, 1024] + - [54, 3631.0] + - - [64, 704, 1, 256, 64, 64, 256, 704] + - [81, 2800.0] + - - [448, 128, 1, 256, 448, 448, 256, 128] + - [66, 3306.0] + - - [256, 256, 1, 256, 256, 256, 256, 256] + - [66, 3383.0] + - - [448, 128, 1, 3328, 448, 448, 3328, 128] + - [74, 4533.0] + - - [128, 256, 1, 1280, 128, 128, 1280, 256] + - [68, 3388.0] + - - [64, 256, 1, 1280, 64, 64, 1280, 256] + - [68, 1846.0] + - - [64, 448, 1, 256, 64, 64, 256, 448] + - [81, 1921.0] + - - [64, 64, 1, 256, 64, 64, 256, 64] + - [60, 293.0] + - - [128, 256, 1, 3328, 128, 128, 3328, 256] + - [68, 3613.0] + - - [64, 128, 1, 1280, 64, 64, 1280, 128] + - [61, 948.0] + - - [128, 128, 1, 1280, 128, 128, 1280, 128] + - [68, 1833.0] + - - [128, 256, 1, 256, 128, 128, 256, 256] + - [51, 2173.0] + - - [64, 128, 1, 256, 64, 64, 256, 128] + - [60, 583.0] + - - [704, 64, 1, 256, 704, 704, 256, 64] + - [72, 2658.0] + - - [64, 64, 1, 1280, 64, 64, 1280, 64] + - [53, 480.0] + - - [128, 64, 1, 3328, 128, 128, 3328, 64] + - [75, 1049.0] + - - [448, 64, 1, 256, 448, 448, 256, 64] + - [68, 1963.0] + - - [1024, 64, 1, 256, 1024, 1024, 256, 64] + - [76, 3554.0] + - - [128, 64, 1, 256, 128, 128, 256, 64] + - [73, 589.0] + - - [1024, 64, 1, 1280, 1024, 1024, 1280, 64] + - [74, 4702.0] + - - [64, 1024, 1, 3328, 64, 64, 3328, 1024] + - [74, 5173.0] + - - [448, 128, 1, 1280, 448, 448, 1280, 128] + - [79, 4258.0] + - - [1024, 64, 1, 3328, 1024, 1024, 3328, 64] + - [52, 5127.0] + - - [64, 256, 1, 3328, 64, 64, 3328, 256] + - [53, 1997.0] + - - [256, 256, 1, 1280, 256, 256, 1280, 256] + - [76, 4772.0] + - - [256, 128, 1, 3328, 256, 256, 3328, 128] + - [81, 3702.0] + - - [64, 256, 1, 256, 64, 64, 256, 256] + - [51, 1152.0] + - - [64, 704, 1, 3328, 64, 64, 3328, 704] + - [81, 4243.0] + - - [128, 448, 1, 3328, 128, 128, 3328, 448] + - [74, 4524.0] + - - [256, 64, 1, 3328, 256, 256, 3328, 64] + - [61, 1973.0] + - - [128, 128, 1, 256, 128, 128, 256, 128] + - [51, 1152.0] +- null +- null +- DeviceEfficiency +... diff --git a/library/src/blas3/Tensile/Logic/asm_full/navi22_Cijk_Alik_Bjlk_SB.yaml b/library/src/blas3/Tensile/Logic/asm_full/navi22_Cijk_Alik_Bjlk_SB.yaml new file mode 100644 index 000000000..6cfec9614 --- /dev/null +++ b/library/src/blas3/Tensile/Logic/asm_full/navi22_Cijk_Alik_Bjlk_SB.yaml @@ -0,0 +1,38740 @@ +--- +- {MinimumRequiredVersion: 4.28.0} +- navi22 +- gfx1031 +- [Device 73df] +- AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] +- - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 64 + LSPA: 64 + LSPB: 8 + LVCA: 2 + LVCB: 16 + LVPA: 16 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 0 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT128x64x8_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 128 + LSPA: 64 + LSPB: 4 + LVCA: 2 + LVCB: 32 + LVPA: 16 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 1 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT128x128x8_SN_SU0_SUM0_TT8_16_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 128 + LSPA: 128 + LSPB: 8 + LVCA: 2 + LVCB: 32 + LVPA: 32 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 2 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT128x128x8_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 32 + LSPB: 4 + LVCA: 4 + LVCB: 32 + LVPA: 8 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 3 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT128x128x16_SN_SU0_SUM0_TT8_16_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 64 + LSPB: 8 + LVCA: 4 + LVCB: 32 + LVPA: 16 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 4 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT128x128x16_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 256 + LSPA: 64 + LSPB: 4 + LVCA: 4 + LVCB: 64 + LVPA: 16 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 256 + MacroTileA: 128 + MacroTileB: 256 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 5 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT128x256x16_SN_SU0_SUM0_TT8_16_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 64 + LSPA: 64 + LSPB: 8 + LVCA: 2 + LVCB: 16 + LVPA: 16 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 6 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT128x64x8_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 128 + LSPA: 64 + LSPB: 4 + LVCA: 2 + LVCB: 32 + LVPA: 16 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 7 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT128x128x8_SN_SU32_SUM3_TT8_16_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 128 + LSPA: 128 + LSPB: 8 + LVCA: 2 + LVCB: 32 + LVPA: 32 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 8 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT128x128x8_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 64 + LSPA: 32 + LSPB: 8 + LVCA: 4 + LVCB: 16 + LVPA: 8 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 9 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT128x64x16_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 32 + LSPB: 4 + LVCA: 4 + LVCB: 32 + LVPA: 8 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 10 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT128x128x16_SN_SU32_SUM3_TT8_16_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 64 + LSPB: 8 + LVCA: 4 + LVCB: 32 + LVPA: 16 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 11 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT128x128x16_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 64 + LSPA: 64 + LSPB: 8 + LVCA: 2 + LVCB: 16 + LVPA: 16 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 12 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT128x64x8_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 128 + LSPA: 64 + LSPB: 4 + LVCA: 2 + LVCB: 32 + LVPA: 16 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 13 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT128x128x8_SN_SU0_SUM0_TT8_16_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 128 + LSPA: 128 + LSPB: 8 + LVCA: 2 + LVCB: 32 + LVPA: 32 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 14 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT128x128x8_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 64 + LSPA: 32 + LSPB: 8 + LVCA: 4 + LVCB: 16 + LVPA: 8 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 15 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT128x64x16_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 32 + LSPB: 4 + LVCA: 4 + LVCB: 32 + LVPA: 8 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 16 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT128x128x16_SN_SU0_SUM0_TT8_16_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 64 + LSPB: 8 + LVCA: 4 + LVCB: 32 + LVPA: 16 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 17 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT128x128x16_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 256 + LSPA: 64 + LSPB: 4 + LVCA: 4 + LVCB: 64 + LVPA: 16 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 256 + MacroTileA: 128 + MacroTileB: 256 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 18 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT128x256x16_SN_SU0_SUM0_TT8_16_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 64 + LSPA: 64 + LSPB: 8 + LVCA: 2 + LVCB: 16 + LVPA: 16 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 19 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT128x64x8_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 128 + LSPA: 64 + LSPB: 4 + LVCA: 2 + LVCB: 32 + LVPA: 16 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 20 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT128x128x8_SN_SU32_SUM3_TT8_16_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 128 + LSPA: 128 + LSPB: 8 + LVCA: 2 + LVCB: 32 + LVPA: 32 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 21 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT128x128x8_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 32 + LSPB: 4 + LVCA: 4 + LVCB: 32 + LVPA: 8 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 22 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT128x128x16_SN_SU32_SUM3_TT8_16_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 64 + LSPB: 8 + LVCA: 4 + LVCB: 32 + LVPA: 16 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 23 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT128x128x16_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 256 + LSPA: 64 + LSPB: 4 + LVCA: 4 + LVCB: 64 + LVPA: 16 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 256 + MacroTileA: 128 + MacroTileB: 256 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 24 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT128x256x16_SN_SU32_SUM3_TT8_16_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 64 + LSPA: 64 + LSPB: 8 + LVCA: 2 + LVCB: 16 + LVPA: 16 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 25 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT128x64x8_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 128 + LSPA: 64 + LSPB: 4 + LVCA: 2 + LVCB: 32 + LVPA: 16 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 26 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT128x128x8_SN_SU0_SUM0_TT8_16_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 128 + LSPA: 128 + LSPB: 8 + LVCA: 2 + LVCB: 32 + LVPA: 32 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 27 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT128x128x8_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 64 + LSPA: 32 + LSPB: 8 + LVCA: 4 + LVCB: 16 + LVPA: 8 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 28 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT128x64x16_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 32 + LSPB: 4 + LVCA: 4 + LVCB: 32 + LVPA: 8 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 29 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT128x128x16_SN_SU0_SUM0_TT8_16_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 64 + LSPB: 8 + LVCA: 4 + LVCB: 32 + LVPA: 16 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 30 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT128x128x16_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 256 + LSPA: 64 + LSPB: 4 + LVCA: 4 + LVCB: 64 + LVPA: 16 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 256 + MacroTileA: 128 + MacroTileB: 256 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 31 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT128x256x16_SN_SU0_SUM0_TT8_16_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 64 + LSPA: 64 + LSPB: 8 + LVCA: 2 + LVCB: 16 + LVPA: 16 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 32 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT128x64x8_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 128 + LSPA: 64 + LSPB: 4 + LVCA: 2 + LVCB: 32 + LVPA: 16 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 33 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT128x128x8_SN_SU32_SUM3_TT8_16_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 128 + LSPA: 128 + LSPB: 8 + LVCA: 2 + LVCB: 32 + LVPA: 32 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 34 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT128x128x8_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 64 + LSPA: 32 + LSPB: 8 + LVCA: 4 + LVCB: 16 + LVPA: 8 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 35 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT128x64x16_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 32 + LSPB: 4 + LVCA: 4 + LVCB: 32 + LVPA: 8 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 36 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT128x128x16_SN_SU32_SUM3_TT8_16_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 64 + LSPB: 8 + LVCA: 4 + LVCB: 32 + LVPA: 16 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 37 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT128x128x16_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 64 + LSPA: 32 + LSPB: 4 + LVCA: 2 + LVCB: 16 + LVPA: 8 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 38 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT64x64x8_SN_SU0_SUM0_TT8_8_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 64 + LSPA: 64 + LSPB: 8 + LVCA: 2 + LVCB: 16 + LVPA: 16 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 39 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT64x64x8_SN_SU0_SUM0_TT4_8_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 64 + LSPA: 64 + LSPB: 8 + LVCA: 2 + LVCB: 16 + LVPA: 16 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 40 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT128x64x8_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 128 + LSPA: 128 + LSPB: 8 + LVCA: 2 + LVCB: 32 + LVPA: 32 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 41 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT128x128x8_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 64 + LSPA: 16 + LSPB: 4 + LVCA: 4 + LVCB: 16 + LVPA: 4 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 42 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT32x64x16_SN_SU0_SUM0_TT4_8_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 64 + LSPA: 16 + LSPB: 4 + LVCA: 4 + LVCB: 16 + LVPA: 4 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 43 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT64x64x16_SN_SU0_SUM0_TT8_8_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 64 + LSPA: 32 + LSPB: 8 + LVCA: 4 + LVCB: 16 + LVPA: 8 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 44 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT64x64x16_SN_SU0_SUM0_TT4_8_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 64 + LSPA: 32 + LSPB: 8 + LVCA: 4 + LVCB: 16 + LVPA: 8 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 45 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT128x64x16_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 64 + LSPB: 8 + LVCA: 4 + LVCB: 32 + LVPA: 16 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 46 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT128x128x16_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 64 + LSPA: 32 + LSPB: 4 + LVCA: 2 + LVCB: 16 + LVPA: 8 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 47 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT64x64x8_SN_SU32_SUM3_TT8_8_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 64 + LSPA: 64 + LSPB: 8 + LVCA: 2 + LVCB: 16 + LVPA: 16 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 48 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT64x64x8_SN_SU32_SUM3_TT4_8_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 64 + LSPA: 64 + LSPB: 8 + LVCA: 2 + LVCB: 16 + LVPA: 16 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 49 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT128x64x8_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 128 + LSPA: 64 + LSPB: 8 + LVCA: 4 + LVCB: 32 + LVPA: 32 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 50 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT64x128x8_SN_SU32_SUM3_TT4_8_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 128 + LSPA: 128 + LSPB: 8 + LVCA: 2 + LVCB: 32 + LVPA: 32 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 51 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT128x128x8_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 64 + LSPA: 16 + LSPB: 4 + LVCA: 4 + LVCB: 16 + LVPA: 4 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 52 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT64x64x16_SN_SU32_SUM3_TT8_8_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 64 + LSPA: 32 + LSPB: 8 + LVCA: 4 + LVCB: 16 + LVPA: 8 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 53 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT64x64x16_SN_SU32_SUM3_TT4_8_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 64 + LSPA: 32 + LSPB: 8 + LVCA: 4 + LVCB: 16 + LVPA: 8 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 54 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT128x64x16_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 64 + LSPB: 8 + LVCA: 4 + LVCB: 32 + LVPA: 16 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 55 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT128x128x16_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 64 + LSPA: 32 + LSPB: 4 + LVCA: 2 + LVCB: 16 + LVPA: 8 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 256 + LdsOffsetB_Blk: 1280 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 56 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT32x64x8_SN_SU0_SUM0_TT4_8_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 64 + LSPA: 32 + LSPB: 4 + LVCA: 2 + LVCB: 16 + LVPA: 8 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 57 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT64x64x8_SN_SU0_SUM0_TT8_8_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 64 + LSPA: 64 + LSPB: 8 + LVCA: 2 + LVCB: 16 + LVPA: 16 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 58 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT64x64x8_SN_SU0_SUM0_TT4_8_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 64 + LSPA: 64 + LSPB: 8 + LVCA: 2 + LVCB: 16 + LVPA: 16 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 59 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT128x64x8_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 128 + LSPA: 128 + LSPB: 8 + LVCA: 2 + LVCB: 32 + LVPA: 32 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 60 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT128x128x8_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 64 + LSPA: 16 + LSPB: 4 + LVCA: 4 + LVCB: 16 + LVPA: 4 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 61 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT64x64x16_SN_SU0_SUM0_TT8_8_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 64 + LSPA: 32 + LSPB: 8 + LVCA: 4 + LVCB: 16 + LVPA: 8 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 62 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT64x64x16_SN_SU0_SUM0_TT4_8_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 64 + LSPA: 32 + LSPB: 8 + LVCA: 4 + LVCB: 16 + LVPA: 8 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 63 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT128x64x16_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 64 + LSPB: 8 + LVCA: 4 + LVCB: 32 + LVPA: 16 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 64 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT128x128x16_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 128 + LSPA: 32 + LSPB: 8 + LVCA: 8 + LVCB: 32 + LVPA: 8 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 65 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT64x128x32_SN_SU0_SUM0_TT4_8_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 64 + LSPA: 32 + LSPB: 4 + LVCA: 2 + LVCB: 16 + LVPA: 8 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 256 + LdsOffsetB_Blk: 1280 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 66 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT32x64x8_SN_SU32_SUM3_TT4_8_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 64 + LSPA: 32 + LSPB: 4 + LVCA: 2 + LVCB: 16 + LVPA: 8 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 67 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT64x64x8_SN_SU32_SUM3_TT8_8_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 64 + LSPA: 64 + LSPB: 8 + LVCA: 2 + LVCB: 16 + LVPA: 16 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 68 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT128x64x8_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 128 + LSPA: 128 + LSPB: 8 + LVCA: 2 + LVCB: 32 + LVPA: 32 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 69 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT128x128x8_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 64 + LSPA: 16 + LSPB: 4 + LVCA: 4 + LVCB: 16 + LVPA: 4 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 70 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT64x64x16_SN_SU32_SUM3_TT8_8_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 64 + LSPA: 32 + LSPB: 8 + LVCA: 4 + LVCB: 16 + LVPA: 8 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 71 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT64x64x16_SN_SU32_SUM3_TT4_8_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 64 + LSPA: 32 + LSPB: 8 + LVCA: 4 + LVCB: 16 + LVPA: 8 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 72 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT128x64x16_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 64 + LSPB: 8 + LVCA: 4 + LVCB: 32 + LVPA: 16 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 73 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT64x128x16_SN_SU32_SUM3_TT4_8_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 64 + LSPB: 8 + LVCA: 4 + LVCB: 32 + LVPA: 16 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 74 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT128x128x16_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 64 + LSPA: 32 + LSPB: 4 + LVCA: 2 + LVCB: 16 + LVPA: 8 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 256 + LdsOffsetB_Blk: 1280 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 75 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT32x64x8_SN_SU0_SUM0_TT4_8_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 64 + LSPA: 32 + LSPB: 4 + LVCA: 2 + LVCB: 16 + LVPA: 8 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 76 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT64x64x8_SN_SU0_SUM0_TT8_8_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 64 + LSPA: 64 + LSPB: 8 + LVCA: 2 + LVCB: 16 + LVPA: 16 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 77 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT64x64x8_SN_SU0_SUM0_TT4_8_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 64 + LSPA: 64 + LSPB: 8 + LVCA: 2 + LVCB: 16 + LVPA: 16 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 78 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT128x64x8_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 128 + LSPA: 128 + LSPB: 8 + LVCA: 2 + LVCB: 32 + LVPA: 32 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 79 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT128x128x8_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 64 + LSPA: 16 + LSPB: 4 + LVCA: 4 + LVCB: 16 + LVPA: 4 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 80 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT64x64x16_SN_SU0_SUM0_TT8_8_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 64 + LSPA: 32 + LSPB: 8 + LVCA: 4 + LVCB: 16 + LVPA: 8 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 81 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT128x64x16_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 64 + LSPB: 8 + LVCA: 4 + LVCB: 32 + LVPA: 16 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 82 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT128x128x16_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 64 + LSPA: 32 + LSPB: 4 + LVCA: 2 + LVCB: 16 + LVPA: 8 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 83 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT64x64x8_SN_SU32_SUM3_TT8_8_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 64 + LSPA: 64 + LSPB: 8 + LVCA: 2 + LVCB: 16 + LVPA: 16 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 84 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT64x64x8_SN_SU32_SUM3_TT4_8_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 64 + LSPA: 64 + LSPB: 8 + LVCA: 2 + LVCB: 16 + LVPA: 16 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 85 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT128x64x8_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 128 + LSPA: 128 + LSPB: 8 + LVCA: 2 + LVCB: 32 + LVPA: 32 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 86 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT128x128x8_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 64 + LSPA: 16 + LSPB: 4 + LVCA: 4 + LVCB: 16 + LVPA: 4 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 87 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT64x64x16_SN_SU32_SUM3_TT8_8_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 64 + LSPA: 32 + LSPB: 8 + LVCA: 4 + LVCB: 16 + LVPA: 8 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 88 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT128x64x16_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 64 + LSPB: 8 + LVCA: 4 + LVCB: 32 + LVPA: 16 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 89 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT128x128x16_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 16 + LSPA: 8 + LSPB: 4 + LVCA: 8 + LVCB: 16 + LVPA: 8 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 512 + LdsNumElementsAlignedA: 128 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 256 + LdsOffsetB: 128 + LdsOffsetB_Blk: 384 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 90 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT16x16x8_SN_SU0_SUM0_TT2_2_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 32 + LSPA: 8 + LSPB: 2 + LVCA: 8 + LVCB: 32 + LVPA: 8 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 91 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT32x32x8_SN_SU0_SUM0_TT4_4_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 16 + LSPA: 16 + LSPB: 8 + LVCA: 8 + LVCB: 16 + LVPA: 16 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 896 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 92 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT32x16x8_SN_SU0_SUM0_TT2_2_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 32 + LSPA: 16 + LSPB: 4 + LVCA: 8 + LVCB: 32 + LVPA: 16 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 93 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT64x32x8_SN_SU0_SUM0_TT4_4_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 32 + LSPA: 32 + LSPB: 8 + LVCA: 8 + LVCB: 32 + LVPA: 32 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 94 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT32x32x8_SN_SU0_SUM0_TT2_2_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 64 + LSPA: 32 + LSPB: 4 + LVCA: 8 + LVCB: 64 + LVPA: 32 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 95 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT64x64x8_SN_SU0_SUM0_TT4_4_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 96 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT16x16x16_SN_SU0_SUM0_TT2_2_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 97 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT32x16x16_SN_SU0_SUM0_TT2_2_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 32 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 + LVPA: 16 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 98 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT32x32x16_SN_SU0_SUM0_TT2_2_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 2 + LSPB: 4 + LVCA: 32 + LVCB: 16 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 99 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT16x16x32_SN_SU0_SUM0_TT2_2_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 16 + LSPA: 8 + LSPB: 4 + LVCA: 8 + LVCB: 16 + LVPA: 8 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 512 + LdsNumElementsAlignedA: 128 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 256 + LdsOffsetB: 128 + LdsOffsetB_Blk: 384 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 100 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT16x16x8_SN_SU32_SUM3_TT2_2_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 32 + LSPA: 8 + LSPB: 2 + LVCA: 8 + LVCB: 32 + LVPA: 8 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 101 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT32x32x8_SN_SU32_SUM3_TT4_4_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 16 + LSPA: 16 + LSPB: 8 + LVCA: 8 + LVCB: 16 + LVPA: 16 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 896 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 102 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT32x16x8_SN_SU32_SUM3_TT2_2_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 32 + LSPA: 16 + LSPB: 4 + LVCA: 8 + LVCB: 32 + LVPA: 16 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 103 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT64x32x8_SN_SU32_SUM3_TT4_4_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 32 + LSPA: 32 + LSPB: 8 + LVCA: 8 + LVCB: 32 + LVPA: 32 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 104 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT32x32x8_SN_SU32_SUM3_TT2_2_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 64 + LSPA: 32 + LSPB: 4 + LVCA: 8 + LVCB: 64 + LVPA: 32 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 105 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT64x64x8_SN_SU32_SUM3_TT4_4_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 106 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT16x16x16_SN_SU32_SUM3_TT2_2_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 32 + LSPA: 4 + LSPB: 2 + LVCA: 16 + LVCB: 32 + LVPA: 4 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 107 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT32x32x16_SN_SU32_SUM3_TT4_4_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 32 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 + LVPA: 16 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 108 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT32x32x16_SN_SU32_SUM3_TT2_2_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 64 + LSPA: 16 + LSPB: 4 + LVCA: 16 + LVCB: 64 + LVPA: 16 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 109 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT64x64x16_SN_SU32_SUM3_TT4_4_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 2 + LSPB: 4 + LVCA: 32 + LVCB: 16 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 110 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT16x16x32_SN_SU32_SUM3_TT2_2_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 16 + LSPA: 8 + LSPB: 4 + LVCA: 8 + LVCB: 16 + LVPA: 8 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 512 + LdsNumElementsAlignedA: 128 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 256 + LdsOffsetB: 128 + LdsOffsetB_Blk: 384 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 111 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT16x16x8_SN_SU0_SUM0_TT2_2_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 32 + LSPA: 8 + LSPB: 2 + LVCA: 8 + LVCB: 32 + LVPA: 8 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 112 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT32x32x8_SN_SU0_SUM0_TT4_4_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 32 + LSPA: 16 + LSPB: 4 + LVCA: 8 + LVCB: 32 + LVPA: 16 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 113 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT64x32x8_SN_SU0_SUM0_TT4_4_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 32 + LSPA: 32 + LSPB: 8 + LVCA: 8 + LVCB: 32 + LVPA: 32 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 114 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT32x32x8_SN_SU0_SUM0_TT2_2_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 64 + LSPA: 32 + LSPB: 4 + LVCA: 8 + LVCB: 64 + LVPA: 32 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 115 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT64x64x8_SN_SU0_SUM0_TT4_4_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 116 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT16x16x16_SN_SU0_SUM0_TT2_2_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 2 + LSPB: 4 + LVCA: 32 + LVCB: 16 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 117 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT16x16x32_SN_SU0_SUM0_TT2_2_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 118 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT32x32x32_SN_SU0_SUM0_TT2_2_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 16 + LSPA: 8 + LSPB: 4 + LVCA: 8 + LVCB: 16 + LVPA: 8 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 512 + LdsNumElementsAlignedA: 128 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 256 + LdsOffsetB: 128 + LdsOffsetB_Blk: 384 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 119 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT16x16x8_SN_SU32_SUM3_TT2_2_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 32 + LSPA: 8 + LSPB: 2 + LVCA: 8 + LVCB: 32 + LVPA: 8 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 120 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT32x32x8_SN_SU32_SUM3_TT4_4_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 16 + LSPA: 16 + LSPB: 8 + LVCA: 8 + LVCB: 16 + LVPA: 16 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 896 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 121 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT32x16x8_SN_SU32_SUM3_TT2_2_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 32 + LSPA: 16 + LSPB: 4 + LVCA: 8 + LVCB: 32 + LVPA: 16 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 122 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT64x32x8_SN_SU32_SUM3_TT4_4_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 32 + LSPA: 32 + LSPB: 8 + LVCA: 8 + LVCB: 32 + LVPA: 32 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 123 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT32x32x8_SN_SU32_SUM3_TT2_2_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 64 + LSPA: 32 + LSPB: 4 + LVCA: 8 + LVCB: 64 + LVPA: 32 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 124 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT64x64x8_SN_SU32_SUM3_TT4_4_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 125 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT16x16x16_SN_SU32_SUM3_TT2_2_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 126 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT32x16x16_SN_SU32_SUM3_TT2_2_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 2 + LSPB: 4 + LVCA: 32 + LVCB: 16 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 127 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT16x16x32_SN_SU32_SUM3_TT2_2_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 32 + LSPA: 8 + LSPB: 2 + LVCA: 8 + LVCB: 32 + LVPA: 8 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 128 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT32x32x8_SN_SU0_SUM0_TT4_4_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 16 + LSPA: 16 + LSPB: 8 + LVCA: 8 + LVCB: 16 + LVPA: 16 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 896 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 129 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT32x16x8_SN_SU0_SUM0_TT2_2_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 32 + LSPA: 16 + LSPB: 4 + LVCA: 8 + LVCB: 32 + LVPA: 16 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 130 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT64x32x8_SN_SU0_SUM0_TT4_4_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 64 + LSPA: 32 + LSPB: 4 + LVCA: 8 + LVCB: 64 + LVPA: 32 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 131 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT64x64x8_SN_SU0_SUM0_TT4_4_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 32 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 + LVPA: 16 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 132 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT32x32x16_SN_SU0_SUM0_TT2_2_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 64 + LSPA: 16 + LSPB: 4 + LVCA: 16 + LVCB: 64 + LVPA: 16 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 133 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT64x64x16_SN_SU0_SUM0_TT4_4_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 2 + LSPB: 4 + LVCA: 32 + LVCB: 16 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 134 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT16x16x32_SN_SU0_SUM0_TT2_2_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 16 + LSPA: 8 + LSPB: 4 + LVCA: 8 + LVCB: 16 + LVPA: 8 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 512 + LdsNumElementsAlignedA: 128 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 256 + LdsOffsetB: 128 + LdsOffsetB_Blk: 384 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 135 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT16x16x8_SN_SU32_SUM3_TT2_2_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 32 + LSPA: 8 + LSPB: 2 + LVCA: 8 + LVCB: 32 + LVPA: 8 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 136 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT32x32x8_SN_SU32_SUM3_TT4_4_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 16 + LSPA: 16 + LSPB: 8 + LVCA: 8 + LVCB: 16 + LVPA: 16 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 896 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 137 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT32x16x8_SN_SU32_SUM3_TT2_2_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 32 + LSPA: 16 + LSPB: 4 + LVCA: 8 + LVCB: 32 + LVPA: 16 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 138 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT64x32x8_SN_SU32_SUM3_TT4_4_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 32 + LSPA: 32 + LSPB: 8 + LVCA: 8 + LVCB: 32 + LVPA: 32 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 139 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT32x32x8_SN_SU32_SUM3_TT2_2_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 64 + LSPA: 32 + LSPB: 4 + LVCA: 8 + LVCB: 64 + LVPA: 32 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 140 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT64x64x8_SN_SU32_SUM3_TT4_4_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 141 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT16x16x16_SN_SU32_SUM3_TT2_2_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 832 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 64 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 142 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT32x8x8_SN_SU32_SUM3_TT2_2_WG16_4_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 4 + LSPA: 4 + LSPB: 16 + LVCA: 16 + LVCB: 4 + LVPA: 4 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3136 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 64 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 4 + MacroTileA: 64 + MacroTileB: 4 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 16 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 143 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT64x4x16_SN_SU32_SUM3_TT4_1_WG16_4_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 1] + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 832 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 64 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 144 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT32x8x8_SN_SU32_SUM3_TT2_2_WG16_4_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1600 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 64 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 8 + MacroTileA: 64 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 8 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 145 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT64x8x8_SN_SU32_SUM3_TT4_2_WG16_4_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 4 + LSPA: 4 + LSPB: 16 + LVCA: 16 + LVCB: 4 + LVPA: 4 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3136 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 64 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 4 + MacroTileA: 64 + MacroTileB: 4 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 16 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 146 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT64x4x16_SN_SU32_SUM3_TT4_1_WG16_4_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 1] + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 832 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 64 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 147 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT32x8x8_SN_SU32_SUM3_TT2_2_WG16_4_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1600 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 64 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 8 + MacroTileA: 64 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 8 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 148 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT64x8x8_SN_SU32_SUM3_TT4_2_WG16_4_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 32 + LSPA: 8 + LSPB: 2 + LVCA: 8 + LVCB: 32 + LVPA: 8 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 832 + LdsNumElementsAlignedA: 64 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 64 + LdsOffsetB_Blk: 576 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 8 + MacroTile1: 32 + MacroTileA: 8 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 149 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT8x32x8_SN_SU0_SUM0_TT1_4_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [1, 4] + ThreadTile0: 1 + ThreadTile1: 4 + ThreadTileA: 1 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 32 + LSPA: 4 + LSPB: 2 + LVCA: 16 + LVCB: 32 + LVPA: 4 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1664 + LdsNumElementsAlignedA: 128 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 128 + LdsOffsetB_Blk: 1152 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 8 + MacroTile1: 32 + MacroTileA: 8 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 8 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 150 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT8x32x16_SN_SU0_SUM0_TT1_4_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [1, 4] + ThreadTile0: 1 + ThreadTile1: 4 + ThreadTileA: 1 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 32 + LSPA: 8 + LSPB: 2 + LVCA: 8 + LVCB: 32 + LVPA: 8 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 832 + LdsNumElementsAlignedA: 64 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 64 + LdsOffsetB_Blk: 576 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 8 + MacroTile1: 32 + MacroTileA: 8 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 151 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT8x32x8_SN_SU32_SUM3_TT1_4_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [1, 4] + ThreadTile0: 1 + ThreadTile1: 4 + ThreadTileA: 1 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 32 + LSPA: 4 + LSPB: 2 + LVCA: 16 + LVCB: 32 + LVPA: 4 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1664 + LdsNumElementsAlignedA: 128 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 128 + LdsOffsetB_Blk: 1152 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 8 + MacroTile1: 32 + MacroTileA: 8 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 8 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 152 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT8x32x16_SN_SU32_SUM3_TT1_4_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [1, 4] + ThreadTile0: 1 + ThreadTile1: 4 + ThreadTileA: 1 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 2 + LSPB: 2 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3328 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 256 + LdsOffsetB_Blk: 2304 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 8 + MacroTile1: 32 + MacroTileA: 8 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 16 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 16 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 153 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT8x32x32_SN_SU32_SUM3_TT1_4_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [1, 4] + ThreadTile0: 1 + ThreadTile1: 4 + ThreadTileA: 1 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 32 + LSPA: 8 + LSPB: 2 + LVCA: 8 + LVCB: 32 + LVPA: 8 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 832 + LdsNumElementsAlignedA: 64 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 64 + LdsOffsetB_Blk: 576 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 8 + MacroTile1: 32 + MacroTileA: 8 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 154 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT8x32x8_SN_SU0_SUM0_TT1_4_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [1, 4] + ThreadTile0: 1 + ThreadTile1: 4 + ThreadTileA: 1 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 32 + LSPA: 4 + LSPB: 2 + LVCA: 16 + LVCB: 32 + LVPA: 4 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1664 + LdsNumElementsAlignedA: 128 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 128 + LdsOffsetB_Blk: 1152 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 8 + MacroTile1: 32 + MacroTileA: 8 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 8 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 155 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT8x32x16_SN_SU0_SUM0_TT1_4_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [1, 4] + ThreadTile0: 1 + ThreadTile1: 4 + ThreadTileA: 1 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 32 + LSPA: 8 + LSPB: 2 + LVCA: 8 + LVCB: 32 + LVPA: 8 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 832 + LdsNumElementsAlignedA: 64 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 64 + LdsOffsetB_Blk: 576 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 8 + MacroTile1: 32 + MacroTileA: 8 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 156 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT8x32x8_SN_SU32_SUM3_TT1_4_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [1, 4] + ThreadTile0: 1 + ThreadTile1: 4 + ThreadTileA: 1 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 32 + LSPA: 4 + LSPB: 2 + LVCA: 16 + LVCB: 32 + LVPA: 4 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1664 + LdsNumElementsAlignedA: 128 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 128 + LdsOffsetB_Blk: 1152 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 8 + MacroTile1: 32 + MacroTileA: 8 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 8 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 157 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT8x32x16_SN_SU32_SUM3_TT1_4_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [1, 4] + ThreadTile0: 1 + ThreadTile1: 4 + ThreadTileA: 1 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 2 + LSPB: 2 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3328 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 256 + LdsOffsetB_Blk: 2304 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 8 + MacroTile1: 32 + MacroTileA: 8 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 16 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 16 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 158 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT8x32x32_SN_SU32_SUM3_TT1_4_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [1, 4] + ThreadTile0: 1 + ThreadTile1: 4 + ThreadTileA: 1 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 32 + LSPA: 8 + LSPB: 2 + LVCA: 8 + LVCB: 32 + LVPA: 8 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 832 + LdsNumElementsAlignedA: 64 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 64 + LdsOffsetB_Blk: 576 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 8 + MacroTile1: 32 + MacroTileA: 8 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 159 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT8x32x8_SN_SU0_SUM0_TT1_4_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [1, 4] + ThreadTile0: 1 + ThreadTile1: 4 + ThreadTileA: 1 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 32 + LSPA: 4 + LSPB: 2 + LVCA: 16 + LVCB: 32 + LVPA: 4 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1664 + LdsNumElementsAlignedA: 128 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 128 + LdsOffsetB_Blk: 1152 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 8 + MacroTile1: 32 + MacroTileA: 8 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 8 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 160 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT8x32x16_SN_SU0_SUM0_TT1_4_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [1, 4] + ThreadTile0: 1 + ThreadTile1: 4 + ThreadTileA: 1 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 2 + LSPB: 2 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3328 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 256 + LdsOffsetB_Blk: 2304 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 8 + MacroTile1: 32 + MacroTileA: 8 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 16 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 16 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 161 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT8x32x32_SN_SU0_SUM0_TT1_4_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [1, 4] + ThreadTile0: 1 + ThreadTile1: 4 + ThreadTileA: 1 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 32 + LSPA: 8 + LSPB: 2 + LVCA: 8 + LVCB: 32 + LVPA: 8 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 832 + LdsNumElementsAlignedA: 64 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 64 + LdsOffsetB_Blk: 576 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 8 + MacroTile1: 32 + MacroTileA: 8 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 162 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT8x32x8_SN_SU32_SUM3_TT1_4_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [1, 4] + ThreadTile0: 1 + ThreadTile1: 4 + ThreadTileA: 1 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 32 + LSPA: 4 + LSPB: 2 + LVCA: 16 + LVCB: 32 + LVPA: 4 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1664 + LdsNumElementsAlignedA: 128 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 128 + LdsOffsetB_Blk: 1152 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 8 + MacroTile1: 32 + MacroTileA: 8 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 8 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 163 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT8x32x16_SN_SU32_SUM3_TT1_4_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [1, 4] + ThreadTile0: 1 + ThreadTile1: 4 + ThreadTileA: 1 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 +- [2, 3, 0, 1] +- - - [2944, 4288, 1, 1280, 2944, 2944, 1280, 4288] + - [36, 12095.0] + - - [2368, 5888, 1, 256, 2368, 2368, 256, 5888] + - [3, 11645.0] + - - [5888, 1024, 1, 1280, 5888, 5888, 1280, 1024] + - [29, 11978.0] + - - [5888, 1856, 1, 3328, 5888, 5888, 3328, 1856] + - [29, 11973.0] + - - [5056, 704, 1, 256, 5056, 5056, 256, 704] + - [32, 10058.0] + - - [5888, 2944, 1, 3328, 5888, 5888, 3328, 2944] + - [16, 12565.0] + - - [1856, 4288, 1, 256, 1856, 1856, 256, 4288] + - [29, 10901.0] + - - [1024, 5056, 1, 128, 1024, 1024, 128, 5056] + - [6, 10329.0] + - - [5056, 5056, 1, 3328, 5056, 5056, 3328, 5056] + - [16, 12297.0] + - - [1408, 5888, 1, 1280, 1408, 1408, 1280, 5888] + - [29, 12114.0] + - - [1024, 3584, 1, 3328, 1024, 1024, 3328, 3584] + - [5, 11590.0] + - - [5888, 1408, 1, 1280, 5888, 5888, 1280, 1408] + - [29, 12113.0] + - - [1024, 2368, 1, 256, 1024, 1024, 256, 2368] + - [23, 9869.0] + - - [1408, 1856, 1, 1280, 1408, 1408, 1280, 1856] + - [29, 10714.0] + - - [5056, 5056, 1, 1280, 5056, 5056, 1280, 5056] + - [29, 12263.0] + - - [448, 5056, 1, 256, 448, 448, 256, 5056] + - [21, 9290.0] + - - [1856, 1408, 1, 128, 1856, 1856, 128, 1408] + - [19, 9407.0] + - - [6784, 256, 1, 3328, 6784, 6784, 3328, 256] + - [22, 10632.0] + - - [6784, 4288, 1, 3328, 6784, 6784, 3328, 4288] + - [29, 12347.0] + - - [4288, 448, 1, 256, 4288, 4288, 256, 448] + - [9, 8925.0] + - - [1856, 2368, 1, 3328, 1856, 1856, 3328, 2368] + - [3, 11117.0] + - - [4288, 2944, 1, 1280, 4288, 4288, 1280, 2944] + - [29, 12107.0] + - - [704, 5056, 1, 1280, 704, 704, 1280, 5056] + - [10, 11003.0] + - - [2368, 704, 1, 3328, 2368, 2368, 3328, 704] + - [3, 10150.0] + - - [256, 5888, 1, 256, 256, 256, 256, 5888] + - [10, 9232.0] + - - [1856, 4288, 1, 3328, 1856, 1856, 3328, 4288] + - [18, 11723.0] + - - [5888, 1024, 1, 256, 5888, 5888, 256, 1024] + - [26, 11136.0] + - - [448, 5056, 1, 3328, 448, 448, 3328, 5056] + - [24, 10559.0] + - - [1408, 2944, 1, 256, 1408, 1408, 256, 2944] + - [29, 11361.0] + - - [6784, 5056, 1, 3328, 6784, 6784, 3328, 5056] + - [22, 12478.0] + - - [5056, 5056, 1, 256, 5056, 5056, 256, 5056] + - [36, 11969.0] + - - [1408, 6784, 1, 128, 1408, 1408, 128, 6784] + - [33, 11346.0] + - - [704, 5056, 1, 128, 704, 704, 128, 5056] + - [25, 9616.0] + - - [2368, 2944, 1, 1280, 2368, 2368, 1280, 2944] + - [3, 12028.0] + - - [6784, 6784, 1, 1280, 6784, 6784, 1280, 6784] + - [16, 12609.0] + - - [1408, 4288, 1, 1280, 1408, 1408, 1280, 4288] + - [16, 11963.0] + - - [3584, 4288, 1, 1280, 3584, 3584, 1280, 4288] + - [29, 12285.0] + - - [2368, 704, 1, 1280, 2368, 2368, 1280, 704] + - [29, 9901.0] + - - [5056, 4288, 1, 3328, 5056, 5056, 3328, 4288] + - [10, 12283.0] + - - [3584, 2368, 1, 3328, 3584, 3584, 3328, 2368] + - [29, 12031.0] + - - [6784, 448, 1, 1280, 6784, 6784, 1280, 448] + - [9, 10225.0] + - - [1408, 2944, 1, 128, 1408, 1408, 128, 2944] + - [34, 10620.0] + - - [4288, 2944, 1, 256, 4288, 4288, 256, 2944] + - [36, 11720.0] + - - [5888, 704, 1, 1280, 5888, 5888, 1280, 704] + - [10, 11086.0] + - - [448, 5888, 1, 128, 448, 448, 128, 5888] + - [12, 8918.0] + - - [5056, 2368, 1, 1280, 5056, 5056, 1280, 2368] + - [29, 12032.0] + - - [448, 3584, 1, 1280, 448, 448, 1280, 3584] + - [13, 9614.0] + - - [6784, 5888, 1, 256, 6784, 6784, 256, 5888] + - [29, 12431.0] + - - [5888, 2944, 1, 128, 5888, 5888, 128, 2944] + - [16, 11696.0] + - - [1024, 1408, 1, 256, 1024, 1024, 256, 1408] + - [6, 9780.0] + - - [2368, 2368, 1, 3328, 2368, 2368, 3328, 2368] + - [16, 11308.0] + - - [1856, 6784, 1, 128, 1856, 1856, 128, 6784] + - [3, 11080.0] + - - [5056, 704, 1, 3328, 5056, 5056, 3328, 704] + - [16, 11149.0] + - - [1408, 1856, 1, 256, 1408, 1408, 256, 1856] + - [27, 9908.0] + - - [2368, 5056, 1, 256, 2368, 2368, 256, 5056] + - [16, 11598.0] + - - [5888, 1856, 1, 256, 5888, 5888, 256, 1856] + - [10, 11535.0] + - - [704, 5888, 1, 256, 704, 704, 256, 5888] + - [1, 10465.0] + - - [2944, 6784, 1, 3328, 2944, 2944, 3328, 6784] + - [29, 12627.0] + - - [3584, 704, 1, 3328, 3584, 3584, 3328, 704] + - [16, 10240.0] + - - [448, 4288, 1, 256, 448, 448, 256, 4288] + - [2, 8658.0] + - - [704, 2368, 1, 1280, 704, 704, 1280, 2368] + - [5, 9796.0] + - - [1856, 2368, 1, 1280, 1856, 1856, 1280, 2368] + - [29, 11062.0] + - - [1856, 4288, 1, 1280, 1856, 1856, 1280, 4288] + - [16, 11668.0] + - - [704, 2944, 1, 128, 704, 704, 128, 2944] + - [25, 9180.0] + - - [1408, 1024, 1, 1280, 1408, 1408, 1280, 1024] + - [16, 10348.0] + - - [704, 6784, 1, 256, 704, 704, 256, 6784] + - [16, 10257.0] + - - [6784, 704, 1, 256, 6784, 6784, 256, 704] + - [10, 10223.0] + - - [5056, 1408, 1, 128, 5056, 5056, 128, 1408] + - [26, 10520.0] + - - [3584, 4288, 1, 3328, 3584, 3584, 3328, 4288] + - [29, 12341.0] + - - [5888, 1856, 1, 1280, 5888, 5888, 1280, 1856] + - [29, 11926.0] + - - [5056, 1024, 1, 3328, 5056, 5056, 3328, 1024] + - [16, 12258.0] + - - [1024, 4288, 1, 128, 1024, 1024, 128, 4288] + - [6, 10083.0] + - - [2368, 3584, 1, 1280, 2368, 2368, 1280, 3584] + - [16, 11982.0] + - - [2368, 6784, 1, 1280, 2368, 2368, 1280, 6784] + - [16, 12111.0] + - - [2944, 3584, 1, 3328, 2944, 2944, 3328, 3584] + - [16, 12333.0] + - - [6784, 2944, 1, 256, 6784, 6784, 256, 2944] + - [10, 12288.0] + - - [4288, 2368, 1, 3328, 4288, 4288, 3328, 2368] + - [29, 11835.0] + - - [1856, 2368, 1, 256, 1856, 1856, 256, 2368] + - [27, 10062.0] + - - [3584, 6784, 1, 3328, 3584, 3584, 3328, 6784] + - [16, 12563.0] + - - [1024, 5888, 1, 3328, 1024, 1024, 3328, 5888] + - [16, 12119.0] + - - [5056, 4288, 1, 1280, 5056, 5056, 1280, 4288] + - [29, 12245.0] + - - [1408, 5056, 1, 1280, 1408, 1408, 1280, 5056] + - [16, 12203.0] + - - [2944, 5888, 1, 128, 2944, 2944, 128, 5888] + - [33, 11706.0] + - - [704, 5888, 1, 1280, 704, 704, 1280, 5888] + - [5, 11121.0] + - - [2368, 3584, 1, 128, 2368, 2368, 128, 3584] + - [20, 10855.0] + - - [6784, 5888, 1, 3328, 6784, 6784, 3328, 5888] + - [3, 12630.0] + - - [1024, 5056, 1, 1280, 1024, 1024, 1280, 5056] + - [16, 12060.0] + - - [4288, 1024, 1, 256, 4288, 4288, 256, 1024] + - [26, 10765.0] + - - [2944, 2368, 1, 128, 2944, 2944, 128, 2368] + - [25, 10806.0] + - - [5888, 448, 1, 1280, 5888, 5888, 1280, 448] + - [28, 9984.0] + - - [704, 5888, 1, 3328, 704, 704, 3328, 5888] + - [31, 11207.0] + - - [3584, 2944, 1, 256, 3584, 3584, 256, 2944] + - [36, 11782.0] + - - [2368, 1024, 1, 3328, 2368, 2368, 3328, 1024] + - [18, 11361.0] + - - [1408, 5056, 1, 3328, 1408, 1408, 3328, 5056] + - [16, 12338.0] + - - [1856, 1856, 1, 3328, 1856, 1856, 3328, 1856] + - [3, 10828.0] + - - [2368, 2368, 1, 256, 2368, 2368, 256, 2368] + - [16, 10417.0] + - - [4288, 4288, 1, 1280, 4288, 4288, 1280, 4288] + - [16, 12176.0] + - - [1408, 4288, 1, 256, 1408, 1408, 256, 4288] + - [26, 10984.0] + - - [5888, 448, 1, 128, 5888, 5888, 128, 448] + - [25, 9753.0] + - - [704, 6784, 1, 3328, 704, 704, 3328, 6784] + - [3, 11324.0] + - - [5888, 5888, 1, 1280, 5888, 5888, 1280, 5888] + - [29, 12617.0] + - - [5056, 1024, 1, 1280, 5056, 5056, 1280, 1024] + - [16, 12098.0] + - - [448, 5888, 1, 3328, 448, 448, 3328, 5888] + - [5, 9975.0] + - - [1024, 2944, 1, 1280, 1024, 1024, 1280, 2944] + - [3, 11170.0] + - - [5056, 5888, 1, 1280, 5056, 5056, 1280, 5888] + - [29, 12451.0] + - - [4288, 5888, 1, 128, 4288, 4288, 128, 5888] + - [26, 11610.0] + - - [1408, 3584, 1, 128, 1408, 1408, 128, 3584] + - [33, 10791.0] + - - [448, 3584, 1, 128, 448, 448, 128, 3584] + - [23, 8535.0] + - - [5888, 2944, 1, 1280, 5888, 5888, 1280, 2944] + - [10, 12565.0] + - - [2368, 5888, 1, 128, 2368, 2368, 128, 5888] + - [1, 10953.0] + - - [3584, 5888, 1, 256, 3584, 3584, 256, 5888] + - [29, 12271.0] + - - [2368, 704, 1, 128, 2368, 2368, 128, 704] + - [25, 8943.0] + - - [3584, 2944, 1, 1280, 3584, 3584, 1280, 2944] + - [10, 12276.0] + - - [3584, 2368, 1, 128, 3584, 3584, 128, 2368] + - [25, 10747.0] + - - [5056, 704, 1, 128, 5056, 5056, 128, 704] + - [32, 10234.0] + - - [5056, 1408, 1, 3328, 5056, 5056, 3328, 1408] + - [29, 12330.0] + - - [6784, 1024, 1, 3328, 6784, 6784, 3328, 1024] + - [18, 12109.0] + - - [6784, 2944, 1, 3328, 6784, 6784, 3328, 2944] + - [29, 12639.0] + - - [2944, 5056, 1, 3328, 2944, 2944, 3328, 5056] + - [29, 12446.0] + - - [1856, 1856, 1, 256, 1856, 1856, 256, 1856] + - [14, 9805.0] + - - [1024, 5888, 1, 128, 1024, 1024, 128, 5888] + - [10, 10731.0] + - - [6784, 2368, 1, 1280, 6784, 6784, 1280, 2368] + - [29, 12102.0] + - - [4288, 5888, 1, 1280, 4288, 4288, 1280, 5888] + - [29, 12343.0] + - - [4288, 4288, 1, 256, 4288, 4288, 256, 4288] + - [10, 11845.0] + - - [4288, 1856, 1, 1280, 4288, 4288, 1280, 1856] + - [3, 11666.0] + - - [1856, 2944, 1, 3328, 1856, 1856, 3328, 2944] + - [10, 11594.0] + - - [256, 6784, 1, 3328, 256, 256, 3328, 6784] + - [3, 10666.0] + - - [256, 5056, 1, 128, 256, 256, 128, 5056] + - [25, 8775.0] + - - [5056, 1024, 1, 256, 5056, 5056, 256, 1024] + - [20, 11467.0] + - - [5056, 1856, 1, 3328, 5056, 5056, 3328, 1856] + - [29, 11970.0] + - - [1856, 1408, 1, 256, 1856, 1856, 256, 1408] + - [27, 9727.0] + - - [4288, 1408, 1, 128, 4288, 4288, 128, 1408] + - [7, 10651.0] + - - [4288, 5056, 1, 256, 4288, 4288, 256, 5056] + - [10, 11941.0] + - - [5056, 256, 1, 3328, 5056, 5056, 3328, 256] + - [5, 11584.0] + - - [1024, 5888, 1, 1280, 1024, 1024, 1280, 5888] + - [16, 12024.0] + - - [6784, 2368, 1, 128, 6784, 6784, 128, 2368] + - [20, 11328.0] + - - [5056, 3584, 1, 256, 5056, 5056, 256, 3584] + - [29, 12069.0] + - - [1856, 1024, 1, 1280, 1856, 1856, 1280, 1024] + - [18, 11384.0] + - - [6784, 4288, 1, 1280, 6784, 6784, 1280, 4288] + - [16, 12355.0] + - - [1856, 1856, 1, 1280, 1856, 1856, 1280, 1856] + - [16, 10652.0] + - - [6784, 2944, 1, 128, 6784, 6784, 128, 2944] + - [36, 11853.0] + - - [5888, 1856, 1, 128, 5888, 5888, 128, 1856] + - [25, 11084.0] + - - [2368, 1024, 1, 128, 2368, 2368, 128, 1024] + - [2, 9711.0] + - - [5056, 3584, 1, 128, 5056, 5056, 128, 3584] + - [33, 11649.0] + - - [5888, 5888, 1, 3328, 5888, 5888, 3328, 5888] + - [29, 12638.0] + - - [6784, 1024, 1, 256, 6784, 6784, 256, 1024] + - [26, 10540.0] + - - [2944, 2368, 1, 256, 2944, 2944, 256, 2368] + - [29, 11423.0] + - - [5056, 5888, 1, 3328, 5056, 5056, 3328, 5888] + - [16, 12473.0] + - - [1856, 1024, 1, 256, 1856, 1856, 256, 1024] + - [1, 8691.0] + - - [3584, 448, 1, 1280, 3584, 3584, 1280, 448] + - [15, 9796.0] + - - [448, 5888, 1, 256, 448, 448, 256, 5888] + - [13, 8881.0] + - - [1408, 6784, 1, 3328, 1408, 1408, 3328, 6784] + - [29, 12258.0] + - - [4288, 704, 1, 128, 4288, 4288, 128, 704] + - [25, 9293.0] + - - [5056, 2944, 1, 256, 5056, 5056, 256, 2944] + - [10, 11959.0] + - - [6784, 5888, 1, 128, 6784, 6784, 128, 5888] + - [16, 12052.0] + - - [2368, 1856, 1, 256, 2368, 2368, 256, 1856] + - [26, 10315.0] + - - [1408, 3584, 1, 3328, 1408, 1408, 3328, 3584] + - [16, 12027.0] + - - [2368, 6784, 1, 256, 2368, 2368, 256, 6784] + - [22, 11738.0] + - - [5056, 1408, 1, 1280, 5056, 5056, 1280, 1408] + - [29, 12238.0] + - - [5056, 4288, 1, 128, 5056, 5056, 128, 4288] + - [20, 11398.0] + - - [1408, 1856, 1, 128, 1408, 1408, 128, 1856] + - [25, 10350.0] + - - [1408, 5888, 1, 3328, 1408, 1408, 3328, 5888] + - [5, 12255.0] + - - [6784, 6784, 1, 256, 6784, 6784, 256, 6784] + - [16, 12421.0] + - - [4288, 2368, 1, 128, 4288, 4288, 128, 2368] + - [33, 10804.0] + - - [1856, 4288, 1, 128, 1856, 1856, 128, 4288] + - [36, 10676.0] + - - [2368, 2944, 1, 256, 2368, 2368, 256, 2944] + - [10, 11508.0] + - - [3584, 1856, 1, 1280, 3584, 3584, 1280, 1856] + - [3, 11972.0] + - - [6784, 6784, 1, 128, 6784, 6784, 128, 6784] + - [29, 12087.0] + - - [5888, 5056, 1, 256, 5888, 5888, 256, 5056] + - [10, 12170.0] + - - [3584, 448, 1, 256, 3584, 3584, 256, 448] + - [28, 9287.0] + - - [448, 4288, 1, 128, 448, 448, 128, 4288] + - [14, 8634.0] + - - [2944, 4288, 1, 3328, 2944, 2944, 3328, 4288] + - [16, 12181.0] + - - [256, 6784, 1, 256, 256, 256, 256, 6784] + - [19, 9607.0] + - - [1408, 4288, 1, 128, 1408, 1408, 128, 4288] + - [25, 10724.0] + - - [2944, 704, 1, 3328, 2944, 2944, 3328, 704] + - [3, 10895.0] + - - [3584, 3584, 1, 256, 3584, 3584, 256, 3584] + - [10, 11943.0] + - - [3584, 5056, 1, 256, 3584, 3584, 256, 5056] + - [10, 12080.0] + - - [2944, 2368, 1, 1280, 2944, 2944, 1280, 2368] + - [29, 12003.0] + - - [1408, 3584, 1, 256, 1408, 1408, 256, 3584] + - [29, 11453.0] + - - [6784, 3584, 1, 256, 6784, 6784, 256, 3584] + - [29, 12270.0] + - - [5056, 2368, 1, 128, 5056, 5056, 128, 2368] + - [7, 10855.0] + - - [2944, 2944, 1, 3328, 2944, 2944, 3328, 2944] + - [29, 12309.0] + - - [5056, 6784, 1, 256, 5056, 5056, 256, 6784] + - [16, 12174.0] + - - [1856, 3584, 1, 128, 1856, 1856, 128, 3584] + - [14, 10561.0] + - - [6784, 448, 1, 256, 6784, 6784, 256, 448] + - [15, 9901.0] + - - [3584, 6784, 1, 128, 3584, 3584, 128, 6784] + - [36, 11797.0] + - - [5056, 1856, 1, 256, 5056, 5056, 256, 1856] + - [10, 11439.0] + - - [1024, 1856, 1, 256, 1024, 1024, 256, 1856] + - [21, 10192.0] + - - [1408, 6784, 1, 1280, 1408, 1408, 1280, 6784] + - [29, 12243.0] + - - [3584, 3584, 1, 1280, 3584, 3584, 1280, 3584] + - [16, 12368.0] + - - [5888, 5888, 1, 128, 5888, 5888, 128, 5888] + - [29, 11987.0] + - - [5056, 5888, 1, 128, 5056, 5056, 128, 5888] + - [29, 11682.0] + - - [5056, 2368, 1, 3328, 5056, 5056, 3328, 2368] + - [16, 12078.0] + - - [2944, 4288, 1, 256, 2944, 2944, 256, 4288] + - [29, 11612.0] + - - [1408, 3584, 1, 1280, 1408, 1408, 1280, 3584] + - [29, 11917.0] + - - [2368, 6784, 1, 3328, 2368, 2368, 3328, 6784] + - [29, 12139.0] + - - [1856, 1408, 1, 1280, 1856, 1856, 1280, 1408] + - [29, 10590.0] + - - [6784, 704, 1, 128, 6784, 6784, 128, 704] + - [25, 10390.0] + - - [1408, 5888, 1, 256, 1408, 1408, 256, 5888] + - [29, 11785.0] + - - [704, 2944, 1, 1280, 704, 704, 1280, 2944] + - [29, 10673.0] + - - [1856, 2368, 1, 128, 1856, 1856, 128, 2368] + - [12, 9480.0] + - - [3584, 704, 1, 1280, 3584, 3584, 1280, 704] + - [29, 10149.0] + - - [2944, 6784, 1, 128, 2944, 2944, 128, 6784] + - [16, 11832.0] + - - [3584, 448, 1, 3328, 3584, 3584, 3328, 448] + - [15, 10117.0] + - - [704, 2368, 1, 3328, 704, 704, 3328, 2368] + - [5, 10311.0] + - - [256, 5888, 1, 128, 256, 256, 128, 5888] + - [23, 8019.0] + - - [2944, 2944, 1, 1280, 2944, 2944, 1280, 2944] + - [29, 12267.0] + - - [5888, 2368, 1, 256, 5888, 5888, 256, 2368] + - [10, 11761.0] + - - [6784, 704, 1, 3328, 6784, 6784, 3328, 704] + - [16, 11291.0] + - - [5888, 4288, 1, 128, 5888, 5888, 128, 4288] + - [16, 11549.0] + - - [1408, 2944, 1, 3328, 1408, 1408, 3328, 2944] + - [3, 12074.0] + - - [3584, 704, 1, 128, 3584, 3584, 128, 704] + - [15, 8897.0] + - - [5056, 5056, 1, 128, 5056, 5056, 128, 5056] + - [36, 11500.0] + - - [448, 5056, 1, 128, 448, 448, 128, 5056] + - [14, 8916.0] + - - [1408, 5056, 1, 128, 1408, 1408, 128, 5056] + - [20, 11083.0] + - - [2944, 3584, 1, 128, 2944, 2944, 128, 3584] + - [20, 11549.0] + - - [3584, 2368, 1, 256, 3584, 3584, 256, 2368] + - [16, 11551.0] + - - [5888, 5056, 1, 1280, 5888, 5888, 1280, 5056] + - [29, 12453.0] + - - [2368, 5056, 1, 128, 2368, 2368, 128, 5056] + - [13, 10906.0] + - - [3584, 3584, 1, 3328, 3584, 3584, 3328, 3584] + - [29, 12394.0] + - - [5888, 6784, 1, 256, 5888, 5888, 256, 6784] + - [16, 12390.0] + - - [4288, 2944, 1, 3328, 4288, 4288, 3328, 2944] + - [16, 12189.0] + - - [4288, 704, 1, 1280, 4288, 4288, 1280, 704] + - [16, 10152.0] + - - [256, 5056, 1, 1280, 256, 256, 1280, 5056] + - [5, 11199.0] + - - [2944, 5888, 1, 3328, 2944, 2944, 3328, 5888] + - [29, 12578.0] + - - [6784, 5888, 1, 1280, 6784, 6784, 1280, 5888] + - [16, 12649.0] + - - [5888, 4288, 1, 1280, 5888, 5888, 1280, 4288] + - [10, 12324.0] + - - [5888, 3584, 1, 128, 5888, 5888, 128, 3584] + - [10, 11741.0] + - - [1856, 1856, 1, 128, 1856, 1856, 128, 1856] + - [25, 10030.0] + - - [3584, 1024, 1, 3328, 3584, 3584, 3328, 1024] + - [3, 11615.0] + - - [704, 3584, 1, 128, 704, 704, 128, 3584] + - [25, 9238.0] + - - [5888, 448, 1, 3328, 5888, 5888, 3328, 448] + - [35, 10022.0] + - - [2368, 4288, 1, 1280, 2368, 2368, 1280, 4288] + - [36, 11777.0] + - - [4288, 2944, 1, 128, 4288, 4288, 128, 2944] + - [7, 11193.0] + - - [1024, 6784, 1, 3328, 1024, 1024, 3328, 6784] + - [16, 12105.0] + - - [5056, 2944, 1, 3328, 5056, 5056, 3328, 2944] + - [16, 12426.0] + - - [2944, 3584, 1, 256, 2944, 2944, 256, 3584] + - [29, 11736.0] + - - [1408, 1408, 1, 3328, 1408, 1408, 3328, 1408] + - [10, 10592.0] + - - [3584, 3584, 1, 128, 3584, 3584, 128, 3584] + - [36, 11148.0] + - - [3584, 704, 1, 256, 3584, 3584, 256, 704] + - [35, 9667.0] + - - [3584, 1408, 1, 3328, 3584, 3584, 3328, 1408] + - [3, 11997.0] + - - [704, 3584, 1, 1280, 704, 704, 1280, 3584] + - [10, 10321.0] + - - [2944, 6784, 1, 1280, 2944, 2944, 1280, 6784] + - [29, 12562.0] + - - [1856, 6784, 1, 256, 1856, 1856, 256, 6784] + - [16, 11471.0] + - - [4288, 448, 1, 3328, 4288, 4288, 3328, 448] + - [9, 10128.0] + - - [6784, 4288, 1, 128, 6784, 6784, 128, 4288] + - [36, 11627.0] + - - [6784, 704, 1, 1280, 6784, 6784, 1280, 704] + - [3, 11195.0] + - - [5888, 1024, 1, 3328, 5888, 5888, 3328, 1024] + - [29, 12112.0] + - - [704, 6784, 1, 1280, 704, 704, 1280, 6784] + - [3, 11180.0] + - - [1856, 5056, 1, 3328, 1856, 1856, 3328, 5056] + - [16, 11955.0] + - - [1024, 3584, 1, 128, 1024, 1024, 128, 3584] + - [6, 10008.0] + - - [1024, 1408, 1, 128, 1024, 1024, 128, 1408] + - [0, 8941.0] + - - [2368, 2944, 1, 128, 2368, 2368, 128, 2944] + - [33, 10811.0] + - - [5056, 2944, 1, 128, 5056, 5056, 128, 2944] + - [33, 11391.0] + - - [5888, 5056, 1, 3328, 5888, 5888, 3328, 5056] + - [10, 12463.0] + - - [1408, 2368, 1, 128, 1408, 1408, 128, 2368] + - [12, 9879.0] + - - [5888, 2368, 1, 128, 5888, 5888, 128, 2368] + - [22, 11115.0] + - - [3584, 6784, 1, 1280, 3584, 3584, 1280, 6784] + - [16, 12510.0] + - - [1856, 5888, 1, 256, 1856, 1856, 256, 5888] + - [29, 11442.0] + - - [4288, 4288, 1, 3328, 4288, 4288, 3328, 4288] + - [16, 12217.0] + - - [4288, 1408, 1, 1280, 4288, 4288, 1280, 1408] + - [29, 11950.0] + - - [3584, 5056, 1, 128, 3584, 3584, 128, 5056] + - [10, 11328.0] + - - [4288, 2368, 1, 256, 4288, 4288, 256, 2368] + - [10, 11394.0] + - - [2944, 5056, 1, 1280, 2944, 2944, 1280, 5056] + - [29, 12364.0] + - - [448, 6784, 1, 256, 448, 448, 256, 6784] + - [20, 9572.0] + - - [6784, 2368, 1, 3328, 6784, 6784, 3328, 2368] + - [10, 12179.0] + - - [4288, 1856, 1, 3328, 4288, 4288, 3328, 1856] + - [29, 11721.0] + - - [3584, 448, 1, 128, 3584, 3584, 128, 448] + - [19, 8664.0] + - - [3584, 1024, 1, 1280, 3584, 3584, 1280, 1024] + - [16, 11513.0] + - - [1856, 5056, 1, 256, 1856, 1856, 256, 5056] + - [16, 11352.0] + - - [1024, 4288, 1, 256, 1024, 1024, 256, 4288] + - [10, 10753.0] + - - [5888, 3584, 1, 3328, 5888, 5888, 3328, 3584] + - [16, 12540.0] + - - [5056, 3584, 1, 3328, 5056, 5056, 3328, 3584] + - [16, 12440.0] + - - [2368, 1408, 1, 1280, 2368, 2368, 1280, 1408] + - [16, 11174.0] + - - [5056, 2944, 1, 1280, 5056, 5056, 1280, 2944] + - [29, 12364.0] + - - [1024, 6784, 1, 256, 1024, 1024, 256, 6784] + - [10, 11362.0] + - - [2944, 1408, 1, 128, 2944, 2944, 128, 1408] + - [33, 10595.0] + - - [5056, 6784, 1, 3328, 5056, 5056, 3328, 6784] + - [29, 12489.0] + - - [3584, 4288, 1, 256, 3584, 3584, 256, 4288] + - [16, 11786.0] + - - [1856, 6784, 1, 3328, 1856, 1856, 3328, 6784] + - [29, 12108.0] + - - [5888, 4288, 1, 256, 5888, 5888, 256, 4288] + - [16, 12047.0] + - - [5056, 1408, 1, 256, 5056, 5056, 256, 1408] + - [33, 11498.0] + - - [3584, 1024, 1, 256, 3584, 3584, 256, 1024] + - [29, 10989.0] + - - [5888, 5888, 1, 256, 5888, 5888, 256, 5888] + - [16, 12395.0] + - - [4288, 1024, 1, 1280, 4288, 4288, 1280, 1024] + - [16, 11694.0] + - - [448, 6784, 1, 3328, 448, 448, 3328, 6784] + - [10, 10387.0] + - - [2944, 1408, 1, 1280, 2944, 2944, 1280, 1408] + - [10, 11893.0] + - - [2944, 1856, 1, 3328, 2944, 2944, 3328, 1856] + - [29, 11612.0] + - - [2944, 2944, 1, 128, 2944, 2944, 128, 2944] + - [33, 11470.0] + - - [3584, 5888, 1, 1280, 3584, 3584, 1280, 5888] + - [16, 12534.0] + - - [6784, 1856, 1, 1280, 6784, 6784, 1280, 1856] + - [3, 12064.0] + - - [2944, 5056, 1, 256, 2944, 2944, 256, 5056] + - [10, 11831.0] + - - [5888, 256, 1, 3328, 5888, 5888, 3328, 256] + - [29, 10877.0] + - - [1856, 5888, 1, 3328, 1856, 1856, 3328, 5888] + - [16, 12012.0] + - - [3584, 1408, 1, 256, 3584, 3584, 256, 1408] + - [29, 10883.0] + - - [704, 3584, 1, 3328, 704, 704, 3328, 3584] + - [3, 10524.0] + - - [5056, 448, 1, 1280, 5056, 5056, 1280, 448] + - [16, 10143.0] + - - [3584, 1856, 1, 3328, 3584, 3584, 3328, 1856] + - [16, 12068.0] + - - [2944, 1024, 1, 256, 2944, 2944, 256, 1024] + - [27, 10438.0] + - - [1024, 2368, 1, 128, 1024, 1024, 128, 2368] + - [0, 9349.0] + - - [2368, 4288, 1, 3328, 2368, 2368, 3328, 4288] + - [16, 11837.0] + - - [1024, 1408, 1, 1280, 1024, 1024, 1280, 1408] + - [3, 10159.0] + - - [6784, 5056, 1, 256, 6784, 6784, 256, 5056] + - [10, 12227.0] + - - [448, 6784, 1, 128, 448, 448, 128, 6784] + - [12, 8943.0] + - - [2944, 6784, 1, 256, 2944, 2944, 256, 6784] + - [36, 12330.0] + - - [2368, 2368, 1, 1280, 2368, 2368, 1280, 2368] + - [3, 11241.0] + - - [1856, 3584, 1, 1280, 1856, 1856, 1280, 3584] + - [29, 12001.0] + - - [3584, 1408, 1, 1280, 3584, 3584, 1280, 1408] + - [29, 11939.0] + - - [4288, 448, 1, 128, 4288, 4288, 128, 448] + - [25, 8941.0] + - - [5056, 256, 1, 1280, 5056, 5056, 1280, 256] + - [24, 11110.0] + - - [1856, 1408, 1, 3328, 1856, 1856, 3328, 1408] + - [3, 10865.0] + - - [1024, 4288, 1, 3328, 1024, 1024, 3328, 4288] + - [16, 11874.0] + - - [5056, 448, 1, 256, 5056, 5056, 256, 448] + - [19, 9716.0] + - - [2944, 2368, 1, 3328, 2944, 2944, 3328, 2368] + - [29, 12095.0] + - - [1024, 1856, 1, 1280, 1024, 1024, 1280, 1856] + - [10, 10930.0] + - - [6784, 1856, 1, 256, 6784, 6784, 256, 1856] + - [29, 11637.0] + - - [1024, 5888, 1, 256, 1024, 1024, 256, 5888] + - [10, 11548.0] + - - [1408, 2368, 1, 256, 1408, 1408, 256, 2368] + - [29, 10530.0] + - - [1408, 1408, 1, 256, 1408, 1408, 256, 1408] + - [27, 9745.0] + - - [2368, 2368, 1, 128, 2368, 2368, 128, 2368] + - [25, 10313.0] + - - [6784, 1408, 1, 128, 6784, 6784, 128, 1408] + - [33, 11325.0] + - - [4288, 5888, 1, 256, 4288, 4288, 256, 5888] + - [36, 12042.0] + - - [1408, 5056, 1, 256, 1408, 1408, 256, 5056] + - [16, 11651.0] + - - [4288, 3584, 1, 128, 4288, 4288, 128, 3584] + - [7, 11550.0] + - - [3584, 5056, 1, 1280, 3584, 3584, 1280, 5056] + - [16, 12426.0] + - - [1856, 1024, 1, 128, 1856, 1856, 128, 1024] + - [12, 8957.0] + - - [704, 4288, 1, 256, 704, 704, 256, 4288] + - [33, 9576.0] + - - [5888, 2368, 1, 1280, 5888, 5888, 1280, 2368] + - [16, 12126.0] + - - [2368, 5888, 1, 1280, 2368, 2368, 1280, 5888] + - [16, 12143.0] + - - [5888, 256, 1, 1280, 5888, 5888, 1280, 256] + - [16, 10593.0] + - - [2368, 1856, 1, 3328, 2368, 2368, 3328, 1856] + - [16, 11086.0] + - - [2944, 704, 1, 256, 2944, 2944, 256, 704] + - [27, 9308.0] + - - [704, 3584, 1, 256, 704, 704, 256, 3584] + - [1, 9318.0] + - - [704, 2944, 1, 3328, 704, 704, 3328, 2944] + - [3, 10936.0] + - - [6784, 1024, 1, 128, 6784, 6784, 128, 1024] + - [20, 10530.0] + - - [2944, 1024, 1, 3328, 2944, 2944, 3328, 1024] + - [10, 11279.0] + - - [2944, 5056, 1, 128, 2944, 2944, 128, 5056] + - [16, 11250.0] + - - [1408, 6784, 1, 256, 1408, 1408, 256, 6784] + - [29, 11844.0] + - - [6784, 1408, 1, 3328, 6784, 6784, 3328, 1408] + - [29, 12278.0] + - - [4288, 6784, 1, 128, 4288, 4288, 128, 6784] + - [26, 11672.0] + - - [6784, 2944, 1, 1280, 6784, 6784, 1280, 2944] + - [16, 12595.0] + - - [4288, 1856, 1, 128, 4288, 4288, 128, 1856] + - [12, 10340.0] + - - [1856, 2944, 1, 128, 1856, 1856, 128, 2944] + - [7, 10423.0] + - - [6784, 448, 1, 128, 6784, 6784, 128, 448] + - [25, 10205.0] + - - [448, 5056, 1, 1280, 448, 448, 1280, 5056] + - [5, 10355.0] + - - [2368, 1856, 1, 128, 2368, 2368, 128, 1856] + - [25, 10191.0] + - - [4288, 704, 1, 256, 4288, 4288, 256, 704] + - [9, 9765.0] + - - [5888, 704, 1, 256, 5888, 5888, 256, 704] + - [10, 10412.0] + - - [3584, 1024, 1, 128, 3584, 3584, 128, 1024] + - [37, 10046.0] + - - [256, 5888, 1, 3328, 256, 256, 3328, 5888] + - [3, 10920.0] + - - [1408, 4288, 1, 3328, 1408, 1408, 3328, 4288] + - [29, 12129.0] + - - [6784, 4288, 1, 256, 6784, 6784, 256, 4288] + - [10, 12113.0] + - - [5888, 256, 1, 256, 5888, 5888, 256, 256] + - [34, 9834.0] + - - [6784, 1024, 1, 1280, 6784, 6784, 1280, 1024] + - [16, 12053.0] + - - [5888, 1024, 1, 128, 5888, 5888, 128, 1024] + - [20, 10879.0] + - - [2944, 704, 1, 1280, 2944, 2944, 1280, 704] + - [29, 10693.0] + - - [6784, 3584, 1, 1280, 6784, 6784, 1280, 3584] + - [29, 12512.0] + - - [1024, 6784, 1, 1280, 1024, 1024, 1280, 6784] + - [16, 11991.0] + - - [1408, 2944, 1, 1280, 1408, 1408, 1280, 2944] + - [29, 11999.0] + - - [1408, 2368, 1, 3328, 1408, 1408, 3328, 2368] + - [16, 11445.0] + - - [2944, 1856, 1, 128, 2944, 2944, 128, 1856] + - [25, 10594.0] + - - [256, 6784, 1, 128, 256, 256, 128, 6784] + - [4, 9029.0] + - - [5056, 6784, 1, 128, 5056, 5056, 128, 6784] + - [16, 11724.0] + - - [4288, 5056, 1, 128, 4288, 4288, 128, 5056] + - [10, 11399.0] + - - [1856, 5888, 1, 128, 1856, 1856, 128, 5888] + - [7, 11086.0] + - - [2944, 5888, 1, 256, 2944, 2944, 256, 5888] + - [16, 12310.0] + - - [3584, 1856, 1, 256, 3584, 3584, 256, 1856] + - [10, 11463.0] + - - [4288, 3584, 1, 1280, 4288, 4288, 1280, 3584] + - [16, 12302.0] + - - [704, 4288, 1, 3328, 704, 704, 3328, 4288] + - [3, 10346.0] + - - [704, 5888, 1, 128, 704, 704, 128, 5888] + - [21, 9718.0] + - - [6784, 3584, 1, 128, 6784, 6784, 128, 3584] + - [33, 11882.0] + - - [4288, 5056, 1, 3328, 4288, 4288, 3328, 5056] + - [16, 12287.0] + - - [1408, 1408, 1, 128, 1408, 1408, 128, 1408] + - [12, 9187.0] + - - [5056, 2368, 1, 256, 5056, 5056, 256, 2368] + - [10, 11639.0] + - - [4288, 704, 1, 3328, 4288, 4288, 3328, 704] + - [3, 10377.0] + - - [448, 3584, 1, 256, 448, 448, 256, 3584] + - [13, 8955.0] + - - [2368, 1024, 1, 1280, 2368, 2368, 1280, 1024] + - [29, 11046.0] + - - [2944, 1408, 1, 3328, 2944, 2944, 3328, 1408] + - [10, 12089.0] + - - [1024, 1408, 1, 3328, 1024, 1024, 3328, 1408] + - [16, 10421.0] + - - [2944, 5888, 1, 1280, 2944, 2944, 1280, 5888] + - [16, 12557.0] + - - [5888, 3584, 1, 256, 5888, 5888, 256, 3584] + - [16, 12236.0] + - - [1408, 1856, 1, 3328, 1408, 1408, 3328, 1856] + - [22, 10864.0] + - - [6784, 1408, 1, 1280, 6784, 6784, 1280, 1408] + - [29, 12227.0] + - - [704, 2944, 1, 256, 704, 704, 256, 2944] + - [2, 9914.0] + - - [704, 4288, 1, 128, 704, 704, 128, 4288] + - [12, 9257.0] + - - [2368, 4288, 1, 128, 2368, 2368, 128, 4288] + - [33, 10806.0] + - - [1024, 6784, 1, 128, 1024, 1024, 128, 6784] + - [10, 10900.0] + - - [1408, 1408, 1, 1280, 1408, 1408, 1280, 1408] + - [16, 10470.0] + - - [448, 4288, 1, 3328, 448, 448, 3328, 4288] + - [16, 10144.0] + - - [2368, 1408, 1, 256, 2368, 2368, 256, 1408] + - [10, 9990.0] + - - [5888, 5056, 1, 128, 5888, 5888, 128, 5056] + - [10, 11672.0] + - - [704, 2368, 1, 256, 704, 704, 256, 2368] + - [34, 9135.0] + - - [5888, 2368, 1, 3328, 5888, 5888, 3328, 2368] + - [3, 12212.0] + - - [4288, 448, 1, 1280, 4288, 4288, 1280, 448] + - [15, 9962.0] + - - [5888, 704, 1, 3328, 5888, 5888, 3328, 704] + - [31, 11196.0] + - - [5056, 256, 1, 128, 5056, 5056, 128, 256] + - [6, 7727.0] + - - [1408, 5888, 1, 128, 1408, 1408, 128, 5888] + - [7, 11123.0] + - - [1408, 1024, 1, 256, 1408, 1408, 256, 1024] + - [27, 9557.0] + - - [1024, 1856, 1, 128, 1024, 1024, 128, 1856] + - [6, 9037.0] + - - [5056, 6784, 1, 1280, 5056, 5056, 1280, 6784] + - [29, 12468.0] + - - [704, 5056, 1, 3328, 704, 704, 3328, 5056] + - [5, 11209.0] + - - [3584, 5056, 1, 3328, 3584, 3584, 3328, 5056] + - [16, 12466.0] + - - [2368, 2944, 1, 3328, 2368, 2368, 3328, 2944] + - [16, 12104.0] + - - [2368, 3584, 1, 256, 2368, 2368, 256, 3584] + - [29, 11135.0] + - - [5056, 3584, 1, 1280, 5056, 5056, 1280, 3584] + - [16, 12410.0] + - - [1856, 2944, 1, 1280, 1856, 1856, 1280, 2944] + - [29, 11427.0] + - - [3584, 2368, 1, 1280, 3584, 3584, 1280, 2368] + - [16, 12003.0] + - - [2944, 1408, 1, 256, 2944, 2944, 256, 1408] + - [29, 11364.0] + - - [4288, 1408, 1, 3328, 4288, 4288, 3328, 1408] + - [29, 12121.0] + - - [2944, 1024, 1, 128, 2944, 2944, 128, 1024] + - [25, 10128.0] + - - [4288, 5056, 1, 1280, 4288, 4288, 1280, 5056] + - [16, 12252.0] + - - [5888, 6784, 1, 1280, 5888, 5888, 1280, 6784] + - [29, 12629.0] + - - [6784, 5056, 1, 128, 6784, 6784, 128, 5056] + - [29, 11762.0] + - - [5888, 1408, 1, 3328, 5888, 5888, 3328, 1408] + - [10, 12264.0] + - - [256, 5056, 1, 256, 256, 256, 256, 5056] + - [19, 9467.0] + - - [448, 3584, 1, 3328, 448, 448, 3328, 3584] + - [5, 9982.0] + - - [704, 2368, 1, 128, 704, 704, 128, 2368] + - [30, 8101.0] + - - [5888, 256, 1, 128, 5888, 5888, 128, 256] + - [11, 8032.0] + - - [3584, 1856, 1, 128, 3584, 3584, 128, 1856] + - [25, 10148.0] + - - [4288, 4288, 1, 128, 4288, 4288, 128, 4288] + - [36, 11366.0] + - - [1856, 1024, 1, 3328, 1856, 1856, 3328, 1024] + - [5, 11711.0] + - - [1024, 5056, 1, 256, 1024, 1024, 256, 5056] + - [16, 11036.0] + - - [2368, 1408, 1, 3328, 2368, 2368, 3328, 1408] + - [29, 11443.0] + - - [5888, 448, 1, 256, 5888, 5888, 256, 448] + - [28, 9633.0] + - - [5888, 6784, 1, 128, 5888, 5888, 128, 6784] + - [29, 12022.0] + - - [6784, 5056, 1, 1280, 6784, 6784, 1280, 5056] + - [3, 12475.0] + - - [5056, 704, 1, 1280, 5056, 5056, 1280, 704] + - [10, 10978.0] + - - [4288, 6784, 1, 1280, 4288, 4288, 1280, 6784] + - [29, 12350.0] + - - [6784, 1408, 1, 256, 6784, 6784, 256, 1408] + - [16, 11659.0] + - - [3584, 5888, 1, 128, 3584, 3584, 128, 5888] + - [36, 11697.0] + - - [5056, 5888, 1, 256, 5056, 5056, 256, 5888] + - [29, 12164.0] + - - [2368, 1024, 1, 256, 2368, 2368, 256, 1024] + - [7, 10084.0] + - - [2944, 1856, 1, 256, 2944, 2944, 256, 1856] + - [22, 10973.0] + - - [1856, 6784, 1, 1280, 1856, 1856, 1280, 6784] + - [16, 12070.0] + - - [4288, 3584, 1, 256, 4288, 4288, 256, 3584] + - [29, 11892.0] + - - [5056, 1856, 1, 1280, 5056, 5056, 1280, 1856] + - [3, 11925.0] + - - [1408, 1024, 1, 3328, 1408, 1408, 3328, 1024] + - [16, 10562.0] + - - [5888, 3584, 1, 1280, 5888, 5888, 1280, 3584] + - [29, 12513.0] + - - [1856, 3584, 1, 3328, 1856, 1856, 3328, 3584] + - [29, 12077.0] + - - [1024, 2944, 1, 256, 1024, 1024, 256, 2944] + - [17, 10131.0] + - - [448, 6784, 1, 1280, 448, 448, 1280, 6784] + - [10, 10323.0] + - - [704, 5056, 1, 256, 704, 704, 256, 5056] + - [7, 10336.0] + - - [2944, 1856, 1, 1280, 2944, 2944, 1280, 1856] + - [29, 11528.0] + - - [5056, 256, 1, 256, 5056, 5056, 256, 256] + - [8, 10202.0] + - - [2368, 3584, 1, 3328, 2368, 2368, 3328, 3584] + - [16, 12061.0] + - - [3584, 5888, 1, 3328, 3584, 3584, 3328, 5888] + - [16, 12551.0] + - - [2944, 3584, 1, 1280, 2944, 2944, 1280, 3584] + - [29, 12291.0] + - - [1856, 5888, 1, 1280, 1856, 1856, 1280, 5888] + - [29, 11955.0] + - - [5056, 448, 1, 3328, 5056, 5056, 3328, 448] + - [36, 10463.0] + - - [4288, 1408, 1, 256, 4288, 4288, 256, 1408] + - [29, 11190.0] + - - [5888, 1408, 1, 128, 5888, 5888, 128, 1408] + - [29, 11098.0] + - - [4288, 2368, 1, 1280, 4288, 4288, 1280, 2368] + - [10, 11788.0] + - - [6784, 2368, 1, 256, 6784, 6784, 256, 2368] + - [10, 11809.0] + - - [4288, 1856, 1, 256, 4288, 4288, 256, 1856] + - [10, 11239.0] + - - [1856, 2944, 1, 256, 1856, 1856, 256, 2944] + - [10, 10981.0] + - - [5056, 1024, 1, 128, 5056, 5056, 128, 1024] + - [7, 10914.0] + - - [6784, 256, 1, 128, 6784, 6784, 128, 256] + - [27, 9396.0] + - - [5888, 704, 1, 128, 5888, 5888, 128, 704] + - [25, 10428.0] + - - [1024, 4288, 1, 1280, 1024, 1024, 1280, 4288] + - [3, 11779.0] + - - [2368, 5056, 1, 3328, 2368, 2368, 3328, 5056] + - [16, 12095.0] + - - [4288, 1024, 1, 3328, 4288, 4288, 3328, 1024] + - [16, 11856.0] + - - [1024, 5056, 1, 3328, 1024, 1024, 3328, 5056] + - [18, 12246.0] + - - [1024, 1856, 1, 3328, 1024, 1024, 3328, 1856] + - [3, 11490.0] + - - [704, 6784, 1, 128, 704, 704, 128, 6784] + - [20, 9606.0] + - - [4288, 6784, 1, 256, 4288, 4288, 256, 6784] + - [29, 12112.0] + - - [3584, 2944, 1, 3328, 3584, 3584, 3328, 2944] + - [16, 12324.0] + - - [5888, 2944, 1, 256, 5888, 5888, 256, 2944] + - [16, 12229.0] + - - [2368, 6784, 1, 128, 2368, 2368, 128, 6784] + - [16, 11377.0] + - - [448, 4288, 1, 1280, 448, 448, 1280, 4288] + - [16, 9996.0] + - - [5056, 4288, 1, 256, 5056, 5056, 256, 4288] + - [10, 11934.0] + - - [1024, 3584, 1, 256, 1024, 1024, 256, 3584] + - [10, 10869.0] + - - [1856, 5056, 1, 128, 1856, 1856, 128, 5056] + - [20, 10813.0] + - - [6784, 6784, 1, 3328, 6784, 6784, 3328, 6784] + - [3, 12626.0] + - - [448, 5888, 1, 1280, 448, 448, 1280, 5888] + - [13, 9770.0] + - - [5056, 448, 1, 128, 5056, 5056, 128, 448] + - [25, 9204.0] + - - [3584, 2944, 1, 128, 3584, 3584, 128, 2944] + - [16, 11176.0] + - - [6784, 256, 1, 1280, 6784, 6784, 1280, 256] + - [10, 10473.0] + - - [2368, 5888, 1, 3328, 2368, 2368, 3328, 5888] + - [29, 12197.0] + - - [2368, 1856, 1, 1280, 2368, 2368, 1280, 1856] + - [16, 11006.0] + - - [3584, 4288, 1, 128, 3584, 3584, 128, 4288] + - [10, 11192.0] + - - [5888, 4288, 1, 3328, 5888, 5888, 3328, 4288] + - [10, 12371.0] + - - [2368, 704, 1, 256, 2368, 2368, 256, 704] + - [8, 8939.0] + - - [3584, 1408, 1, 128, 3584, 3584, 128, 1408] + - [37, 10617.0] + - - [1856, 5056, 1, 1280, 1856, 1856, 1280, 5056] + - [16, 11912.0] + - - [2944, 1024, 1, 1280, 2944, 2944, 1280, 1024] + - [29, 11236.0] + - - [2368, 4288, 1, 256, 2368, 2368, 256, 4288] + - [29, 11371.0] + - - [1024, 2368, 1, 3328, 1024, 1024, 3328, 2368] + - [5, 11269.0] + - - [6784, 1856, 1, 3328, 6784, 6784, 3328, 1856] + - [36, 12114.0] + - - [1024, 2944, 1, 128, 1024, 1024, 128, 2944] + - [6, 10054.0] + - - [1024, 3584, 1, 1280, 1024, 1024, 1280, 3584] + - [10, 11523.0] + - - [4288, 5888, 1, 3328, 4288, 4288, 3328, 5888] + - [16, 12379.0] + - - [1024, 2944, 1, 3328, 1024, 1024, 3328, 2944] + - [10, 11347.0] + - - [3584, 6784, 1, 256, 3584, 3584, 256, 6784] + - [22, 12249.0] + - - [256, 6784, 1, 1280, 256, 256, 1280, 6784] + - [16, 10497.0] + - - [1856, 3584, 1, 256, 1856, 1856, 256, 3584] + - [26, 11301.0] + - - [6784, 1856, 1, 128, 6784, 6784, 128, 1856] + - [25, 11113.0] + - - [2944, 704, 1, 128, 2944, 2944, 128, 704] + - [25, 9654.0] + - - [256, 5888, 1, 1280, 256, 256, 1280, 5888] + - [16, 10819.0] + - - [4288, 6784, 1, 3328, 4288, 4288, 3328, 6784] + - [16, 12393.0] + - - [2368, 1408, 1, 128, 2368, 2368, 128, 1408] + - [8, 9454.0] + - - [1408, 1024, 1, 128, 1408, 1408, 128, 1024] + - [25, 9209.0] + - - [6784, 3584, 1, 3328, 6784, 6784, 3328, 3584] + - [16, 12576.0] + - - [2368, 5056, 1, 1280, 2368, 2368, 1280, 5056] + - [3, 12044.0] + - - [1408, 2368, 1, 1280, 1408, 1408, 1280, 2368] + - [29, 11198.0] + - - [2944, 4288, 1, 128, 2944, 2944, 128, 4288] + - [29, 11282.0] + - - [2944, 2944, 1, 256, 2944, 2944, 256, 2944] + - [16, 11892.0] + - - [6784, 256, 1, 256, 6784, 6784, 256, 256] + - [7, 9836.0] + - - [256, 5056, 1, 3328, 256, 256, 3328, 5056] + - [5, 11454.0] + - - [5056, 1856, 1, 128, 5056, 5056, 128, 1856] + - [25, 10790.0] + - - [5888, 1408, 1, 256, 5888, 5888, 256, 1408] + - [10, 11833.0] + - - [4288, 3584, 1, 3328, 4288, 4288, 3328, 3584] + - [16, 12358.0] + - - [1024, 2368, 1, 1280, 1024, 1024, 1280, 2368] + - [16, 10933.0] + - - [5888, 6784, 1, 3328, 5888, 5888, 3328, 6784] + - [16, 12640.0] + - - [704, 4288, 1, 1280, 704, 704, 1280, 4288] + - [26, 10172.0] + - - [6784, 448, 1, 3328, 6784, 6784, 3328, 448] + - [10, 10375.0] + - - [4288, 1024, 1, 128, 4288, 4288, 128, 1024] + - [2, 10163.0] + - - [1920, 2048, 1, 2048, 1920, 1920, 2048, 2048] + - [29, 12326.0] + - - [2880, 3072, 1, 3072, 2880, 2880, 3072, 3072] + - [29, 12127.0] + - - [3840, 4096, 1, 4096, 3840, 3840, 4096, 4096] + - [29, 12599.0] + - - [7680, 8192, 1, 8192, 7680, 7680, 8192, 8192] + - [36, 12367.0] + - - [2048, 2048, 1, 2048, 2048, 2048, 2048, 2048] + - [29, 11818.0] + - - [3072, 3072, 1, 3072, 3072, 3072, 3072, 3072] + - [10, 12456.0] + - - [4096, 4096, 1, 4096, 4096, 4096, 4096, 4096] + - [29, 12466.0] + - - [8192, 8192, 1, 8192, 8192, 8192, 8192, 8192] + - [31, 12322.0] + - - [1152, 1152, 1, 1152, 1152, 1152, 1152, 1152] + - [25, 9234.0] + - - [1536, 1536, 1, 1536, 1536, 1536, 1536, 1536] + - [31, 10752.0] + - - [1920, 1920, 1, 1920, 1920, 1920, 1920, 1920] + - [29, 11607.0] + - - [2304, 2304, 1, 2304, 2304, 2304, 2304, 2304] + - [16, 11934.0] + - - [2688, 2688, 1, 2688, 2688, 2688, 2688, 2688] + - [29, 12062.0] + - - [3456, 3456, 1, 3456, 3456, 3456, 3456, 3456] + - [16, 12427.0] + - - [3840, 3840, 1, 3840, 3840, 3840, 3840, 3840] + - [29, 12596.0] + - - [4224, 4224, 1, 4224, 4224, 4224, 4224, 4224] + - [16, 12526.0] + - - [4608, 4608, 1, 4608, 4608, 4608, 4608, 4608] + - [29, 12597.0] + - - [4992, 4992, 1, 4992, 4992, 4992, 4992, 4992] + - [22, 12541.0] + - - [5376, 5376, 1, 5376, 5376, 5376, 5376, 5376] + - [16, 12613.0] + - - [5760, 5760, 1, 5760, 5760, 5760, 5760, 5760] + - [29, 12620.0] + - - [6144, 6144, 1, 6144, 6144, 6144, 6144, 6144] + - [18, 12563.0] + - - [6528, 6528, 1, 6528, 6528, 6528, 6528, 6528] + - [16, 12496.0] + - - [6912, 6912, 1, 6912, 6912, 6912, 6912, 6912] + - [18, 12551.0] + - - [7296, 7296, 1, 7296, 7296, 7296, 7296, 7296] + - [16, 12370.0] + - - [7680, 7680, 1, 7680, 7680, 7680, 7680, 7680] + - [36, 12397.0] + - - [1856, 448, 1, 3328, 1856, 1856, 3328, 448] + - [51, 9285.0] + - - [128, 6784, 1, 3328, 128, 128, 3328, 6784] + - [64, 9772.0] + - - [2368, 448, 1, 128, 2368, 2368, 128, 448] + - [57, 8092.0] + - - [256, 4288, 1, 3328, 256, 256, 3328, 4288] + - [78, 9953.0] + - - [704, 1856, 1, 3328, 704, 704, 3328, 1856] + - [79, 9094.0] + - - [448, 1024, 1, 1280, 448, 448, 1280, 1024] + - [57, 8280.0] + - - [256, 1408, 1, 3328, 256, 256, 3328, 1408] + - [59, 7190.0] + - - [704, 1856, 1, 1280, 704, 704, 1280, 1856] + - [60, 8853.0] + - - [128, 5056, 1, 128, 128, 128, 128, 5056] + - [44, 6333.0] + - - [2368, 128, 1, 256, 2368, 2368, 256, 128] + - [50, 5125.0] + - - [64, 5056, 1, 256, 64, 64, 256, 5056] + - [84, 5365.0] + - - [256, 2944, 1, 256, 256, 256, 256, 2944] + - [85, 7484.0] + - - [256, 1856, 1, 1280, 256, 256, 1280, 1856] + - [45, 8641.0] + - - [128, 3584, 1, 1280, 128, 128, 1280, 3584] + - [78, 8500.0] + - - [4288, 256, 1, 256, 4288, 4288, 256, 256] + - [69, 7832.0] + - - [2944, 128, 1, 128, 2944, 2944, 128, 128] + - [62, 4345.0] + - - [5888, 64, 1, 3328, 5888, 5888, 3328, 64] + - [53, 5346.0] + - - [2944, 256, 1, 3328, 2944, 2944, 3328, 256] + - [81, 9009.0] + - - [1408, 448, 1, 1280, 1408, 1408, 1280, 448] + - [81, 9048.0] + - - [1408, 704, 1, 3328, 1408, 1408, 3328, 704] + - [61, 8954.0] + - - [6784, 64, 1, 256, 6784, 6784, 256, 64] + - [72, 5801.0] + - - [2944, 256, 1, 256, 2944, 2944, 256, 256] + - [85, 7426.0] + - - [704, 1408, 1, 3328, 704, 704, 3328, 1408] + - [45, 8814.0] + - - [2944, 256, 1, 128, 2944, 2944, 128, 256] + - [59, 7608.0] + - - [448, 2944, 1, 128, 448, 448, 128, 2944] + - [57, 7702.0] + - - [2368, 128, 1, 3328, 2368, 2368, 3328, 128] + - [46, 8604.0] + - - [2944, 128, 1, 256, 2944, 2944, 256, 128] + - [48, 5414.0] + - - [448, 1408, 1, 256, 448, 448, 256, 1408] + - [67, 7360.0] + - - [64, 5056, 1, 3328, 64, 64, 3328, 5056] + - [43, 8569.0] + - - [1024, 448, 1, 128, 1024, 1024, 128, 448] + - [66, 5768.0] + - - [256, 3584, 1, 3328, 256, 256, 3328, 3584] + - [41, 10492.0] + - - [5056, 64, 1, 1280, 5056, 5056, 1280, 64] + - [87, 7575.0] + - - [1024, 704, 1, 256, 1024, 1024, 256, 704] + - [83, 7709.0] + - - [128, 4288, 1, 128, 128, 128, 128, 4288] + - [87, 5855.0] + - - [3584, 256, 1, 128, 3584, 3584, 128, 256] + - [79, 6804.0] + - - [4288, 128, 1, 1280, 4288, 4288, 1280, 128] + - [86, 8424.0] + - - [5888, 64, 1, 256, 5888, 5888, 256, 64] + - [71, 5283.0] + - - [1856, 256, 1, 1280, 1856, 1856, 1280, 256] + - [54, 8473.0] + - - [64, 5888, 1, 3328, 64, 64, 3328, 5888] + - [43, 7393.0] + - - [704, 1024, 1, 1280, 704, 704, 1280, 1024] + - [85, 8416.0] + - - [448, 1856, 1, 128, 448, 448, 128, 1856] + - [78, 6290.0] + - - [1024, 704, 1, 1280, 1024, 1024, 1280, 704] + - [54, 8279.0] + - - [128, 5888, 1, 256, 128, 128, 256, 5888] + - [49, 7824.0] + - - [704, 704, 1, 3328, 704, 704, 3328, 704] + - [79, 8242.0] + - - [704, 1408, 1, 1280, 704, 704, 1280, 1408] + - [85, 8650.0] + - - [3584, 256, 1, 3328, 3584, 3584, 3328, 256] + - [69, 10233.0] + - - [704, 1856, 1, 128, 704, 704, 128, 1856] + - [67, 7934.0] + - - [128, 3584, 1, 3328, 128, 128, 3328, 3584] + - [40, 9173.0] + - - [128, 2944, 1, 1280, 128, 128, 1280, 2944] + - [43, 7199.0] + - - [3584, 128, 1, 256, 3584, 3584, 256, 128] + - [68, 7058.0] + - - [448, 1408, 1, 3328, 448, 448, 3328, 1408] + - [52, 9131.0] + - - [256, 3584, 1, 256, 256, 256, 256, 3584] + - [55, 9139.0] + - - [256, 2944, 1, 3328, 256, 256, 3328, 2944] + - [59, 9363.0] + - - [448, 2368, 1, 128, 448, 448, 128, 2368] + - [76, 8092.0] + - - [1408, 704, 1, 256, 1408, 1408, 256, 704] + - [82, 7803.0] + - - [448, 2944, 1, 3328, 448, 448, 3328, 2944] + - [60, 9195.0] + - - [64, 5888, 1, 256, 64, 64, 256, 5888] + - [75, 6083.0] + - - [6784, 128, 1, 3328, 6784, 6784, 3328, 128] + - [74, 9654.0] + - - [704, 704, 1, 256, 704, 704, 256, 704] + - [41, 6480.0] + - - [128, 4288, 1, 3328, 128, 128, 3328, 4288] + - [41, 9187.0] + - - [448, 704, 1, 1280, 448, 448, 1280, 704] + - [43, 7631.0] + - - [1024, 448, 1, 3328, 1024, 1024, 3328, 448] + - [63, 8969.0] + - - [1856, 704, 1, 1280, 1856, 1856, 1280, 704] + - [63, 8913.0] + - - [448, 1408, 1, 1280, 448, 448, 1280, 1408] + - [61, 9039.0] + - - [1024, 1024, 1, 1280, 1024, 1024, 1280, 1024] + - [63, 9018.0] + - - [448, 1024, 1, 128, 448, 448, 128, 1024] + - [77, 5931.0] + - - [448, 2368, 1, 3328, 448, 448, 3328, 2368] + - [41, 9189.0] + - - [5056, 64, 1, 128, 5056, 5056, 128, 64] + - [62, 5113.0] + - - [704, 1024, 1, 256, 704, 704, 256, 1024] + - [85, 7226.0] + - - [128, 6784, 1, 1280, 128, 128, 1280, 6784] + - [60, 9555.0] + - - [1856, 256, 1, 256, 1856, 1856, 256, 256] + - [49, 6443.0] + - - [256, 4288, 1, 1280, 256, 256, 1280, 4288] + - [54, 9437.0] + - - [256, 1856, 1, 128, 256, 256, 128, 1856] + - [68, 5172.0] + - - [448, 1408, 1, 128, 448, 448, 128, 1408] + - [76, 6117.0] + - - [6784, 128, 1, 256, 6784, 6784, 256, 128] + - [51, 8161.0] + - - [704, 448, 1, 256, 704, 704, 256, 448] + - [67, 5340.0] + - - [704, 1408, 1, 128, 704, 704, 128, 1408] + - [38, 6866.0] + - - [2944, 448, 1, 128, 2944, 2944, 128, 448] + - [78, 7605.0] + - - [128, 2944, 1, 128, 128, 128, 128, 2944] + - [56, 4425.0] + - - [1024, 704, 1, 3328, 1024, 1024, 3328, 704] + - [81, 8715.0] + - - [128, 4288, 1, 256, 128, 128, 256, 4288] + - [59, 6439.0] + - - [704, 448, 1, 3328, 704, 704, 3328, 448] + - [61, 8232.0] + - - [1024, 1024, 1, 3328, 1024, 1024, 3328, 1024] + - [88, 9309.0] + - - [448, 2368, 1, 1280, 448, 448, 1280, 2368] + - [79, 8913.0] + - - [64, 6784, 1, 3328, 64, 64, 3328, 6784] + - [61, 8419.0] + - - [2944, 256, 1, 1280, 2944, 2944, 1280, 256] + - [54, 8914.0] + - - [256, 2368, 1, 128, 256, 256, 128, 2368] + - [59, 6965.0] + - - [1856, 704, 1, 256, 1856, 1856, 256, 704] + - [51, 8147.0] + - - [1408, 448, 1, 3328, 1408, 1408, 3328, 448] + - [81, 9445.0] + - - [2368, 256, 1, 256, 2368, 2368, 256, 256] + - [86, 7073.0] + - - [1856, 448, 1, 1280, 1856, 1856, 1280, 448] + - [69, 9003.0] + - - [128, 5888, 1, 128, 128, 128, 128, 5888] + - [68, 6129.0] + - - [1024, 1024, 1, 256, 1024, 1024, 256, 1024] + - [49, 8234.0] + - - [704, 1856, 1, 256, 704, 704, 256, 1856] + - [85, 8198.0] + - - [64, 6784, 1, 256, 64, 64, 256, 6784] + - [43, 5906.0] + - - [256, 2368, 1, 1280, 256, 256, 1280, 2368] + - [41, 9288.0] + - - [2944, 448, 1, 256, 2944, 2944, 256, 448] + - [54, 7975.0] + - - [1856, 448, 1, 128, 1856, 1856, 128, 448] + - [86, 6711.0] + - - [2368, 128, 1, 1280, 2368, 2368, 1280, 128] + - [55, 7993.0] + - - [2368, 256, 1, 128, 2368, 2368, 128, 256] + - [51, 6795.0] + - - [64, 5056, 1, 1280, 64, 64, 1280, 5056] + - [61, 7759.0] + - - [2368, 256, 1, 1280, 2368, 2368, 1280, 256] + - [86, 9360.0] + - - [2368, 448, 1, 1280, 2368, 2368, 1280, 448] + - [72, 9061.0] + - - [128, 3584, 1, 256, 128, 128, 256, 3584] + - [59, 6517.0] + - - [704, 448, 1, 1280, 704, 704, 1280, 448] + - [80, 7606.0] + - - [128, 5056, 1, 256, 128, 128, 256, 5056] + - [85, 7531.0] + - - [4288, 256, 1, 1280, 4288, 4288, 1280, 256] + - [63, 9407.0] + - - [4288, 128, 1, 3328, 4288, 4288, 3328, 128] + - [69, 8798.0] + - - [1408, 256, 1, 128, 1408, 1408, 128, 256] + - [66, 4336.0] + - - [256, 1408, 1, 1280, 256, 256, 1280, 1408] + - [59, 6839.0] + - - [128, 2368, 1, 256, 128, 128, 256, 2368] + - [49, 5503.0] + - - [6784, 64, 1, 3328, 6784, 6784, 3328, 64] + - [54, 8228.0] + - - [128, 2944, 1, 3328, 128, 128, 3328, 2944] + - [49, 7603.0] + - - [2944, 448, 1, 3328, 2944, 2944, 3328, 448] + - [81, 9203.0] + - - [256, 4288, 1, 256, 256, 256, 256, 4288] + - [49, 8145.0] + - - [5888, 128, 1, 256, 5888, 5888, 256, 128] + - [85, 7342.0] + - - [2368, 448, 1, 3328, 2368, 2368, 3328, 448] + - [63, 9343.0] + - - [5056, 64, 1, 256, 5056, 5056, 256, 64] + - [48, 5365.0] + - - [1024, 704, 1, 128, 1024, 1024, 128, 704] + - [47, 6143.0] + - - [128, 5056, 1, 3328, 128, 128, 3328, 5056] + - [41, 10658.0] + - - [704, 1024, 1, 128, 704, 704, 128, 1024] + - [48, 6554.0] + - - [4288, 128, 1, 256, 4288, 4288, 256, 128] + - [70, 6290.0] + - - [1408, 448, 1, 128, 1408, 1408, 128, 448] + - [83, 6034.0] + - - [128, 5888, 1, 1280, 128, 128, 1280, 5888] + - [59, 9016.0] + - - [704, 448, 1, 128, 704, 704, 128, 448] + - [39, 4304.0] + - - [3584, 256, 1, 256, 3584, 3584, 256, 256] + - [89, 8317.0] + - - [128, 2944, 1, 256, 128, 128, 256, 2944] + - [85, 5570.0] + - - [128, 6784, 1, 128, 128, 128, 128, 6784] + - [51, 6485.0] + - - [448, 1856, 1, 256, 448, 448, 256, 1856] + - [86, 7619.0] + - - [3584, 128, 1, 3328, 3584, 3584, 3328, 128] + - [88, 8795.0] + - - [1024, 448, 1, 1280, 1024, 1024, 1280, 448] + - [81, 8403.0] + - - [5888, 128, 1, 3328, 5888, 5888, 3328, 128] + - [46, 8555.0] + - - [1408, 704, 1, 1280, 1408, 1408, 1280, 704] + - [61, 8654.0] + - - [448, 2944, 1, 256, 448, 448, 256, 2944] + - [67, 8366.0] + - - [448, 2368, 1, 256, 448, 448, 256, 2368] + - [60, 7768.0] + - - [128, 2368, 1, 3328, 128, 128, 3328, 2368] + - [46, 8858.0] + - - [5056, 128, 1, 1280, 5056, 5056, 1280, 128] + - [86, 9940.0] + - - [5056, 64, 1, 3328, 5056, 5056, 3328, 64] + - [52, 8611.0] + - - [64, 5888, 1, 128, 64, 64, 128, 5888] + - [75, 4542.0] + - - [5056, 128, 1, 3328, 5056, 5056, 3328, 128] + - [86, 10427.0] + - - [448, 704, 1, 256, 448, 448, 256, 704] + - [67, 5426.0] + - - [2944, 128, 1, 3328, 2944, 2944, 3328, 128] + - [72, 7349.0] + - - [128, 5056, 1, 1280, 128, 128, 1280, 5056] + - [79, 9930.0] + - - [704, 704, 1, 128, 704, 704, 128, 704] + - [83, 5051.0] + - - [64, 6784, 1, 1280, 64, 64, 1280, 6784] + - [52, 7874.0] + - - [2368, 128, 1, 128, 2368, 2368, 128, 128] + - [65, 4522.0] + - - [5056, 128, 1, 128, 5056, 5056, 128, 128] + - [67, 5942.0] + - - [1024, 1024, 1, 1024, 1024, 1024, 1024, 1024] + - [81, 8966.0] + - - [448, 1024, 1, 3328, 448, 448, 3328, 1024] + - [61, 8732.0] + - - [256, 2368, 1, 3328, 256, 256, 3328, 2368] + - [60, 10026.0] + - - [256, 3584, 1, 128, 256, 256, 128, 3584] + - [69, 6892.0] + - - [4288, 256, 1, 128, 4288, 4288, 128, 256] + - [69, 7280.0] + - - [256, 1856, 1, 256, 256, 256, 256, 1856] + - [45, 6456.0] + - - [256, 2944, 1, 128, 256, 256, 128, 2944] + - [41, 6718.0] + - - [1408, 256, 1, 3328, 1408, 1408, 3328, 256] + - [76, 7258.0] + - - [2368, 448, 1, 256, 2368, 2368, 256, 448] + - [69, 7624.0] + - - [4288, 256, 1, 3328, 4288, 4288, 3328, 256] + - [81, 9641.0] + - - [1856, 704, 1, 128, 1856, 1856, 128, 704] + - [57, 7845.0] + - - [4288, 128, 1, 128, 4288, 4288, 128, 128] + - [83, 5282.0] + - - [1408, 448, 1, 256, 1408, 1408, 256, 448] + - [63, 7190.0] + - - [6784, 64, 1, 1280, 6784, 6784, 1280, 64] + - [72, 7946.0] + - - [3584, 128, 1, 128, 3584, 3584, 128, 128] + - [47, 5071.0] + - - [256, 2368, 1, 256, 256, 256, 256, 2368] + - [88, 6991.0] + - - [2944, 448, 1, 1280, 2944, 2944, 1280, 448] + - [81, 8991.0] + - - [448, 1856, 1, 1280, 448, 448, 1280, 1856] + - [79, 8972.0] + - - [1856, 256, 1, 128, 1856, 1856, 128, 256] + - [57, 5270.0] + - - [5056, 128, 1, 256, 5056, 5056, 256, 128] + - [54, 7324.0] + - - [448, 1024, 1, 256, 448, 448, 256, 1024] + - [67, 6561.0] + - - [64, 6784, 1, 128, 64, 64, 128, 6784] + - [73, 4511.0] + - - [5888, 64, 1, 1280, 5888, 5888, 1280, 64] + - [52, 6901.0] + - - [128, 3584, 1, 128, 128, 128, 128, 3584] + - [59, 5036.0] + - - [1408, 256, 1, 256, 1408, 1408, 256, 256] + - [85, 5291.0] + - - [128, 5888, 1, 3328, 128, 128, 3328, 5888] + - [45, 9069.0] + - - [1408, 256, 1, 1280, 1408, 1408, 1280, 256] + - [76, 6829.0] + - - [1024, 1024, 1, 128, 1024, 1024, 128, 1024] + - [40, 7086.0] + - - [64, 5056, 1, 128, 64, 64, 128, 5056] + - [70, 4542.0] + - - [5888, 64, 1, 128, 5888, 5888, 128, 64] + - [84, 4377.0] + - - [448, 704, 1, 128, 448, 448, 128, 704] + - [58, 4223.0] + - - [1408, 704, 1, 128, 1408, 1408, 128, 704] + - [59, 6971.0] + - - [2368, 256, 1, 3328, 2368, 2368, 3328, 256] + - [86, 9925.0] + - - [5888, 128, 1, 1280, 5888, 5888, 1280, 128] + - [54, 8632.0] + - - [256, 3584, 1, 1280, 256, 256, 1280, 3584] + - [41, 10045.0] + - - [256, 1408, 1, 128, 256, 256, 128, 1408] + - [42, 4241.0] + - - [256, 4288, 1, 128, 256, 256, 128, 4288] + - [78, 7206.0] + - - [5888, 128, 1, 128, 5888, 5888, 128, 128] + - [67, 6240.0] + - - [1856, 256, 1, 3328, 1856, 1856, 3328, 256] + - [72, 9078.0] + - - [64, 5888, 1, 1280, 64, 64, 1280, 5888] + - [80, 6948.0] + - - [6784, 64, 1, 128, 6784, 6784, 128, 64] + - [83, 4654.0] + - - [704, 704, 1, 1280, 704, 704, 1280, 704] + - [79, 7748.0] + - - [128, 2368, 1, 1280, 128, 128, 1280, 2368] + - [46, 7747.0] + - - [3584, 256, 1, 1280, 3584, 3584, 1280, 256] + - [74, 9914.0] + - - [3584, 128, 1, 1280, 3584, 3584, 1280, 128] + - [54, 8322.0] + - - [448, 1856, 1, 3328, 448, 448, 3328, 1856] + - [60, 9451.0] + - - [1024, 448, 1, 256, 1024, 1024, 256, 448] + - [49, 6598.0] + - - [2944, 128, 1, 1280, 2944, 2944, 1280, 128] + - [88, 7070.0] + - - [128, 2368, 1, 128, 128, 128, 128, 2368] + - [42, 5215.0] + - - [256, 2944, 1, 1280, 256, 256, 1280, 2944] + - [57, 9007.0] + - - [704, 1024, 1, 3328, 704, 704, 3328, 1024] + - [68, 8925.0] + - - [128, 6784, 1, 256, 128, 128, 256, 6784] + - [46, 8078.0] + - - [256, 1856, 1, 3328, 256, 256, 3328, 1856] + - [81, 9178.0] + - - [6784, 128, 1, 128, 6784, 6784, 128, 128] + - [51, 6688.0] + - - [704, 1408, 1, 256, 704, 704, 256, 1408] + - [68, 7827.0] + - - [256, 1408, 1, 256, 256, 256, 256, 1408] + - [40, 5359.0] + - - [448, 2944, 1, 1280, 448, 448, 1280, 2944] + - [60, 9006.0] + - - [6784, 128, 1, 1280, 6784, 6784, 1280, 128] + - [86, 9615.0] + - - [1856, 448, 1, 256, 1856, 1856, 256, 448] + - [87, 6732.0] + - - [128, 4288, 1, 1280, 128, 128, 1280, 4288] + - [60, 8853.0] + - - [448, 704, 1, 3328, 448, 448, 3328, 704] + - [70, 8304.0] + - - [1856, 704, 1, 3328, 1856, 1856, 3328, 704] + - [86, 9154.0] + - - [960, 1024, 1, 1024, 960, 960, 1024, 1024] + - [85, 8373.0] + - - [768, 768, 1, 768, 768, 768, 768, 768] + - [51, 8718.0] + - - [1024, 128, 1, 128, 1024, 1024, 128, 128] + - [115, 2589.0] + - - [2368, 64, 1, 3328, 2368, 2368, 3328, 64] + - [124, 4478.0] + - - [1408, 64, 1, 128, 1408, 1408, 128, 64] + - [107, 1797.0] + - - [1408, 64, 1, 1280, 1408, 1408, 1280, 64] + - [101, 3101.0] + - - [2944, 64, 1, 256, 2944, 2944, 256, 64] + - [124, 2941.0] + - - [1024, 256, 1, 3328, 1024, 1024, 3328, 256] + - [105, 3988.0] + - - [1856, 64, 1, 1280, 1856, 1856, 1280, 64] + - [103, 3572.0] + - - [704, 128, 1, 1280, 704, 704, 1280, 128] + - [128, 3039.0] + - - [4288, 64, 1, 3328, 4288, 4288, 3328, 64] + - [124, 4138.0] + - - [4288, 64, 1, 256, 4288, 4288, 256, 64] + - [140, 3831.0] + - - [64, 3584, 1, 3328, 64, 64, 3328, 3584] + - [95, 4549.0] + - - [704, 256, 1, 128, 704, 704, 128, 256] + - [95, 3036.0] + - - [128, 1408, 1, 128, 128, 128, 128, 1408] + - [131, 3035.0] + - - [4288, 64, 1, 1280, 4288, 4288, 1280, 64] + - [124, 4102.0] + - - [1024, 256, 1, 256, 1024, 1024, 256, 256] + - [124, 3732.0] + - - [448, 448, 1, 256, 448, 448, 256, 448] + - [115, 3713.0] + - - [128, 1024, 1, 3328, 128, 128, 3328, 1024] + - [95, 3876.0] + - - [64, 1856, 1, 1280, 64, 64, 1280, 1856] + - [130, 3811.0] + - - [256, 1024, 1, 256, 256, 256, 256, 1024] + - [115, 3741.0] + - - [1024, 128, 1, 1280, 1024, 1024, 1280, 128] + - [95, 3808.0] + - - [448, 256, 1, 3328, 448, 448, 3328, 256] + - [130, 3761.0] + - - [128, 1024, 1, 128, 128, 128, 128, 1024] + - [140, 2815.0] + - - [128, 704, 1, 1280, 128, 128, 1280, 704] + - [128, 3218.0] + - - [1856, 128, 1, 3328, 1856, 1856, 3328, 128] + - [140, 4697.0] + - - [64, 2944, 1, 128, 64, 64, 128, 2944] + - [115, 3148.0] + - - [448, 448, 1, 3328, 448, 448, 3328, 448] + - [105, 3965.0] + - - [1408, 128, 1, 1280, 1408, 1408, 1280, 128] + - [128, 3620.0] + - - [128, 1856, 1, 1280, 128, 128, 1280, 1856] + - [95, 4635.0] + - - [256, 448, 1, 256, 256, 256, 256, 448] + - [122, 3171.0] + - - [128, 1856, 1, 128, 128, 128, 128, 1856] + - [95, 3889.0] + - - [64, 1408, 1, 3328, 64, 64, 3328, 1408] + - [112, 3285.0] + - - [128, 1408, 1, 256, 128, 128, 256, 1408] + - [131, 3343.0] + - - [4288, 64, 1, 128, 4288, 4288, 128, 64] + - [105, 3563.0] + - - [256, 448, 1, 3328, 256, 256, 3328, 448] + - [130, 3743.0] + - - [64, 2368, 1, 1280, 64, 64, 1280, 2368] + - [95, 4154.0] + - - [2368, 64, 1, 256, 2368, 2368, 256, 64] + - [105, 3154.0] + - - [1408, 128, 1, 128, 1408, 1408, 128, 128] + - [103, 2349.0] + - - [1024, 256, 1, 128, 1024, 1024, 128, 256] + - [105, 2796.0] + - - [2944, 64, 1, 128, 2944, 2944, 128, 64] + - [103, 2364.0] + - - [1856, 64, 1, 256, 1856, 1856, 256, 64] + - [122, 2534.0] + - - [704, 128, 1, 256, 704, 704, 256, 128] + - [128, 2615.0] + - - [448, 256, 1, 1280, 448, 448, 1280, 256] + - [130, 3502.0] + - - [1856, 128, 1, 1280, 1856, 1856, 1280, 128] + - [124, 4512.0] + - - [64, 3584, 1, 256, 64, 64, 256, 3584] + - [105, 3529.0] + - - [3584, 64, 1, 128, 3584, 3584, 128, 64] + - [140, 2796.0] + - - [256, 1024, 1, 1280, 256, 256, 1280, 1024] + - [95, 3892.0] + - - [3584, 64, 1, 1280, 3584, 3584, 1280, 64] + - [105, 4338.0] + - - [128, 1856, 1, 3328, 128, 128, 3328, 1856] + - [95, 4635.0] + - - [64, 2944, 1, 3328, 64, 64, 3328, 2944] + - [124, 3692.0] + - - [64, 4288, 1, 3328, 64, 64, 3328, 4288] + - [105, 4091.0] + - - [64, 1856, 1, 256, 64, 64, 256, 1856] + - [122, 3249.0] + - - [256, 704, 1, 256, 256, 256, 256, 704] + - [128, 3358.0] + - - [2368, 64, 1, 128, 2368, 2368, 128, 64] + - [105, 3222.0] + - - [64, 1408, 1, 128, 64, 64, 128, 1408] + - [101, 1506.0] + - - [704, 256, 1, 3328, 704, 704, 3328, 256] + - [112, 3715.0] + - - [64, 2944, 1, 256, 64, 64, 256, 2944] + - [105, 2927.0] + - - [448, 256, 1, 128, 448, 448, 128, 256] + - [130, 1858.0] + - - [704, 128, 1, 3328, 704, 704, 3328, 128] + - [128, 3258.0] + - - [128, 704, 1, 128, 128, 128, 128, 704] + - [91, 1502.0] + - - [256, 448, 1, 1280, 256, 256, 1280, 448] + - [130, 3497.0] + - - [704, 256, 1, 1280, 704, 704, 1280, 256] + - [128, 3561.0] + - - [64, 2368, 1, 3328, 64, 64, 3328, 2368] + - [95, 4369.0] + - - [1856, 64, 1, 128, 1856, 1856, 128, 64] + - [103, 1915.0] + - - [704, 128, 1, 128, 704, 704, 128, 128] + - [120, 1542.0] + - - [256, 704, 1, 3328, 256, 256, 3328, 704] + - [91, 3670.0] + - - [256, 448, 1, 128, 256, 256, 128, 448] + - [138, 1887.0] + - - [64, 3584, 1, 128, 64, 64, 128, 3584] + - [95, 2913.0] + - - [1024, 128, 1, 256, 1024, 1024, 256, 128] + - [124, 2737.0] + - - [2944, 64, 1, 1280, 2944, 2944, 1280, 64] + - [105, 3599.0] + - - [128, 1408, 1, 3328, 128, 128, 3328, 1408] + - [91, 3639.0] + - - [1408, 64, 1, 256, 1408, 1408, 256, 64] + - [120, 2016.0] + - - [64, 1856, 1, 128, 64, 64, 128, 1856] + - [103, 1969.0] + - - [64, 2368, 1, 256, 64, 64, 256, 2368] + - [95, 3099.0] + - - [1024, 128, 1, 3328, 1024, 1024, 3328, 128] + - [140, 3807.0] + - - [1856, 128, 1, 128, 1856, 1856, 128, 128] + - [105, 3035.0] + - - [2368, 64, 1, 1280, 2368, 2368, 1280, 64] + - [124, 4359.0] + - - [128, 1024, 1, 1280, 128, 128, 1280, 1024] + - [95, 3599.0] + - - [64, 4288, 1, 1280, 64, 64, 1280, 4288] + - [95, 3978.0] + - - [1408, 64, 1, 3328, 1408, 1408, 3328, 64] + - [120, 3173.0] + - - [64, 2944, 1, 1280, 64, 64, 1280, 2944] + - [115, 3689.0] + - - [256, 704, 1, 128, 256, 256, 128, 704] + - [140, 2325.0] + - - [256, 1024, 1, 128, 256, 256, 128, 1024] + - [105, 2801.0] + - - [64, 1408, 1, 1280, 64, 64, 1280, 1408] + - [112, 2976.0] + - - [448, 448, 1, 1280, 448, 448, 1280, 448] + - [115, 3885.0] + - - [1024, 256, 1, 1280, 1024, 1024, 1280, 256] + - [115, 3999.0] + - - [128, 1024, 1, 256, 128, 128, 256, 1024] + - [124, 2676.0] + - - [3584, 64, 1, 3328, 3584, 3584, 3328, 64] + - [105, 4585.0] + - - [1408, 128, 1, 256, 1408, 1408, 256, 128] + - [120, 2831.0] + - - [256, 1024, 1, 3328, 256, 256, 3328, 1024] + - [95, 3995.0] + - - [1856, 64, 1, 3328, 1856, 1856, 3328, 64] + - [120, 3874.0] + - - [448, 256, 1, 256, 448, 448, 256, 256] + - [138, 2518.0] + - - [128, 704, 1, 256, 128, 128, 256, 704] + - [120, 2144.0] + - - [64, 3584, 1, 1280, 64, 64, 1280, 3584] + - [95, 4305.0] + - - [3584, 64, 1, 256, 3584, 3584, 256, 64] + - [124, 3537.0] + - - [64, 1856, 1, 3328, 64, 64, 3328, 1856] + - [130, 3834.0] + - - [1408, 128, 1, 3328, 1408, 1408, 3328, 128] + - [128, 3661.0] + - - [128, 704, 1, 3328, 128, 128, 3328, 704] + - [112, 3241.0] + - - [128, 1856, 1, 256, 128, 128, 256, 1856] + - [124, 3655.0] + - - [64, 4288, 1, 256, 64, 64, 256, 4288] + - [95, 3407.0] + - - [256, 704, 1, 1280, 256, 256, 1280, 704] + - [120, 3669.0] + - - [64, 2368, 1, 128, 64, 64, 128, 2368] + - [105, 2343.0] + - - [64, 4288, 1, 128, 64, 64, 128, 4288] + - [95, 2894.0] + - - [1856, 128, 1, 256, 1856, 1856, 256, 128] + - [140, 3664.0] + - - [64, 1408, 1, 256, 64, 64, 256, 1408] + - [120, 2063.0] + - - [2944, 64, 1, 3328, 2944, 2944, 3328, 64] + - [105, 3733.0] + - - [128, 1408, 1, 1280, 128, 128, 1280, 1408] + - [101, 3537.0] + - - [448, 448, 1, 128, 448, 448, 128, 448] + - [93, 2564.0] + - - [704, 256, 1, 256, 704, 704, 256, 256] + - [124, 2869.0] + - - [512, 512, 1, 512, 512, 512, 512, 512] + - [131, 3659.0] + - - [384, 384, 1, 384, 384, 384, 384, 384] + - [124, 3403.0] + - - [3584, 4, 1, 1280, 3584, 3584, 1280, 4] + - [142, 620.0] + - - [2944, 4, 1, 256, 2944, 2944, 256, 4] + - [142, 442.0] + - - [2368, 4, 1, 1280, 2368, 2368, 1280, 4] + - [142, 473.0] + - - [6784, 4, 1, 1280, 6784, 6784, 1280, 4] + - [148, 655.0] + - - [1856, 4, 1, 1280, 1856, 1856, 1280, 4] + - [144, 411.0] + - - [2944, 4, 1, 128, 2944, 2944, 128, 4] + - [147, 232.0] + - - [3584, 4, 1, 128, 3584, 3584, 128, 4] + - [144, 282.0] + - - [4288, 4, 1, 256, 4288, 4288, 256, 4] + - [147, 399.0] + - - [3584, 4, 1, 3328, 3584, 3584, 3328, 4] + - [144, 652.0] + - - [5888, 4, 1, 128, 5888, 5888, 128, 4] + - [144, 371.0] + - - [2368, 4, 1, 256, 2368, 2368, 256, 4] + - [144, 292.0] + - - [1408, 4, 1, 256, 1408, 1408, 256, 4] + - [145, 225.0] + - - [5056, 4, 1, 1280, 5056, 5056, 1280, 4] + - [145, 634.0] + - - [1408, 4, 1, 3328, 1408, 1408, 3328, 4] + - [137, 314.0] + - - [6784, 4, 1, 128, 6784, 6784, 128, 4] + - [147, 399.0] + - - [5888, 4, 1, 3328, 5888, 5888, 3328, 4] + - [147, 413.0] + - - [5056, 4, 1, 128, 5056, 5056, 128, 4] + - [144, 336.0] + - - [5888, 4, 1, 1280, 5888, 5888, 1280, 4] + - [144, 592.0] + - - [2944, 4, 1, 3328, 2944, 2944, 3328, 4] + - [147, 569.0] + - - [2368, 4, 1, 128, 2368, 2368, 128, 4] + - [142, 196.0] + - - [1856, 4, 1, 128, 1856, 1856, 128, 4] + - [144, 168.0] + - - [1408, 4, 1, 1280, 1408, 1408, 1280, 4] + - [147, 320.0] + - - [6784, 4, 1, 256, 6784, 6784, 256, 4] + - [146, 503.0] + - - [4288, 4, 1, 128, 4288, 4288, 128, 4] + - [142, 479.0] + - - [1856, 4, 1, 3328, 1856, 1856, 3328, 4] + - [144, 424.0] + - - [3584, 4, 1, 256, 3584, 3584, 256, 4] + - [142, 478.0] + - - [2368, 4, 1, 3328, 2368, 2368, 3328, 4] + - [147, 527.0] + - - [6784, 4, 1, 3328, 6784, 6784, 3328, 4] + - [143, 380.0] + - - [4288, 4, 1, 1280, 4288, 4288, 1280, 4] + - [142, 566.0] + - - [1856, 4, 1, 256, 1856, 1856, 256, 4] + - [147, 256.0] + - - [1408, 4, 1, 128, 1408, 1408, 128, 4] + - [142, 123.0] + - - [5056, 4, 1, 256, 5056, 5056, 256, 4] + - [147, 431.0] + - - [4288, 4, 1, 3328, 4288, 4288, 3328, 4] + - [147, 627.0] + - - [2944, 4, 1, 1280, 2944, 2944, 1280, 4] + - [144, 521.0] + - - [5888, 4, 1, 256, 5888, 5888, 256, 4] + - [147, 446.0] + - - [5056, 4, 1, 3328, 5056, 5056, 3328, 4] + - [148, 701.0] + - - [4, 1856, 1, 3328, 4, 4, 3328, 1856] + - [153, 708.0] + - - [4, 2944, 1, 1280, 4, 4, 1280, 2944] + - [158, 735.0] + - - [4, 1408, 1, 128, 4, 4, 128, 1408] + - [125, 205.0] + - - [4, 2368, 1, 1280, 4, 4, 1280, 2368] + - [161, 755.0] + - - [4, 3584, 1, 128, 4, 4, 128, 3584] + - [151, 480.0] + - - [4, 5888, 1, 3328, 4, 4, 3328, 5888] + - [163, 616.0] + - - [4, 1408, 1, 3328, 4, 4, 3328, 1408] + - [161, 512.0] + - - [4, 6784, 1, 3328, 4, 4, 3328, 6784] + - [149, 497.0] + - - [4, 4288, 1, 128, 4, 4, 128, 4288] + - [111, 345.0] + - - [4, 5056, 1, 3328, 4, 4, 3328, 5056] + - [156, 977.0] + - - [4, 6784, 1, 1280, 4, 4, 1280, 6784] + - [150, 847.0] + - - [4, 2944, 1, 3328, 4, 4, 3328, 2944] + - [161, 817.0] + - - [4, 5056, 1, 256, 4, 4, 256, 5056] + - [151, 787.0] + - - [4, 5056, 1, 1280, 4, 4, 1280, 5056] + - [152, 969.0] + - - [4, 2368, 1, 3328, 4, 4, 3328, 2368] + - [153, 826.0] + - - [4, 1856, 1, 256, 4, 4, 256, 1856] + - [106, 348.0] + - - [4, 2368, 1, 256, 4, 4, 256, 2368] + - [152, 452.0] + - - [4, 2944, 1, 256, 4, 4, 256, 2944] + - [150, 373.0] + - - [4, 4288, 1, 1280, 4, 4, 1280, 4288] + - [155, 801.0] + - - [4, 6784, 1, 128, 4, 4, 128, 6784] + - [150, 462.0] + - - [4, 3584, 1, 1280, 4, 4, 1280, 3584] + - [160, 780.0] + - - [4, 5888, 1, 256, 4, 4, 256, 5888] + - [149, 613.0] + - - [4, 6784, 1, 256, 4, 4, 256, 6784] + - [159, 617.0] + - - [4, 1408, 1, 1280, 4, 4, 1280, 1408] + - [161, 419.0] + - - [4, 3584, 1, 256, 4, 4, 256, 3584] + - [157, 450.0] + - - [4, 1408, 1, 256, 4, 4, 256, 1408] + - [125, 191.0] + - - [4, 4288, 1, 3328, 4, 4, 3328, 4288] + - [154, 893.0] + - - [4, 5888, 1, 1280, 4, 4, 1280, 5888] + - [162, 944.0] + - - [4, 1856, 1, 1280, 4, 4, 1280, 1856] + - [161, 552.0] + - - [4, 1856, 1, 128, 4, 4, 128, 1856] + - [149, 242.0] + - - [4, 2944, 1, 128, 4, 4, 128, 2944] + - [150, 236.0] + - - [4, 5056, 1, 128, 4, 4, 128, 5056] + - [156, 398.0] + - - [4, 4288, 1, 256, 4, 4, 256, 4288] + - [150, 496.0] + - - [4, 3584, 1, 3328, 4, 4, 3328, 3584] + - [155, 920.0] + - - [4, 5888, 1, 128, 4, 4, 128, 5888] + - [162, 442.0] + - - [4, 2368, 1, 128, 4, 4, 128, 2368] + - [152, 199.0] + - - [4, 704, 1, 1280, 4, 4, 1280, 704] + - [96, 216.0] + - - [128, 64, 1, 256, 128, 128, 256, 64] + - [102, 507.0] + - - [64, 4, 1, 256, 64, 64, 256, 4] + - [106, 14.0] + - - [64, 704, 1, 128, 64, 64, 128, 704] + - [108, 1182.0] + - - [448, 64, 1, 1280, 448, 448, 1280, 64] + - [116, 1617.0] + - - [128, 4, 1, 1280, 128, 128, 1280, 4] + - [99, 39.0] + - - [64, 1024, 1, 1280, 64, 64, 1280, 1024] + - [120, 2704.0] + - - [64, 704, 1, 1280, 64, 64, 1280, 704] + - [111, 2015.0] + - - [1024, 64, 1, 128, 1024, 1024, 128, 64] + - [122, 1202.0] + - - [64, 1024, 1, 3328, 64, 64, 3328, 1024] + - [133, 3015.0] + - - [1024, 64, 1, 1280, 1024, 1024, 1280, 64] + - [101, 2794.0] + - - [4, 704, 1, 256, 4, 4, 256, 704] + - [106, 102.0] + - - [704, 4, 1, 1280, 704, 704, 1280, 4] + - [106, 206.0] + - - [64, 448, 1, 256, 64, 64, 256, 448] + - [100, 1011.0] + - - [64, 1024, 1, 128, 64, 64, 128, 1024] + - [136, 1192.0] + - - [4, 64, 1, 1280, 4, 4, 1280, 64] + - [99, 20.0] + - - [128, 256, 1, 3328, 128, 128, 3328, 256] + - [114, 1862.0] + - - [64, 448, 1, 1280, 64, 64, 1280, 448] + - [96, 1611.0] + - - [448, 4, 1, 256, 448, 448, 256, 4] + - [100, 65.0] + - - [448, 4, 1, 1280, 448, 448, 1280, 4] + - [106, 137.0] + - - [128, 4, 1, 128, 128, 128, 128, 4] + - [92, 11.0] + - - [256, 4, 1, 128, 256, 256, 128, 4] + - [119, 23.0] + - - [704, 64, 1, 3328, 704, 704, 3328, 64] + - [136, 2152.0] + - - [64, 128, 1, 256, 64, 64, 256, 128] + - [119, 321.0] + - - [704, 64, 1, 128, 704, 704, 128, 64] + - [109, 1052.0] + - - [1024, 4, 1, 256, 1024, 1024, 256, 4] + - [100, 142.0] + - - [256, 256, 1, 128, 256, 256, 128, 256] + - [120, 1219.0] + - - [64, 256, 1, 128, 64, 64, 128, 256] + - [98, 373.0] + - - [704, 64, 1, 1280, 704, 704, 1280, 64] + - [111, 1952.0] + - - [128, 448, 1, 256, 128, 128, 256, 448] + - [101, 1545.0] + - - [128, 256, 1, 1280, 128, 128, 1280, 256] + - [119, 1686.0] + - - [448, 64, 1, 3328, 448, 448, 3328, 64] + - [116, 1804.0] + - - [256, 128, 1, 128, 256, 256, 128, 128] + - [104, 728.0] + - - [64, 128, 1, 3328, 64, 64, 3328, 128] + - [134, 749.0] + - - [128, 128, 1, 3328, 128, 128, 3328, 128] + - [98, 1300.0] + - - [256, 128, 1, 256, 256, 256, 256, 128] + - [100, 1046.0] + - - [64, 448, 1, 3328, 64, 64, 3328, 448] + - [96, 1890.0] + - - [256, 256, 1, 3328, 256, 256, 3328, 256] + - [91, 3122.0] + - - [1024, 4, 1, 3328, 1024, 1024, 3328, 4] + - [141, 312.0] + - - [4, 4, 1, 256, 4, 4, 256, 4] + - [90, 1.0] + - - [256, 64, 1, 256, 256, 256, 256, 64] + - [135, 613.0] + - - [256, 128, 1, 1280, 256, 256, 1280, 128] + - [119, 1722.0] + - - [128, 64, 1, 1280, 128, 128, 1280, 64] + - [110, 603.0] + - - [4, 448, 1, 3328, 4, 4, 3328, 448] + - [134, 164.0] + - - [64, 1024, 1, 256, 64, 64, 256, 1024] + - [124, 1691.0] + - - [256, 4, 1, 1280, 256, 256, 1280, 4] + - [99, 77.0] + - - [64, 704, 1, 256, 64, 64, 256, 704] + - [113, 1571.0] + - - [4, 704, 1, 128, 4, 4, 128, 704] + - [135, 105.0] + - - [448, 128, 1, 256, 448, 448, 256, 128] + - [125, 1642.0] + - - [448, 64, 1, 128, 448, 448, 128, 64] + - [94, 626.0] + - - [4, 448, 1, 1280, 4, 4, 1280, 448] + - [99, 132.0] + - - [256, 256, 1, 256, 256, 256, 256, 256] + - [120, 1781.0] + - - [256, 64, 1, 128, 256, 256, 128, 64] + - [119, 394.0] + - - [4, 1024, 1, 3328, 4, 4, 3328, 1024] + - [96, 307.0] + - - [704, 4, 1, 128, 704, 704, 128, 4] + - [141, 63.0] + - - [256, 4, 1, 256, 256, 256, 256, 4] + - [121, 38.0] + - - [256, 4, 1, 3328, 256, 256, 3328, 4] + - [117, 98.0] + - - [4, 256, 1, 256, 4, 4, 256, 256] + - [125, 37.0] + - - [4, 4, 1, 128, 4, 4, 128, 4] + - [114, 0.35] + - - [4, 128, 1, 256, 4, 4, 256, 128] + - [141, 27.0] + - - [64, 64, 1, 1280, 64, 64, 1280, 64] + - [117, 373.0] + - - [448, 128, 1, 3328, 448, 448, 3328, 128] + - [128, 2945.0] + - - [4, 448, 1, 128, 4, 4, 128, 448] + - [100, 68.0] + - - [64, 256, 1, 1280, 64, 64, 1280, 256] + - [108, 1273.0] + - - [4, 128, 1, 3328, 4, 4, 3328, 128] + - [99, 49.0] + - - [64, 4, 1, 128, 64, 64, 128, 4] + - [100, 6.0] + - - [64, 64, 1, 256, 64, 64, 256, 64] + - [125, 166.0] + - - [4, 704, 1, 3328, 4, 4, 3328, 704] + - [96, 216.0] + - - [4, 4, 1, 1280, 4, 4, 1280, 4] + - [90, 1.0] + - - [128, 128, 1, 128, 128, 128, 128, 128] + - [130, 462.0] + - - [1024, 4, 1, 128, 1024, 1024, 128, 4] + - [121, 94.0] + - - [64, 64, 1, 3328, 64, 64, 3328, 64] + - [99, 396.0] + - - [4, 64, 1, 128, 4, 4, 128, 64] + - [97, 6.0] + - - [64, 128, 1, 1280, 64, 64, 1280, 128] + - [134, 607.0] + - - [128, 128, 1, 1280, 128, 128, 1280, 128] + - [108, 1129.0] + - - [128, 256, 1, 256, 128, 128, 256, 256] + - [100, 1070.0] + - - [256, 64, 1, 1280, 256, 256, 1280, 64] + - [108, 1114.0] + - - [1024, 4, 1, 1280, 1024, 1024, 1280, 4] + - [141, 274.0] + - - [704, 64, 1, 256, 704, 704, 256, 64] + - [119, 1317.0] + - - [128, 448, 1, 1280, 128, 128, 1280, 448] + - [112, 2493.0] + - - [128, 64, 1, 3328, 128, 128, 3328, 64] + - [117, 749.0] + - - [448, 64, 1, 256, 448, 448, 256, 64] + - [135, 1014.0] + - - [4, 256, 1, 128, 4, 4, 128, 256] + - [90, 23.0] + - - [1024, 64, 1, 256, 1024, 1024, 256, 64] + - [103, 1773.0] + - - [64, 128, 1, 128, 64, 64, 128, 128] + - [121, 203.0] + - - [4, 4, 1, 3328, 4, 4, 3328, 4] + - [99, 2.0] + - - [4, 1024, 1, 1280, 4, 4, 1280, 1024] + - [96, 269.0] + - - [704, 4, 1, 256, 704, 704, 256, 4] + - [119, 103.0] + - - [128, 4, 1, 3328, 128, 128, 3328, 4] + - [99, 49.0] + - - [448, 4, 1, 3328, 448, 448, 3328, 4] + - [110, 165.0] + - - [704, 4, 1, 3328, 704, 704, 3328, 4] + - [125, 231.0] + - - [448, 128, 1, 1280, 448, 448, 1280, 128] + - [136, 2550.0] + - - [1024, 64, 1, 3328, 1024, 1024, 3328, 64] + - [101, 3017.0] + - - [4, 1024, 1, 128, 4, 4, 128, 1024] + - [118, 118.0] + - - [64, 256, 1, 3328, 64, 64, 3328, 256] + - [132, 1382.0] + - - [448, 128, 1, 128, 448, 448, 128, 128] + - [129, 1451.0] + - - [128, 256, 1, 128, 128, 128, 128, 256] + - [119, 713.0] + - - [128, 4, 1, 256, 128, 128, 256, 4] + - [106, 19.0] + - - [256, 256, 1, 1280, 256, 256, 1280, 256] + - [138, 3007.0] + - - [256, 128, 1, 3328, 256, 256, 3328, 128] + - [114, 1884.0] + - - [448, 4, 1, 128, 448, 448, 128, 4] + - [123, 69.0] + - - [4, 256, 1, 3328, 4, 4, 3328, 256] + - [127, 104.0] + - - [4, 128, 1, 128, 4, 4, 128, 128] + - [90, 11.0] + - - [4, 256, 1, 1280, 4, 4, 1280, 256] + - [134, 91.0] + - - [64, 4, 1, 3328, 64, 64, 3328, 4] + - [99, 24.0] + - - [4, 64, 1, 3328, 4, 4, 3328, 64] + - [117, 26.0] + - - [4, 1024, 1, 256, 4, 4, 256, 1024] + - [125, 144.0] + - - [64, 256, 1, 256, 64, 64, 256, 256] + - [100, 606.0] + - - [4, 64, 1, 256, 4, 4, 256, 64] + - [96, 9.0] + - - [128, 448, 1, 128, 128, 128, 128, 448] + - [139, 1079.0] + - - [64, 448, 1, 128, 64, 64, 128, 448] + - [126, 951.0] + - - [64, 704, 1, 3328, 64, 64, 3328, 704] + - [111, 2187.0] + - - [128, 448, 1, 3328, 128, 128, 3328, 448] + - [128, 2757.0] + - - [4, 448, 1, 256, 4, 4, 256, 448] + - [96, 64.0] + - - [4, 128, 1, 1280, 4, 4, 1280, 128] + - [127, 40.0] + - - [128, 64, 1, 128, 128, 128, 128, 64] + - [121, 197.0] + - - [64, 64, 1, 128, 64, 64, 128, 64] + - [113, 114.0] + - - [64, 4, 1, 1280, 64, 64, 1280, 4] + - [110, 20.0] + - - [256, 64, 1, 3328, 256, 256, 3328, 64] + - [98, 1296.0] + - - [128, 128, 1, 256, 128, 128, 256, 128] + - [98, 604.0] +- null +- null +- DeviceEfficiency +... diff --git a/library/src/blas3/Tensile/Logic/asm_full/navi22_Cijk_Alik_Bjlk_SB_GB.yaml b/library/src/blas3/Tensile/Logic/asm_full/navi22_Cijk_Alik_Bjlk_SB_GB.yaml new file mode 100644 index 000000000..65bf97806 --- /dev/null +++ b/library/src/blas3/Tensile/Logic/asm_full/navi22_Cijk_Alik_Bjlk_SB_GB.yaml @@ -0,0 +1,38963 @@ +--- +- {MinimumRequiredVersion: 4.28.0} +- navi22 +- gfx1031 +- [Device 73df] +- AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] +- - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 64 + LSPA: 64 + LSPB: 8 + LVCA: 2 + LVCB: 16 + LVPA: 16 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 0 + SolutionNameMin: Cijk_Alik_Bjlk_SB_GB_MT128x64x8_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 128 + LSPA: 64 + LSPB: 4 + LVCA: 2 + LVCB: 32 + LVPA: 16 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 1 + SolutionNameMin: Cijk_Alik_Bjlk_SB_GB_MT128x128x8_SN_SU0_SUM0_TT8_16_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 128 + LSPA: 128 + LSPB: 8 + LVCA: 2 + LVCB: 32 + LVPA: 32 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 2 + SolutionNameMin: Cijk_Alik_Bjlk_SB_GB_MT128x128x8_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 32 + LSPB: 4 + LVCA: 4 + LVCB: 32 + LVPA: 8 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 3 + SolutionNameMin: Cijk_Alik_Bjlk_SB_GB_MT128x128x16_SN_SU0_SUM0_TT8_16_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 64 + LSPB: 8 + LVCA: 4 + LVCB: 32 + LVPA: 16 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 4 + SolutionNameMin: Cijk_Alik_Bjlk_SB_GB_MT128x128x16_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 256 + LSPA: 64 + LSPB: 4 + LVCA: 4 + LVCB: 64 + LVPA: 16 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 256 + MacroTileA: 128 + MacroTileB: 256 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 5 + SolutionNameMin: Cijk_Alik_Bjlk_SB_GB_MT128x256x16_SN_SU0_SUM0_TT8_16_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 64 + LSPA: 64 + LSPB: 8 + LVCA: 2 + LVCB: 16 + LVPA: 16 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 6 + SolutionNameMin: Cijk_Alik_Bjlk_SB_GB_MT128x64x8_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 128 + LSPA: 64 + LSPB: 4 + LVCA: 2 + LVCB: 32 + LVPA: 16 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 7 + SolutionNameMin: Cijk_Alik_Bjlk_SB_GB_MT128x128x8_SN_SU32_SUM3_TT8_16_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 128 + LSPA: 128 + LSPB: 8 + LVCA: 2 + LVCB: 32 + LVPA: 32 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 8 + SolutionNameMin: Cijk_Alik_Bjlk_SB_GB_MT128x128x8_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 64 + LSPA: 32 + LSPB: 8 + LVCA: 4 + LVCB: 16 + LVPA: 8 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 9 + SolutionNameMin: Cijk_Alik_Bjlk_SB_GB_MT128x64x16_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 32 + LSPB: 4 + LVCA: 4 + LVCB: 32 + LVPA: 8 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 10 + SolutionNameMin: Cijk_Alik_Bjlk_SB_GB_MT128x128x16_SN_SU32_SUM3_TT8_16_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 64 + LSPB: 8 + LVCA: 4 + LVCB: 32 + LVPA: 16 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 11 + SolutionNameMin: Cijk_Alik_Bjlk_SB_GB_MT128x128x16_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 256 + LSPA: 64 + LSPB: 4 + LVCA: 4 + LVCB: 64 + LVPA: 16 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 256 + MacroTileA: 128 + MacroTileB: 256 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 12 + SolutionNameMin: Cijk_Alik_Bjlk_SB_GB_MT128x256x16_SN_SU32_SUM3_TT8_16_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 64 + LSPA: 64 + LSPB: 8 + LVCA: 2 + LVCB: 16 + LVPA: 16 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 13 + SolutionNameMin: Cijk_Alik_Bjlk_SB_GB_MT128x64x8_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 128 + LSPA: 64 + LSPB: 4 + LVCA: 2 + LVCB: 32 + LVPA: 16 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 14 + SolutionNameMin: Cijk_Alik_Bjlk_SB_GB_MT128x128x8_SN_SU0_SUM0_TT8_16_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 128 + LSPA: 128 + LSPB: 8 + LVCA: 2 + LVCB: 32 + LVPA: 32 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 15 + SolutionNameMin: Cijk_Alik_Bjlk_SB_GB_MT128x128x8_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 64 + LSPA: 32 + LSPB: 8 + LVCA: 4 + LVCB: 16 + LVPA: 8 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 16 + SolutionNameMin: Cijk_Alik_Bjlk_SB_GB_MT128x64x16_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 32 + LSPB: 4 + LVCA: 4 + LVCB: 32 + LVPA: 8 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 17 + SolutionNameMin: Cijk_Alik_Bjlk_SB_GB_MT128x128x16_SN_SU0_SUM0_TT8_16_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 64 + LSPB: 8 + LVCA: 4 + LVCB: 32 + LVPA: 16 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 18 + SolutionNameMin: Cijk_Alik_Bjlk_SB_GB_MT128x128x16_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 256 + LSPA: 64 + LSPB: 4 + LVCA: 4 + LVCB: 64 + LVPA: 16 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 256 + MacroTileA: 128 + MacroTileB: 256 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 19 + SolutionNameMin: Cijk_Alik_Bjlk_SB_GB_MT128x256x16_SN_SU0_SUM0_TT8_16_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 64 + LSPA: 64 + LSPB: 8 + LVCA: 2 + LVCB: 16 + LVPA: 16 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 20 + SolutionNameMin: Cijk_Alik_Bjlk_SB_GB_MT128x64x8_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 128 + LSPA: 64 + LSPB: 4 + LVCA: 2 + LVCB: 32 + LVPA: 16 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 21 + SolutionNameMin: Cijk_Alik_Bjlk_SB_GB_MT128x128x8_SN_SU32_SUM3_TT8_16_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 128 + LSPA: 128 + LSPB: 8 + LVCA: 2 + LVCB: 32 + LVPA: 32 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 22 + SolutionNameMin: Cijk_Alik_Bjlk_SB_GB_MT128x128x8_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 32 + LSPB: 4 + LVCA: 4 + LVCB: 32 + LVPA: 8 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 23 + SolutionNameMin: Cijk_Alik_Bjlk_SB_GB_MT128x128x16_SN_SU32_SUM3_TT8_16_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 64 + LSPB: 8 + LVCA: 4 + LVCB: 32 + LVPA: 16 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 24 + SolutionNameMin: Cijk_Alik_Bjlk_SB_GB_MT128x128x16_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 64 + LSPA: 64 + LSPB: 8 + LVCA: 2 + LVCB: 16 + LVPA: 16 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 25 + SolutionNameMin: Cijk_Alik_Bjlk_SB_GB_MT128x64x8_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 128 + LSPA: 64 + LSPB: 4 + LVCA: 2 + LVCB: 32 + LVPA: 16 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 26 + SolutionNameMin: Cijk_Alik_Bjlk_SB_GB_MT128x128x8_SN_SU0_SUM0_TT8_16_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 128 + LSPA: 128 + LSPB: 8 + LVCA: 2 + LVCB: 32 + LVPA: 32 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 27 + SolutionNameMin: Cijk_Alik_Bjlk_SB_GB_MT128x128x8_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 64 + LSPA: 32 + LSPB: 8 + LVCA: 4 + LVCB: 16 + LVPA: 8 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 28 + SolutionNameMin: Cijk_Alik_Bjlk_SB_GB_MT128x64x16_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 32 + LSPB: 4 + LVCA: 4 + LVCB: 32 + LVPA: 8 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 29 + SolutionNameMin: Cijk_Alik_Bjlk_SB_GB_MT128x128x16_SN_SU0_SUM0_TT8_16_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 64 + LSPB: 8 + LVCA: 4 + LVCB: 32 + LVPA: 16 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 30 + SolutionNameMin: Cijk_Alik_Bjlk_SB_GB_MT128x128x16_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 256 + LSPA: 64 + LSPB: 4 + LVCA: 4 + LVCB: 64 + LVPA: 16 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 256 + MacroTileA: 128 + MacroTileB: 256 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 31 + SolutionNameMin: Cijk_Alik_Bjlk_SB_GB_MT128x256x16_SN_SU0_SUM0_TT8_16_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 64 + LSPA: 64 + LSPB: 8 + LVCA: 2 + LVCB: 16 + LVPA: 16 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 32 + SolutionNameMin: Cijk_Alik_Bjlk_SB_GB_MT128x64x8_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 128 + LSPA: 64 + LSPB: 4 + LVCA: 2 + LVCB: 32 + LVPA: 16 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 33 + SolutionNameMin: Cijk_Alik_Bjlk_SB_GB_MT128x128x8_SN_SU32_SUM3_TT8_16_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 128 + LSPA: 128 + LSPB: 8 + LVCA: 2 + LVCB: 32 + LVPA: 32 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 34 + SolutionNameMin: Cijk_Alik_Bjlk_SB_GB_MT128x128x8_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 32 + LSPB: 4 + LVCA: 4 + LVCB: 32 + LVPA: 8 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 35 + SolutionNameMin: Cijk_Alik_Bjlk_SB_GB_MT128x128x16_SN_SU32_SUM3_TT8_16_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 64 + LSPB: 8 + LVCA: 4 + LVCB: 32 + LVPA: 16 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 36 + SolutionNameMin: Cijk_Alik_Bjlk_SB_GB_MT128x128x16_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 256 + LSPA: 64 + LSPB: 4 + LVCA: 4 + LVCB: 64 + LVPA: 16 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 256 + MacroTileA: 128 + MacroTileB: 256 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 37 + SolutionNameMin: Cijk_Alik_Bjlk_SB_GB_MT128x256x16_SN_SU32_SUM3_TT8_16_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 64 + LSPA: 32 + LSPB: 4 + LVCA: 2 + LVCB: 16 + LVPA: 8 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 38 + SolutionNameMin: Cijk_Alik_Bjlk_SB_GB_MT64x64x8_SN_SU0_SUM0_TT8_8_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 64 + LSPA: 64 + LSPB: 8 + LVCA: 2 + LVCB: 16 + LVPA: 16 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 39 + SolutionNameMin: Cijk_Alik_Bjlk_SB_GB_MT64x64x8_SN_SU0_SUM0_TT4_8_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 64 + LSPA: 64 + LSPB: 8 + LVCA: 2 + LVCB: 16 + LVPA: 16 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 40 + SolutionNameMin: Cijk_Alik_Bjlk_SB_GB_MT128x64x8_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 128 + LSPA: 128 + LSPB: 8 + LVCA: 2 + LVCB: 32 + LVPA: 32 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 41 + SolutionNameMin: Cijk_Alik_Bjlk_SB_GB_MT128x128x8_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 64 + LSPA: 16 + LSPB: 4 + LVCA: 4 + LVCB: 16 + LVPA: 4 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 42 + SolutionNameMin: Cijk_Alik_Bjlk_SB_GB_MT64x64x16_SN_SU0_SUM0_TT8_8_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 64 + LSPA: 32 + LSPB: 8 + LVCA: 4 + LVCB: 16 + LVPA: 8 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 43 + SolutionNameMin: Cijk_Alik_Bjlk_SB_GB_MT128x64x16_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 64 + LSPB: 8 + LVCA: 4 + LVCB: 32 + LVPA: 16 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 44 + SolutionNameMin: Cijk_Alik_Bjlk_SB_GB_MT128x128x16_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 64 + LSPA: 32 + LSPB: 4 + LVCA: 2 + LVCB: 16 + LVPA: 8 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 256 + LdsOffsetB_Blk: 1280 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 45 + SolutionNameMin: Cijk_Alik_Bjlk_SB_GB_MT32x64x8_SN_SU32_SUM3_TT4_8_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 64 + LSPA: 32 + LSPB: 4 + LVCA: 2 + LVCB: 16 + LVPA: 8 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 46 + SolutionNameMin: Cijk_Alik_Bjlk_SB_GB_MT64x64x8_SN_SU32_SUM3_TT8_8_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 64 + LSPA: 64 + LSPB: 8 + LVCA: 2 + LVCB: 16 + LVPA: 16 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 47 + SolutionNameMin: Cijk_Alik_Bjlk_SB_GB_MT64x64x8_SN_SU32_SUM3_TT4_8_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 64 + LSPA: 64 + LSPB: 8 + LVCA: 2 + LVCB: 16 + LVPA: 16 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 48 + SolutionNameMin: Cijk_Alik_Bjlk_SB_GB_MT128x64x8_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 128 + LSPA: 64 + LSPB: 8 + LVCA: 4 + LVCB: 32 + LVPA: 32 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 49 + SolutionNameMin: Cijk_Alik_Bjlk_SB_GB_MT64x128x8_SN_SU32_SUM3_TT4_8_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 128 + LSPA: 128 + LSPB: 8 + LVCA: 2 + LVCB: 32 + LVPA: 32 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 50 + SolutionNameMin: Cijk_Alik_Bjlk_SB_GB_MT128x128x8_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 64 + LSPA: 16 + LSPB: 4 + LVCA: 4 + LVCB: 16 + LVPA: 4 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 51 + SolutionNameMin: Cijk_Alik_Bjlk_SB_GB_MT64x64x16_SN_SU32_SUM3_TT8_8_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 64 + LSPA: 32 + LSPB: 8 + LVCA: 4 + LVCB: 16 + LVPA: 8 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 52 + SolutionNameMin: Cijk_Alik_Bjlk_SB_GB_MT64x64x16_SN_SU32_SUM3_TT4_8_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 64 + LSPA: 32 + LSPB: 8 + LVCA: 4 + LVCB: 16 + LVPA: 8 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 53 + SolutionNameMin: Cijk_Alik_Bjlk_SB_GB_MT128x64x16_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 64 + LSPB: 8 + LVCA: 4 + LVCB: 32 + LVPA: 16 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 54 + SolutionNameMin: Cijk_Alik_Bjlk_SB_GB_MT128x128x16_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 64 + LSPA: 32 + LSPB: 4 + LVCA: 2 + LVCB: 16 + LVPA: 8 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 55 + SolutionNameMin: Cijk_Alik_Bjlk_SB_GB_MT64x64x8_SN_SU0_SUM0_TT8_8_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 64 + LSPA: 64 + LSPB: 8 + LVCA: 2 + LVCB: 16 + LVPA: 16 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 56 + SolutionNameMin: Cijk_Alik_Bjlk_SB_GB_MT64x64x8_SN_SU0_SUM0_TT4_8_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 64 + LSPA: 64 + LSPB: 8 + LVCA: 2 + LVCB: 16 + LVPA: 16 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 57 + SolutionNameMin: Cijk_Alik_Bjlk_SB_GB_MT128x64x8_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 128 + LSPA: 128 + LSPB: 8 + LVCA: 2 + LVCB: 32 + LVPA: 32 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 58 + SolutionNameMin: Cijk_Alik_Bjlk_SB_GB_MT128x128x8_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 64 + LSPA: 16 + LSPB: 4 + LVCA: 4 + LVCB: 16 + LVPA: 4 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 59 + SolutionNameMin: Cijk_Alik_Bjlk_SB_GB_MT64x64x16_SN_SU0_SUM0_TT8_8_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 64 + LSPA: 32 + LSPB: 8 + LVCA: 4 + LVCB: 16 + LVPA: 8 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 60 + SolutionNameMin: Cijk_Alik_Bjlk_SB_GB_MT128x64x16_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 64 + LSPB: 8 + LVCA: 4 + LVCB: 32 + LVPA: 16 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 61 + SolutionNameMin: Cijk_Alik_Bjlk_SB_GB_MT128x128x16_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 64 + LSPA: 32 + LSPB: 4 + LVCA: 2 + LVCB: 16 + LVPA: 8 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 62 + SolutionNameMin: Cijk_Alik_Bjlk_SB_GB_MT64x64x8_SN_SU32_SUM3_TT8_8_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 64 + LSPA: 64 + LSPB: 8 + LVCA: 2 + LVCB: 16 + LVPA: 16 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 63 + SolutionNameMin: Cijk_Alik_Bjlk_SB_GB_MT64x64x8_SN_SU32_SUM3_TT4_8_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 64 + LSPA: 64 + LSPB: 8 + LVCA: 2 + LVCB: 16 + LVPA: 16 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 64 + SolutionNameMin: Cijk_Alik_Bjlk_SB_GB_MT128x64x8_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 128 + LSPA: 128 + LSPB: 8 + LVCA: 2 + LVCB: 32 + LVPA: 32 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 65 + SolutionNameMin: Cijk_Alik_Bjlk_SB_GB_MT128x128x8_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 64 + LSPA: 16 + LSPB: 4 + LVCA: 4 + LVCB: 16 + LVPA: 4 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 66 + SolutionNameMin: Cijk_Alik_Bjlk_SB_GB_MT64x64x16_SN_SU32_SUM3_TT8_8_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 64 + LSPA: 32 + LSPB: 8 + LVCA: 4 + LVCB: 16 + LVPA: 8 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 67 + SolutionNameMin: Cijk_Alik_Bjlk_SB_GB_MT128x64x16_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 64 + LSPB: 8 + LVCA: 4 + LVCB: 32 + LVPA: 16 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 68 + SolutionNameMin: Cijk_Alik_Bjlk_SB_GB_MT128x128x16_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 64 + LSPA: 32 + LSPB: 4 + LVCA: 2 + LVCB: 16 + LVPA: 8 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 69 + SolutionNameMin: Cijk_Alik_Bjlk_SB_GB_MT64x64x8_SN_SU0_SUM0_TT8_8_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 64 + LSPA: 64 + LSPB: 8 + LVCA: 2 + LVCB: 16 + LVPA: 16 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 70 + SolutionNameMin: Cijk_Alik_Bjlk_SB_GB_MT64x64x8_SN_SU0_SUM0_TT4_8_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 64 + LSPA: 64 + LSPB: 8 + LVCA: 2 + LVCB: 16 + LVPA: 16 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 71 + SolutionNameMin: Cijk_Alik_Bjlk_SB_GB_MT128x64x8_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 128 + LSPA: 128 + LSPB: 8 + LVCA: 2 + LVCB: 32 + LVPA: 32 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 72 + SolutionNameMin: Cijk_Alik_Bjlk_SB_GB_MT128x128x8_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 64 + LSPA: 16 + LSPB: 4 + LVCA: 4 + LVCB: 16 + LVPA: 4 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 73 + SolutionNameMin: Cijk_Alik_Bjlk_SB_GB_MT64x64x16_SN_SU0_SUM0_TT8_8_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 64 + LSPA: 32 + LSPB: 8 + LVCA: 4 + LVCB: 16 + LVPA: 8 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 74 + SolutionNameMin: Cijk_Alik_Bjlk_SB_GB_MT128x64x16_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 32 + LSPA: 32 + LSPB: 8 + LVCA: 2 + LVCB: 8 + LVPA: 8 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 75 + SolutionNameMin: Cijk_Alik_Bjlk_SB_GB_MT32x32x8_SN_SU32_SUM3_TT4_4_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 64 + LSPA: 32 + LSPB: 4 + LVCA: 2 + LVCB: 16 + LVPA: 8 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 76 + SolutionNameMin: Cijk_Alik_Bjlk_SB_GB_MT64x64x8_SN_SU32_SUM3_TT8_8_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 64 + LSPA: 64 + LSPB: 8 + LVCA: 2 + LVCB: 16 + LVPA: 16 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 77 + SolutionNameMin: Cijk_Alik_Bjlk_SB_GB_MT128x64x8_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 128 + LSPA: 64 + LSPB: 8 + LVCA: 4 + LVCB: 32 + LVPA: 32 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 78 + SolutionNameMin: Cijk_Alik_Bjlk_SB_GB_MT64x128x8_SN_SU32_SUM3_TT4_8_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 128 + LSPA: 128 + LSPB: 8 + LVCA: 2 + LVCB: 32 + LVPA: 32 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 79 + SolutionNameMin: Cijk_Alik_Bjlk_SB_GB_MT128x128x8_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 64 + LSPA: 32 + LSPB: 8 + LVCA: 4 + LVCB: 16 + LVPA: 8 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 80 + SolutionNameMin: Cijk_Alik_Bjlk_SB_GB_MT64x64x16_SN_SU32_SUM3_TT4_8_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 64 + LSPA: 32 + LSPB: 8 + LVCA: 4 + LVCB: 16 + LVPA: 8 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 81 + SolutionNameMin: Cijk_Alik_Bjlk_SB_GB_MT128x64x16_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 64 + LSPB: 8 + LVCA: 4 + LVCB: 32 + LVPA: 16 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 82 + SolutionNameMin: Cijk_Alik_Bjlk_SB_GB_MT64x128x16_SN_SU32_SUM3_TT4_8_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 64 + LSPB: 8 + LVCA: 4 + LVCB: 32 + LVPA: 16 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 83 + SolutionNameMin: Cijk_Alik_Bjlk_SB_GB_MT128x128x16_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 16 + LSPA: 8 + LSPB: 4 + LVCA: 8 + LVCB: 16 + LVPA: 8 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 512 + LdsNumElementsAlignedA: 128 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 256 + LdsOffsetB: 128 + LdsOffsetB_Blk: 384 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 84 + SolutionNameMin: Cijk_Alik_Bjlk_SB_GB_MT16x16x8_SN_SU0_SUM0_TT2_2_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 32 + LSPA: 8 + LSPB: 2 + LVCA: 8 + LVCB: 32 + LVPA: 8 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 85 + SolutionNameMin: Cijk_Alik_Bjlk_SB_GB_MT32x32x8_SN_SU0_SUM0_TT4_4_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 32 + LSPA: 16 + LSPB: 4 + LVCA: 8 + LVCB: 32 + LVPA: 16 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 86 + SolutionNameMin: Cijk_Alik_Bjlk_SB_GB_MT64x32x8_SN_SU0_SUM0_TT4_4_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 64 + LSPA: 32 + LSPB: 4 + LVCA: 8 + LVCB: 64 + LVPA: 32 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 87 + SolutionNameMin: Cijk_Alik_Bjlk_SB_GB_MT64x64x8_SN_SU0_SUM0_TT4_4_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 88 + SolutionNameMin: Cijk_Alik_Bjlk_SB_GB_MT16x16x16_SN_SU0_SUM0_TT2_2_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 32 + LSPA: 8 + LSPB: 4 + LVCA: 16 + LVCB: 32 + LVPA: 8 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 89 + SolutionNameMin: Cijk_Alik_Bjlk_SB_GB_MT64x32x16_SN_SU0_SUM0_TT4_4_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 32 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 + LVPA: 16 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 90 + SolutionNameMin: Cijk_Alik_Bjlk_SB_GB_MT32x32x16_SN_SU0_SUM0_TT2_2_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 2 + LSPB: 4 + LVCA: 32 + LVCB: 16 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 91 + SolutionNameMin: Cijk_Alik_Bjlk_SB_GB_MT16x16x32_SN_SU0_SUM0_TT2_2_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 16 + LSPA: 8 + LSPB: 4 + LVCA: 8 + LVCB: 16 + LVPA: 8 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 512 + LdsNumElementsAlignedA: 128 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 256 + LdsOffsetB: 128 + LdsOffsetB_Blk: 384 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 92 + SolutionNameMin: Cijk_Alik_Bjlk_SB_GB_MT16x16x8_SN_SU32_SUM3_TT2_2_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 32 + LSPA: 8 + LSPB: 2 + LVCA: 8 + LVCB: 32 + LVPA: 8 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 93 + SolutionNameMin: Cijk_Alik_Bjlk_SB_GB_MT32x32x8_SN_SU32_SUM3_TT4_4_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 16 + LSPA: 16 + LSPB: 8 + LVCA: 8 + LVCB: 16 + LVPA: 16 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 896 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 94 + SolutionNameMin: Cijk_Alik_Bjlk_SB_GB_MT32x16x8_SN_SU32_SUM3_TT2_2_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 32 + LSPA: 16 + LSPB: 4 + LVCA: 8 + LVCB: 32 + LVPA: 16 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 95 + SolutionNameMin: Cijk_Alik_Bjlk_SB_GB_MT64x32x8_SN_SU32_SUM3_TT4_4_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 32 + LSPA: 32 + LSPB: 8 + LVCA: 8 + LVCB: 32 + LVPA: 32 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 96 + SolutionNameMin: Cijk_Alik_Bjlk_SB_GB_MT32x32x8_SN_SU32_SUM3_TT2_2_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 64 + LSPA: 32 + LSPB: 4 + LVCA: 8 + LVCB: 64 + LVPA: 32 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 97 + SolutionNameMin: Cijk_Alik_Bjlk_SB_GB_MT64x64x8_SN_SU32_SUM3_TT4_4_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 98 + SolutionNameMin: Cijk_Alik_Bjlk_SB_GB_MT16x16x16_SN_SU32_SUM3_TT2_2_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 32 + LSPA: 4 + LSPB: 2 + LVCA: 16 + LVCB: 32 + LVPA: 4 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 99 + SolutionNameMin: Cijk_Alik_Bjlk_SB_GB_MT32x32x16_SN_SU32_SUM3_TT4_4_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 100 + SolutionNameMin: Cijk_Alik_Bjlk_SB_GB_MT32x16x16_SN_SU32_SUM3_TT2_2_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 2 + LSPB: 4 + LVCA: 32 + LVCB: 16 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 101 + SolutionNameMin: Cijk_Alik_Bjlk_SB_GB_MT16x16x32_SN_SU32_SUM3_TT2_2_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 102 + SolutionNameMin: Cijk_Alik_Bjlk_SB_GB_MT32x16x32_SN_SU32_SUM3_TT2_2_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 32 + LSPA: 8 + LSPB: 2 + LVCA: 8 + LVCB: 32 + LVPA: 8 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 103 + SolutionNameMin: Cijk_Alik_Bjlk_SB_GB_MT32x32x8_SN_SU0_SUM0_TT4_4_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 32 + LSPA: 16 + LSPB: 4 + LVCA: 8 + LVCB: 32 + LVPA: 16 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 104 + SolutionNameMin: Cijk_Alik_Bjlk_SB_GB_MT64x32x8_SN_SU0_SUM0_TT4_4_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 32 + LSPA: 32 + LSPB: 8 + LVCA: 8 + LVCB: 32 + LVPA: 32 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 105 + SolutionNameMin: Cijk_Alik_Bjlk_SB_GB_MT32x32x8_SN_SU0_SUM0_TT2_2_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 64 + LSPA: 32 + LSPB: 4 + LVCA: 8 + LVCB: 64 + LVPA: 32 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 106 + SolutionNameMin: Cijk_Alik_Bjlk_SB_GB_MT64x64x8_SN_SU0_SUM0_TT4_4_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 32 + LSPA: 4 + LSPB: 2 + LVCA: 16 + LVCB: 32 + LVPA: 4 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 107 + SolutionNameMin: Cijk_Alik_Bjlk_SB_GB_MT32x32x16_SN_SU0_SUM0_TT4_4_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 108 + SolutionNameMin: Cijk_Alik_Bjlk_SB_GB_MT32x16x16_SN_SU0_SUM0_TT2_2_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 32 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 + LVPA: 16 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 109 + SolutionNameMin: Cijk_Alik_Bjlk_SB_GB_MT32x32x16_SN_SU0_SUM0_TT2_2_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 2 + LSPB: 4 + LVCA: 32 + LVCB: 16 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 110 + SolutionNameMin: Cijk_Alik_Bjlk_SB_GB_MT16x16x32_SN_SU0_SUM0_TT2_2_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 16 + LSPA: 8 + LSPB: 4 + LVCA: 8 + LVCB: 16 + LVPA: 8 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 512 + LdsNumElementsAlignedA: 128 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 256 + LdsOffsetB: 128 + LdsOffsetB_Blk: 384 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 111 + SolutionNameMin: Cijk_Alik_Bjlk_SB_GB_MT16x16x8_SN_SU32_SUM3_TT2_2_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 32 + LSPA: 8 + LSPB: 2 + LVCA: 8 + LVCB: 32 + LVPA: 8 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 112 + SolutionNameMin: Cijk_Alik_Bjlk_SB_GB_MT32x32x8_SN_SU32_SUM3_TT4_4_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 16 + LSPA: 16 + LSPB: 8 + LVCA: 8 + LVCB: 16 + LVPA: 16 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 896 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 113 + SolutionNameMin: Cijk_Alik_Bjlk_SB_GB_MT32x16x8_SN_SU32_SUM3_TT2_2_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 32 + LSPA: 16 + LSPB: 4 + LVCA: 8 + LVCB: 32 + LVPA: 16 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 114 + SolutionNameMin: Cijk_Alik_Bjlk_SB_GB_MT64x32x8_SN_SU32_SUM3_TT4_4_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 64 + LSPA: 32 + LSPB: 4 + LVCA: 8 + LVCB: 64 + LVPA: 32 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 115 + SolutionNameMin: Cijk_Alik_Bjlk_SB_GB_MT64x64x8_SN_SU32_SUM3_TT4_4_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 116 + SolutionNameMin: Cijk_Alik_Bjlk_SB_GB_MT16x16x16_SN_SU32_SUM3_TT2_2_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 32 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 + LVPA: 16 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 117 + SolutionNameMin: Cijk_Alik_Bjlk_SB_GB_MT32x32x16_SN_SU32_SUM3_TT2_2_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 16 + LSPA: 8 + LSPB: 4 + LVCA: 8 + LVCB: 16 + LVPA: 8 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 512 + LdsNumElementsAlignedA: 128 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 256 + LdsOffsetB: 128 + LdsOffsetB_Blk: 384 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 118 + SolutionNameMin: Cijk_Alik_Bjlk_SB_GB_MT16x16x8_SN_SU0_SUM0_TT2_2_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 32 + LSPA: 8 + LSPB: 2 + LVCA: 8 + LVCB: 32 + LVPA: 8 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 119 + SolutionNameMin: Cijk_Alik_Bjlk_SB_GB_MT32x32x8_SN_SU0_SUM0_TT4_4_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 32 + LSPA: 16 + LSPB: 4 + LVCA: 8 + LVCB: 32 + LVPA: 16 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 120 + SolutionNameMin: Cijk_Alik_Bjlk_SB_GB_MT64x32x8_SN_SU0_SUM0_TT4_4_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 32 + LSPA: 32 + LSPB: 8 + LVCA: 8 + LVCB: 32 + LVPA: 32 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 121 + SolutionNameMin: Cijk_Alik_Bjlk_SB_GB_MT32x32x8_SN_SU0_SUM0_TT2_2_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 64 + LSPA: 32 + LSPB: 4 + LVCA: 8 + LVCB: 64 + LVPA: 32 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 122 + SolutionNameMin: Cijk_Alik_Bjlk_SB_GB_MT64x64x8_SN_SU0_SUM0_TT4_4_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 123 + SolutionNameMin: Cijk_Alik_Bjlk_SB_GB_MT16x16x16_SN_SU0_SUM0_TT2_2_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 32 + LSPA: 4 + LSPB: 2 + LVCA: 16 + LVCB: 32 + LVPA: 4 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 124 + SolutionNameMin: Cijk_Alik_Bjlk_SB_GB_MT32x32x16_SN_SU0_SUM0_TT4_4_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 32 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 + LVPA: 16 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 125 + SolutionNameMin: Cijk_Alik_Bjlk_SB_GB_MT32x32x16_SN_SU0_SUM0_TT2_2_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 64 + LSPA: 16 + LSPB: 4 + LVCA: 16 + LVCB: 64 + LVPA: 16 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 126 + SolutionNameMin: Cijk_Alik_Bjlk_SB_GB_MT64x64x16_SN_SU0_SUM0_TT4_4_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 2 + LSPB: 4 + LVCA: 32 + LVCB: 16 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 127 + SolutionNameMin: Cijk_Alik_Bjlk_SB_GB_MT16x16x32_SN_SU0_SUM0_TT2_2_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 16 + LSPA: 8 + LSPB: 4 + LVCA: 8 + LVCB: 16 + LVPA: 8 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 512 + LdsNumElementsAlignedA: 128 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 256 + LdsOffsetB: 128 + LdsOffsetB_Blk: 384 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 128 + SolutionNameMin: Cijk_Alik_Bjlk_SB_GB_MT16x16x8_SN_SU32_SUM3_TT2_2_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 32 + LSPA: 8 + LSPB: 2 + LVCA: 8 + LVCB: 32 + LVPA: 8 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 129 + SolutionNameMin: Cijk_Alik_Bjlk_SB_GB_MT32x32x8_SN_SU32_SUM3_TT4_4_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 16 + LSPA: 16 + LSPB: 8 + LVCA: 8 + LVCB: 16 + LVPA: 16 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 896 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 130 + SolutionNameMin: Cijk_Alik_Bjlk_SB_GB_MT32x16x8_SN_SU32_SUM3_TT2_2_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 32 + LSPA: 16 + LSPB: 4 + LVCA: 8 + LVCB: 32 + LVPA: 16 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 131 + SolutionNameMin: Cijk_Alik_Bjlk_SB_GB_MT64x32x8_SN_SU32_SUM3_TT4_4_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 32 + LSPA: 32 + LSPB: 8 + LVCA: 8 + LVCB: 32 + LVPA: 32 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 132 + SolutionNameMin: Cijk_Alik_Bjlk_SB_GB_MT32x32x8_SN_SU32_SUM3_TT2_2_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 64 + LSPA: 32 + LSPB: 4 + LVCA: 8 + LVCB: 64 + LVPA: 32 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 133 + SolutionNameMin: Cijk_Alik_Bjlk_SB_GB_MT64x64x8_SN_SU32_SUM3_TT4_4_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 134 + SolutionNameMin: Cijk_Alik_Bjlk_SB_GB_MT16x16x16_SN_SU32_SUM3_TT2_2_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 32 + LSPA: 8 + LSPB: 4 + LVCA: 16 + LVCB: 32 + LVPA: 8 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 135 + SolutionNameMin: Cijk_Alik_Bjlk_SB_GB_MT64x32x16_SN_SU32_SUM3_TT4_4_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 32 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 + LVPA: 16 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 136 + SolutionNameMin: Cijk_Alik_Bjlk_SB_GB_MT32x32x16_SN_SU32_SUM3_TT2_2_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 2 + LSPB: 4 + LVCA: 32 + LVCB: 16 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 137 + SolutionNameMin: Cijk_Alik_Bjlk_SB_GB_MT16x16x32_SN_SU32_SUM3_TT2_2_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 138 + SolutionNameMin: Cijk_Alik_Bjlk_SB_GB_MT32x16x32_SN_SU32_SUM3_TT2_2_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 832 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 64 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 139 + SolutionNameMin: Cijk_Alik_Bjlk_SB_GB_MT32x8x8_SN_SU32_SUM3_TT2_2_WG16_4_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1600 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 64 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 8 + MacroTileA: 64 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 8 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 140 + SolutionNameMin: Cijk_Alik_Bjlk_SB_GB_MT64x8x8_SN_SU32_SUM3_TT4_2_WG16_4_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 4 + LSPA: 4 + LSPB: 16 + LVCA: 16 + LVCB: 4 + LVPA: 4 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3136 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 64 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 4 + MacroTileA: 64 + MacroTileB: 4 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 16 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 141 + SolutionNameMin: Cijk_Alik_Bjlk_SB_GB_MT64x4x16_SN_SU32_SUM3_TT4_1_WG16_4_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 1] + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 832 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 64 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 142 + SolutionNameMin: Cijk_Alik_Bjlk_SB_GB_MT32x8x8_SN_SU32_SUM3_TT2_2_WG16_4_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1600 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 64 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 8 + MacroTileA: 64 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 8 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 143 + SolutionNameMin: Cijk_Alik_Bjlk_SB_GB_MT64x8x8_SN_SU32_SUM3_TT4_2_WG16_4_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 4 + LSPA: 4 + LSPB: 16 + LVCA: 16 + LVCB: 4 + LVPA: 4 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3136 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 64 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 4 + MacroTileA: 64 + MacroTileB: 4 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 16 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 144 + SolutionNameMin: Cijk_Alik_Bjlk_SB_GB_MT64x4x16_SN_SU32_SUM3_TT4_1_WG16_4_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 1] + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 8 + LSPA: 8 + LSPB: 16 + LVCA: 16 + LVCB: 8 + LVPA: 8 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3200 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 8 + MacroTileA: 64 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 145 + SolutionNameMin: Cijk_Alik_Bjlk_SB_GB_MT64x8x16_SN_SU32_SUM3_TT4_1_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 1] + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 832 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 64 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 146 + SolutionNameMin: Cijk_Alik_Bjlk_SB_GB_MT32x8x8_SN_SU0_SUM0_TT2_2_WG16_4_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 832 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 64 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 147 + SolutionNameMin: Cijk_Alik_Bjlk_SB_GB_MT32x8x8_SN_SU32_SUM3_TT2_2_WG16_4_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 8 + LSPA: 4 + LSPB: 8 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3200 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 8 + MacroTileA: 64 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 16 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 148 + SolutionNameMin: Cijk_Alik_Bjlk_SB_GB_MT64x8x16_SN_SU32_SUM3_TT4_2_WG16_4_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 32 + LSPA: 8 + LSPB: 2 + LVCA: 8 + LVCB: 32 + LVPA: 8 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 832 + LdsNumElementsAlignedA: 64 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 64 + LdsOffsetB_Blk: 576 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 8 + MacroTile1: 32 + MacroTileA: 8 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 149 + SolutionNameMin: Cijk_Alik_Bjlk_SB_GB_MT8x32x8_SN_SU0_SUM0_TT1_4_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [1, 4] + ThreadTile0: 1 + ThreadTile1: 4 + ThreadTileA: 1 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 32 + LSPA: 4 + LSPB: 2 + LVCA: 16 + LVCB: 32 + LVPA: 4 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1664 + LdsNumElementsAlignedA: 128 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 128 + LdsOffsetB_Blk: 1152 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 8 + MacroTile1: 32 + MacroTileA: 8 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 8 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 150 + SolutionNameMin: Cijk_Alik_Bjlk_SB_GB_MT8x32x16_SN_SU0_SUM0_TT1_4_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [1, 4] + ThreadTile0: 1 + ThreadTile1: 4 + ThreadTileA: 1 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 2 + LSPB: 2 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3328 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 256 + LdsOffsetB_Blk: 2304 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 8 + MacroTile1: 32 + MacroTileA: 8 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 16 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 16 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 151 + SolutionNameMin: Cijk_Alik_Bjlk_SB_GB_MT8x32x32_SN_SU0_SUM0_TT1_4_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [1, 4] + ThreadTile0: 1 + ThreadTile1: 4 + ThreadTileA: 1 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 32 + LSPA: 4 + LSPB: 2 + LVCA: 16 + LVCB: 32 + LVPA: 4 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1664 + LdsNumElementsAlignedA: 128 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 128 + LdsOffsetB_Blk: 1152 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 8 + MacroTile1: 32 + MacroTileA: 8 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 8 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 152 + SolutionNameMin: Cijk_Alik_Bjlk_SB_GB_MT8x32x16_SN_SU32_SUM3_TT1_4_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [1, 4] + ThreadTile0: 1 + ThreadTile1: 4 + ThreadTileA: 1 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 32 + LSPA: 8 + LSPB: 2 + LVCA: 8 + LVCB: 32 + LVPA: 8 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 832 + LdsNumElementsAlignedA: 64 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 64 + LdsOffsetB_Blk: 576 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 8 + MacroTile1: 32 + MacroTileA: 8 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 153 + SolutionNameMin: Cijk_Alik_Bjlk_SB_GB_MT8x32x8_SN_SU0_SUM0_TT1_4_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [1, 4] + ThreadTile0: 1 + ThreadTile1: 4 + ThreadTileA: 1 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 32 + LSPA: 4 + LSPB: 2 + LVCA: 16 + LVCB: 32 + LVPA: 4 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1664 + LdsNumElementsAlignedA: 128 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 128 + LdsOffsetB_Blk: 1152 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 8 + MacroTile1: 32 + MacroTileA: 8 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 8 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 154 + SolutionNameMin: Cijk_Alik_Bjlk_SB_GB_MT8x32x16_SN_SU0_SUM0_TT1_4_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [1, 4] + ThreadTile0: 1 + ThreadTile1: 4 + ThreadTileA: 1 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 2 + LSPB: 2 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3328 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 256 + LdsOffsetB_Blk: 2304 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 8 + MacroTile1: 32 + MacroTileA: 8 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 16 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 16 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 155 + SolutionNameMin: Cijk_Alik_Bjlk_SB_GB_MT8x32x32_SN_SU0_SUM0_TT1_4_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [1, 4] + ThreadTile0: 1 + ThreadTile1: 4 + ThreadTileA: 1 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 32 + LSPA: 8 + LSPB: 2 + LVCA: 8 + LVCB: 32 + LVPA: 8 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 832 + LdsNumElementsAlignedA: 64 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 64 + LdsOffsetB_Blk: 576 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 8 + MacroTile1: 32 + MacroTileA: 8 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 156 + SolutionNameMin: Cijk_Alik_Bjlk_SB_GB_MT8x32x8_SN_SU32_SUM3_TT1_4_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [1, 4] + ThreadTile0: 1 + ThreadTile1: 4 + ThreadTileA: 1 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 32 + LSPA: 4 + LSPB: 2 + LVCA: 16 + LVCB: 32 + LVPA: 4 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1664 + LdsNumElementsAlignedA: 128 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 128 + LdsOffsetB_Blk: 1152 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 8 + MacroTile1: 32 + MacroTileA: 8 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 8 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 157 + SolutionNameMin: Cijk_Alik_Bjlk_SB_GB_MT8x32x16_SN_SU32_SUM3_TT1_4_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [1, 4] + ThreadTile0: 1 + ThreadTile1: 4 + ThreadTileA: 1 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 2 + LSPB: 2 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3328 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 256 + LdsOffsetB_Blk: 2304 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 8 + MacroTile1: 32 + MacroTileA: 8 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 16 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 16 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 158 + SolutionNameMin: Cijk_Alik_Bjlk_SB_GB_MT8x32x32_SN_SU32_SUM3_TT1_4_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [1, 4] + ThreadTile0: 1 + ThreadTile1: 4 + ThreadTileA: 1 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 2 + LSPB: 2 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 32 + MacroTileA: 16 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 8 + NumLoadsB: 16 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 16 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 159 + SolutionNameMin: Cijk_Alik_Bjlk_SB_GB_MT16x32x32_SN_SU32_SUM3_TT2_4_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 32 + LSPA: 8 + LSPB: 2 + LVCA: 8 + LVCB: 32 + LVPA: 8 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 832 + LdsNumElementsAlignedA: 64 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 64 + LdsOffsetB_Blk: 576 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 8 + MacroTile1: 32 + MacroTileA: 8 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 160 + SolutionNameMin: Cijk_Alik_Bjlk_SB_GB_MT8x32x8_SN_SU0_SUM0_TT1_4_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [1, 4] + ThreadTile0: 1 + ThreadTile1: 4 + ThreadTileA: 1 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 32 + LSPA: 4 + LSPB: 2 + LVCA: 16 + LVCB: 32 + LVPA: 4 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1664 + LdsNumElementsAlignedA: 128 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 128 + LdsOffsetB_Blk: 1152 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 8 + MacroTile1: 32 + MacroTileA: 8 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 8 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 161 + SolutionNameMin: Cijk_Alik_Bjlk_SB_GB_MT8x32x16_SN_SU0_SUM0_TT1_4_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [1, 4] + ThreadTile0: 1 + ThreadTile1: 4 + ThreadTileA: 1 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 32 + LSPA: 8 + LSPB: 2 + LVCA: 8 + LVCB: 32 + LVPA: 8 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 832 + LdsNumElementsAlignedA: 64 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 64 + LdsOffsetB_Blk: 576 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 8 + MacroTile1: 32 + MacroTileA: 8 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 162 + SolutionNameMin: Cijk_Alik_Bjlk_SB_GB_MT8x32x8_SN_SU32_SUM3_TT1_4_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [1, 4] + ThreadTile0: 1 + ThreadTile1: 4 + ThreadTileA: 1 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 32 + LSPA: 4 + LSPB: 2 + LVCA: 16 + LVCB: 32 + LVPA: 4 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1664 + LdsNumElementsAlignedA: 128 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 128 + LdsOffsetB_Blk: 1152 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 8 + MacroTile1: 32 + MacroTileA: 8 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 8 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 163 + SolutionNameMin: Cijk_Alik_Bjlk_SB_GB_MT8x32x16_SN_SU32_SUM3_TT1_4_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [1, 4] + ThreadTile0: 1 + ThreadTile1: 4 + ThreadTileA: 1 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 64 + LSPA: 8 + LSPB: 4 + LVCA: 32 + LVCB: 64 + LVPA: 8 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 6656 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 512 + LdsOffsetB_Blk: 4608 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 64 + MacroTileA: 16 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 164 + SolutionNameMin: Cijk_Alik_Bjlk_SB_GB_MT16x64x32_SN_SU32_SUM3_TT1_4_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [1, 4] + ThreadTile0: 1 + ThreadTile1: 4 + ThreadTileA: 1 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 +- [2, 3, 0, 1] +- - - [2944, 4288, 1, 1280, 2944, 2944, 1280, 4288] + - [17, 12089.0] + - - [2368, 5888, 1, 256, 2368, 2368, 256, 5888] + - [17, 11599.0] + - - [5888, 1024, 1, 1280, 5888, 5888, 1280, 1024] + - [17, 12009.0] + - - [5888, 1856, 1, 3328, 5888, 5888, 3328, 1856] + - [17, 11974.0] + - - [5056, 704, 1, 256, 5056, 5056, 256, 704] + - [32, 10089.0] + - - [5888, 2944, 1, 3328, 5888, 5888, 3328, 2944] + - [29, 12558.0] + - - [1856, 4288, 1, 256, 1856, 1856, 256, 4288] + - [29, 10949.0] + - - [1024, 5056, 1, 128, 1024, 1024, 128, 5056] + - [6, 10430.0] + - - [5056, 5056, 1, 3328, 5056, 5056, 3328, 5056] + - [17, 12301.0] + - - [1408, 5888, 1, 1280, 1408, 1408, 1280, 5888] + - [35, 12114.0] + - - [1024, 3584, 1, 3328, 1024, 1024, 3328, 3584] + - [5, 11605.0] + - - [5888, 1408, 1, 1280, 5888, 5888, 1280, 1408] + - [29, 12157.0] + - - [1024, 2368, 1, 256, 1024, 1024, 256, 2368] + - [24, 9856.0] + - - [1408, 1856, 1, 1280, 1408, 1408, 1280, 1856] + - [29, 10733.0] + - - [5056, 5056, 1, 1280, 5056, 5056, 1280, 5056] + - [17, 12257.0] + - - [448, 5056, 1, 256, 448, 448, 256, 5056] + - [15, 9302.0] + - - [1856, 1408, 1, 128, 1856, 1856, 128, 1408] + - [20, 9460.0] + - - [6784, 256, 1, 3328, 6784, 6784, 3328, 256] + - [10, 10607.0] + - - [6784, 4288, 1, 3328, 6784, 6784, 3328, 4288] + - [10, 12400.0] + - - [4288, 448, 1, 256, 4288, 4288, 256, 448] + - [34, 8801.0] + - - [1856, 2368, 1, 3328, 1856, 1856, 3328, 2368] + - [3, 11142.0] + - - [4288, 2944, 1, 1280, 4288, 4288, 1280, 2944] + - [17, 12107.0] + - - [704, 5056, 1, 1280, 704, 704, 1280, 5056] + - [3, 11008.0] + - - [2368, 704, 1, 3328, 2368, 2368, 3328, 704] + - [3, 10179.0] + - - [256, 5888, 1, 256, 256, 256, 256, 5888] + - [29, 9214.0] + - - [1856, 4288, 1, 3328, 1856, 1856, 3328, 4288] + - [29, 11729.0] + - - [5888, 1024, 1, 256, 5888, 5888, 256, 1024] + - [26, 11124.0] + - - [448, 5056, 1, 3328, 448, 448, 3328, 5056] + - [19, 10565.0] + - - [1408, 2944, 1, 256, 1408, 1408, 256, 2944] + - [29, 11357.0] + - - [6784, 5056, 1, 3328, 6784, 6784, 3328, 5056] + - [17, 12470.0] + - - [5056, 5056, 1, 256, 5056, 5056, 256, 5056] + - [17, 11915.0] + - - [1408, 6784, 1, 128, 1408, 1408, 128, 6784] + - [26, 11294.0] + - - [704, 5056, 1, 128, 704, 704, 128, 5056] + - [25, 9628.0] + - - [2368, 2944, 1, 1280, 2368, 2368, 1280, 2944] + - [10, 12004.0] + - - [6784, 6784, 1, 1280, 6784, 6784, 1280, 6784] + - [29, 12604.0] + - - [1408, 4288, 1, 1280, 1408, 1408, 1280, 4288] + - [17, 11959.0] + - - [3584, 4288, 1, 1280, 3584, 3584, 1280, 4288] + - [29, 12276.0] + - - [2368, 704, 1, 1280, 2368, 2368, 1280, 704] + - [29, 9821.0] + - - [5056, 4288, 1, 3328, 5056, 5056, 3328, 4288] + - [17, 12272.0] + - - [3584, 2368, 1, 3328, 3584, 3584, 3328, 2368] + - [29, 12043.0] + - - [6784, 448, 1, 1280, 6784, 6784, 1280, 448] + - [28, 10239.0] + - - [1408, 2944, 1, 128, 1408, 1408, 128, 2944] + - [4, 10599.0] + - - [4288, 2944, 1, 256, 4288, 4288, 256, 2944] + - [29, 11682.0] + - - [5888, 704, 1, 1280, 5888, 5888, 1280, 704] + - [17, 11077.0] + - - [448, 5888, 1, 128, 448, 448, 128, 5888] + - [6, 8834.0] + - - [5056, 2368, 1, 1280, 5056, 5056, 1280, 2368] + - [29, 12041.0] + - - [448, 3584, 1, 1280, 448, 448, 1280, 3584] + - [29, 9628.0] + - - [6784, 5888, 1, 256, 6784, 6784, 256, 5888] + - [17, 12427.0] + - - [5888, 2944, 1, 128, 5888, 5888, 128, 2944] + - [17, 11675.0] + - - [1024, 1408, 1, 256, 1024, 1024, 256, 1408] + - [32, 9801.0] + - - [2368, 2368, 1, 3328, 2368, 2368, 3328, 2368] + - [17, 11293.0] + - - [1856, 6784, 1, 128, 1856, 1856, 128, 6784] + - [7, 10574.0] + - - [5056, 704, 1, 3328, 5056, 5056, 3328, 704] + - [10, 11127.0] + - - [1408, 1856, 1, 256, 1408, 1408, 256, 1856] + - [28, 9982.0] + - - [2368, 5056, 1, 256, 2368, 2368, 256, 5056] + - [17, 11587.0] + - - [5888, 1856, 1, 256, 5888, 5888, 256, 1856] + - [10, 11520.0] + - - [704, 5888, 1, 256, 704, 704, 256, 5888] + - [3, 10422.0] + - - [2944, 6784, 1, 3328, 2944, 2944, 3328, 6784] + - [17, 12605.0] + - - [3584, 704, 1, 3328, 3584, 3584, 3328, 704] + - [3, 10454.0] + - - [448, 4288, 1, 256, 448, 448, 256, 4288] + - [14, 8652.0] + - - [704, 2368, 1, 1280, 704, 704, 1280, 2368] + - [10, 9850.0] + - - [1856, 2368, 1, 1280, 1856, 1856, 1280, 2368] + - [29, 11037.0] + - - [1856, 4288, 1, 1280, 1856, 1856, 1280, 4288] + - [17, 11652.0] + - - [704, 2944, 1, 128, 704, 704, 128, 2944] + - [4, 9129.0] + - - [1408, 1024, 1, 1280, 1408, 1408, 1280, 1024] + - [17, 10310.0] + - - [704, 6784, 1, 256, 704, 704, 256, 6784] + - [23, 10252.0] + - - [6784, 704, 1, 256, 6784, 6784, 256, 704] + - [10, 10199.0] + - - [5056, 1408, 1, 128, 5056, 5056, 128, 1408] + - [21, 10715.0] + - - [3584, 4288, 1, 3328, 3584, 3584, 3328, 4288] + - [17, 12333.0] + - - [5888, 1856, 1, 1280, 5888, 5888, 1280, 1856] + - [17, 11899.0] + - - [5056, 1024, 1, 3328, 5056, 5056, 3328, 1024] + - [17, 12245.0] + - - [1024, 4288, 1, 128, 1024, 1024, 128, 4288] + - [6, 9948.0] + - - [2368, 3584, 1, 1280, 2368, 2368, 1280, 3584] + - [17, 11969.0] + - - [2368, 6784, 1, 1280, 2368, 2368, 1280, 6784] + - [17, 12095.0] + - - [2944, 3584, 1, 3328, 2944, 2944, 3328, 3584] + - [17, 12306.0] + - - [6784, 2944, 1, 256, 6784, 6784, 256, 2944] + - [10, 12270.0] + - - [4288, 2368, 1, 3328, 4288, 4288, 3328, 2368] + - [17, 11817.0] + - - [1856, 2368, 1, 256, 1856, 1856, 256, 2368] + - [26, 10144.0] + - - [3584, 6784, 1, 3328, 3584, 3584, 3328, 6784] + - [17, 12541.0] + - - [1024, 5888, 1, 3328, 1024, 1024, 3328, 5888] + - [17, 12110.0] + - - [5056, 4288, 1, 1280, 5056, 5056, 1280, 4288] + - [29, 12238.0] + - - [1408, 5056, 1, 1280, 1408, 1408, 1280, 5056] + - [17, 12223.0] + - - [2944, 5888, 1, 128, 2944, 2944, 128, 5888] + - [26, 11802.0] + - - [704, 5888, 1, 1280, 704, 704, 1280, 5888] + - [10, 11112.0] + - - [2368, 3584, 1, 128, 2368, 2368, 128, 3584] + - [7, 10807.0] + - - [6784, 5888, 1, 3328, 6784, 6784, 3328, 5888] + - [29, 12645.0] + - - [1024, 5056, 1, 1280, 1024, 1024, 1280, 5056] + - [17, 12028.0] + - - [4288, 1024, 1, 256, 4288, 4288, 256, 1024] + - [17, 10765.0] + - - [2944, 2368, 1, 128, 2944, 2944, 128, 2368] + - [35, 10914.0] + - - [5888, 448, 1, 1280, 5888, 5888, 1280, 448] + - [16, 9971.0] + - - [704, 5888, 1, 3328, 704, 704, 3328, 5888] + - [31, 11230.0] + - - [3584, 2944, 1, 256, 3584, 3584, 256, 2944] + - [29, 11666.0] + - - [2368, 1024, 1, 3328, 2368, 2368, 3328, 1024] + - [12, 11298.0] + - - [1408, 5056, 1, 3328, 1408, 1408, 3328, 5056] + - [29, 12329.0] + - - [1856, 1856, 1, 3328, 1856, 1856, 3328, 1856] + - [10, 10829.0] + - - [2368, 2368, 1, 256, 2368, 2368, 256, 2368] + - [29, 10492.0] + - - [4288, 4288, 1, 1280, 4288, 4288, 1280, 4288] + - [29, 12171.0] + - - [1408, 4288, 1, 256, 1408, 1408, 256, 4288] + - [26, 10952.0] + - - [5888, 448, 1, 128, 5888, 5888, 128, 448] + - [25, 9719.0] + - - [704, 6784, 1, 3328, 704, 704, 3328, 6784] + - [3, 11310.0] + - - [5888, 5888, 1, 1280, 5888, 5888, 1280, 5888] + - [17, 12605.0] + - - [5056, 1024, 1, 1280, 5056, 5056, 1280, 1024] + - [17, 12049.0] + - - [448, 5888, 1, 3328, 448, 448, 3328, 5888] + - [5, 9984.0] + - - [1024, 2944, 1, 1280, 1024, 1024, 1280, 2944] + - [3, 11160.0] + - - [5056, 5888, 1, 1280, 5056, 5056, 1280, 5888] + - [29, 12439.0] + - - [4288, 5888, 1, 128, 4288, 4288, 128, 5888] + - [26, 11609.0] + - - [1408, 3584, 1, 128, 1408, 1408, 128, 3584] + - [25, 10751.0] + - - [448, 3584, 1, 128, 448, 448, 128, 3584] + - [15, 8382.0] + - - [5888, 2944, 1, 1280, 5888, 5888, 1280, 2944] + - [10, 12548.0] + - - [2368, 5888, 1, 128, 2368, 2368, 128, 5888] + - [1, 11030.0] + - - [3584, 5888, 1, 256, 3584, 3584, 256, 5888] + - [17, 12225.0] + - - [2368, 704, 1, 128, 2368, 2368, 128, 704] + - [25, 8996.0] + - - [3584, 2944, 1, 1280, 3584, 3584, 1280, 2944] + - [17, 12243.0] + - - [3584, 2368, 1, 128, 3584, 3584, 128, 2368] + - [32, 10717.0] + - - [5056, 704, 1, 128, 5056, 5056, 128, 704] + - [32, 10248.0] + - - [5056, 1408, 1, 3328, 5056, 5056, 3328, 1408] + - [17, 12327.0] + - - [6784, 1024, 1, 3328, 6784, 6784, 3328, 1024] + - [31, 12139.0] + - - [6784, 2944, 1, 3328, 6784, 6784, 3328, 2944] + - [29, 12622.0] + - - [2944, 5056, 1, 3328, 2944, 2944, 3328, 5056] + - [29, 12435.0] + - - [1856, 1856, 1, 256, 1856, 1856, 256, 1856] + - [27, 9744.0] + - - [1024, 5888, 1, 128, 1024, 1024, 128, 5888] + - [10, 10671.0] + - - [6784, 2368, 1, 1280, 6784, 6784, 1280, 2368] + - [29, 12101.0] + - - [4288, 5888, 1, 1280, 4288, 4288, 1280, 5888] + - [17, 12329.0] + - - [4288, 4288, 1, 256, 4288, 4288, 256, 4288] + - [29, 11826.0] + - - [4288, 1856, 1, 1280, 4288, 4288, 1280, 1856] + - [3, 11675.0] + - - [1856, 2944, 1, 3328, 1856, 1856, 3328, 2944] + - [23, 11594.0] + - - [256, 6784, 1, 3328, 256, 256, 3328, 6784] + - [29, 10658.0] + - - [256, 5056, 1, 128, 256, 256, 128, 5056] + - [25, 8487.0] + - - [5056, 1024, 1, 256, 5056, 5056, 256, 1024] + - [21, 11321.0] + - - [5056, 1856, 1, 3328, 5056, 5056, 3328, 1856] + - [17, 11970.0] + - - [1856, 1408, 1, 256, 1856, 1856, 256, 1408] + - [22, 9758.0] + - - [4288, 1408, 1, 128, 4288, 4288, 128, 1408] + - [27, 10680.0] + - - [4288, 5056, 1, 256, 4288, 4288, 256, 5056] + - [10, 11901.0] + - - [5056, 256, 1, 3328, 5056, 5056, 3328, 256] + - [5, 11566.0] + - - [1024, 5888, 1, 1280, 1024, 1024, 1280, 5888] + - [3, 12009.0] + - - [6784, 2368, 1, 128, 6784, 6784, 128, 2368] + - [29, 11362.0] + - - [5056, 3584, 1, 256, 5056, 5056, 256, 3584] + - [23, 12028.0] + - - [1856, 1024, 1, 1280, 1856, 1856, 1280, 1024] + - [12, 11299.0] + - - [6784, 4288, 1, 1280, 6784, 6784, 1280, 4288] + - [29, 12341.0] + - - [1856, 1856, 1, 1280, 1856, 1856, 1280, 1856] + - [17, 10664.0] + - - [6784, 2944, 1, 128, 6784, 6784, 128, 2944] + - [17, 11901.0] + - - [5888, 1856, 1, 128, 5888, 5888, 128, 1856] + - [25, 11058.0] + - - [2368, 1024, 1, 128, 2368, 2368, 128, 1024] + - [8, 9711.0] + - - [5056, 3584, 1, 128, 5056, 5056, 128, 3584] + - [33, 11618.0] + - - [5888, 5888, 1, 3328, 5888, 5888, 3328, 5888] + - [5, 12596.0] + - - [6784, 1024, 1, 256, 6784, 6784, 256, 1024] + - [29, 11261.0] + - - [2944, 2368, 1, 256, 2944, 2944, 256, 2368] + - [29, 11411.0] + - - [5056, 5888, 1, 3328, 5056, 5056, 3328, 5888] + - [17, 12481.0] + - - [1856, 1024, 1, 256, 1856, 1856, 256, 1024] + - [22, 9746.0] + - - [3584, 448, 1, 1280, 3584, 3584, 1280, 448] + - [9, 10064.0] + - - [448, 5888, 1, 256, 448, 448, 256, 5888] + - [21, 9296.0] + - - [1408, 6784, 1, 3328, 1408, 1408, 3328, 6784] + - [29, 12255.0] + - - [4288, 704, 1, 128, 4288, 4288, 128, 704] + - [25, 9401.0] + - - [5056, 2944, 1, 256, 5056, 5056, 256, 2944] + - [29, 11922.0] + - - [6784, 5888, 1, 128, 6784, 6784, 128, 5888] + - [17, 12040.0] + - - [2368, 1856, 1, 256, 2368, 2368, 256, 1856] + - [26, 10313.0] + - - [1408, 3584, 1, 3328, 1408, 1408, 3328, 3584] + - [10, 12014.0] + - - [2368, 6784, 1, 256, 2368, 2368, 256, 6784] + - [23, 11725.0] + - - [5056, 1408, 1, 1280, 5056, 5056, 1280, 1408] + - [17, 12245.0] + - - [5056, 4288, 1, 128, 5056, 5056, 128, 4288] + - [21, 11375.0] + - - [1408, 1856, 1, 128, 1408, 1408, 128, 1856] + - [25, 10286.0] + - - [1408, 5888, 1, 3328, 1408, 1408, 3328, 5888] + - [19, 12257.0] + - - [6784, 6784, 1, 256, 6784, 6784, 256, 6784] + - [29, 12433.0] + - - [4288, 2368, 1, 128, 4288, 4288, 128, 2368] + - [7, 10759.0] + - - [1856, 4288, 1, 128, 1856, 1856, 128, 4288] + - [1, 10710.0] + - - [2368, 2944, 1, 256, 2368, 2368, 256, 2944] + - [10, 11538.0] + - - [3584, 1856, 1, 1280, 3584, 3584, 1280, 1856] + - [3, 12002.0] + - - [6784, 6784, 1, 128, 6784, 6784, 128, 6784] + - [29, 12098.0] + - - [5888, 5056, 1, 256, 5888, 5888, 256, 5056] + - [10, 12155.0] + - - [3584, 448, 1, 256, 3584, 3584, 256, 448] + - [28, 9308.0] + - - [448, 4288, 1, 128, 448, 448, 128, 4288] + - [18, 8622.0] + - - [2944, 4288, 1, 3328, 2944, 2944, 3328, 4288] + - [17, 12181.0] + - - [256, 6784, 1, 256, 256, 256, 256, 6784] + - [20, 9619.0] + - - [1408, 4288, 1, 128, 1408, 1408, 128, 4288] + - [25, 10754.0] + - - [2944, 704, 1, 3328, 2944, 2944, 3328, 704] + - [3, 10912.0] + - - [3584, 3584, 1, 256, 3584, 3584, 256, 3584] + - [29, 11927.0] + - - [3584, 5056, 1, 256, 3584, 3584, 256, 5056] + - [10, 12082.0] + - - [2944, 2368, 1, 1280, 2944, 2944, 1280, 2368] + - [29, 12016.0] + - - [1408, 3584, 1, 256, 1408, 1408, 256, 3584] + - [17, 11402.0] + - - [6784, 3584, 1, 256, 6784, 6784, 256, 3584] + - [29, 12301.0] + - - [5056, 2368, 1, 128, 5056, 5056, 128, 2368] + - [3, 10961.0] + - - [2944, 2944, 1, 3328, 2944, 2944, 3328, 2944] + - [17, 12316.0] + - - [5056, 6784, 1, 256, 5056, 5056, 256, 6784] + - [29, 12185.0] + - - [1856, 3584, 1, 128, 1856, 1856, 128, 3584] + - [18, 10595.0] + - - [6784, 448, 1, 256, 6784, 6784, 256, 448] + - [28, 9954.0] + - - [3584, 6784, 1, 128, 3584, 3584, 128, 6784] + - [17, 11845.0] + - - [5056, 1856, 1, 256, 5056, 5056, 256, 1856] + - [10, 11372.0] + - - [1024, 1856, 1, 256, 1024, 1024, 256, 1856] + - [17, 10162.0] + - - [1408, 6784, 1, 1280, 1408, 1408, 1280, 6784] + - [17, 12229.0] + - - [3584, 3584, 1, 1280, 3584, 3584, 1280, 3584] + - [17, 12369.0] + - - [5888, 5888, 1, 128, 5888, 5888, 128, 5888] + - [17, 11982.0] + - - [5056, 5888, 1, 128, 5056, 5056, 128, 5888] + - [26, 11686.0] + - - [5056, 2368, 1, 3328, 5056, 5056, 3328, 2368] + - [29, 12093.0] + - - [2944, 4288, 1, 256, 2944, 2944, 256, 4288] + - [29, 11631.0] + - - [1408, 3584, 1, 1280, 1408, 1408, 1280, 3584] + - [29, 11945.0] + - - [2368, 6784, 1, 3328, 2368, 2368, 3328, 6784] + - [17, 12146.0] + - - [1856, 1408, 1, 1280, 1856, 1856, 1280, 1408] + - [29, 10587.0] + - - [6784, 704, 1, 128, 6784, 6784, 128, 704] + - [25, 10461.0] + - - [1408, 5888, 1, 256, 1408, 1408, 256, 5888] + - [29, 11788.0] + - - [704, 2944, 1, 1280, 704, 704, 1280, 2944] + - [3, 10682.0] + - - [1856, 2368, 1, 128, 1856, 1856, 128, 2368] + - [25, 9580.0] + - - [3584, 704, 1, 1280, 3584, 3584, 1280, 704] + - [17, 10214.0] + - - [2944, 6784, 1, 128, 2944, 2944, 128, 6784] + - [17, 11877.0] + - - [3584, 448, 1, 3328, 3584, 3584, 3328, 448] + - [9, 10158.0] + - - [704, 2368, 1, 3328, 704, 704, 3328, 2368] + - [5, 10302.0] + - - [256, 5888, 1, 128, 256, 256, 128, 5888] + - [2, 8032.0] + - - [2944, 2944, 1, 1280, 2944, 2944, 1280, 2944] + - [29, 12276.0] + - - [5888, 2368, 1, 256, 5888, 5888, 256, 2368] + - [17, 11748.0] + - - [6784, 704, 1, 3328, 6784, 6784, 3328, 704] + - [29, 11320.0] + - - [5888, 4288, 1, 128, 5888, 5888, 128, 4288] + - [23, 11572.0] + - - [1408, 2944, 1, 3328, 1408, 1408, 3328, 2944] + - [17, 12065.0] + - - [3584, 704, 1, 128, 3584, 3584, 128, 704] + - [32, 9340.0] + - - [5056, 5056, 1, 128, 5056, 5056, 128, 5056] + - [29, 11441.0] + - - [448, 5056, 1, 128, 448, 448, 128, 5056] + - [13, 8765.0] + - - [1408, 5056, 1, 128, 1408, 1408, 128, 5056] + - [21, 11099.0] + - - [2944, 3584, 1, 128, 2944, 2944, 128, 3584] + - [21, 11591.0] + - - [3584, 2368, 1, 256, 3584, 3584, 256, 2368] + - [35, 11578.0] + - - [5888, 5056, 1, 1280, 5888, 5888, 1280, 5056] + - [17, 12461.0] + - - [2368, 5056, 1, 128, 2368, 2368, 128, 5056] + - [14, 10928.0] + - - [3584, 3584, 1, 3328, 3584, 3584, 3328, 3584] + - [29, 12386.0] + - - [5888, 6784, 1, 256, 5888, 5888, 256, 6784] + - [17, 12409.0] + - - [4288, 2944, 1, 3328, 4288, 4288, 3328, 2944] + - [29, 12178.0] + - - [4288, 704, 1, 1280, 4288, 4288, 1280, 704] + - [23, 10153.0] + - - [256, 5056, 1, 1280, 256, 256, 1280, 5056] + - [5, 11184.0] + - - [2944, 5888, 1, 3328, 2944, 2944, 3328, 5888] + - [17, 12593.0] + - - [6784, 5888, 1, 1280, 6784, 6784, 1280, 5888] + - [17, 12609.0] + - - [5888, 4288, 1, 1280, 5888, 5888, 1280, 4288] + - [10, 12319.0] + - - [5888, 3584, 1, 128, 5888, 5888, 128, 3584] + - [35, 11745.0] + - - [1856, 1856, 1, 128, 1856, 1856, 128, 1856] + - [25, 9994.0] + - - [3584, 1024, 1, 3328, 3584, 3584, 3328, 1024] + - [3, 11616.0] + - - [704, 3584, 1, 128, 704, 704, 128, 3584] + - [0, 9227.0] + - - [5888, 448, 1, 3328, 5888, 5888, 3328, 448] + - [28, 10041.0] + - - [2368, 4288, 1, 1280, 2368, 2368, 1280, 4288] + - [29, 11772.0] + - - [4288, 2944, 1, 128, 4288, 4288, 128, 2944] + - [10, 11278.0] + - - [1024, 6784, 1, 3328, 1024, 1024, 3328, 6784] + - [17, 12102.0] + - - [5056, 2944, 1, 3328, 5056, 5056, 3328, 2944] + - [17, 12430.0] + - - [2944, 3584, 1, 256, 2944, 2944, 256, 3584] + - [29, 11756.0] + - - [1408, 1408, 1, 3328, 1408, 1408, 3328, 1408] + - [10, 10580.0] + - - [3584, 3584, 1, 128, 3584, 3584, 128, 3584] + - [17, 11200.0] + - - [3584, 704, 1, 256, 3584, 3584, 256, 704] + - [28, 9172.0] + - - [3584, 1408, 1, 3328, 3584, 3584, 3328, 1408] + - [17, 11509.0] + - - [704, 3584, 1, 1280, 704, 704, 1280, 3584] + - [29, 10343.0] + - - [2944, 6784, 1, 1280, 2944, 2944, 1280, 6784] + - [17, 12568.0] + - - [1856, 6784, 1, 256, 1856, 1856, 256, 6784] + - [10, 11444.0] + - - [4288, 448, 1, 3328, 4288, 4288, 3328, 448] + - [9, 10147.0] + - - [6784, 4288, 1, 128, 6784, 6784, 128, 4288] + - [10, 11612.0] + - - [6784, 704, 1, 1280, 6784, 6784, 1280, 704] + - [3, 11184.0] + - - [5888, 1024, 1, 3328, 5888, 5888, 3328, 1024] + - [29, 12092.0] + - - [704, 6784, 1, 1280, 704, 704, 1280, 6784] + - [26, 11163.0] + - - [1856, 5056, 1, 3328, 1856, 1856, 3328, 5056] + - [29, 11962.0] + - - [1024, 3584, 1, 128, 1024, 1024, 128, 3584] + - [6, 9982.0] + - - [1024, 1408, 1, 128, 1024, 1024, 128, 1408] + - [0, 8697.0] + - - [2368, 2944, 1, 128, 2368, 2368, 128, 2944] + - [1, 10785.0] + - - [5056, 2944, 1, 128, 5056, 5056, 128, 2944] + - [26, 11412.0] + - - [5888, 5056, 1, 3328, 5888, 5888, 3328, 5056] + - [23, 12469.0] + - - [1408, 2368, 1, 128, 1408, 1408, 128, 2368] + - [25, 9879.0] + - - [5888, 2368, 1, 128, 5888, 5888, 128, 2368] + - [10, 11129.0] + - - [3584, 6784, 1, 1280, 3584, 3584, 1280, 6784] + - [29, 12514.0] + - - [1856, 5888, 1, 256, 1856, 1856, 256, 5888] + - [17, 11437.0] + - - [4288, 4288, 1, 3328, 4288, 4288, 3328, 4288] + - [17, 12202.0] + - - [4288, 1408, 1, 1280, 4288, 4288, 1280, 1408] + - [29, 11937.0] + - - [3584, 5056, 1, 128, 3584, 3584, 128, 5056] + - [35, 11421.0] + - - [4288, 2368, 1, 256, 4288, 4288, 256, 2368] + - [17, 11408.0] + - - [2944, 5056, 1, 1280, 2944, 2944, 1280, 5056] + - [35, 12399.0] + - - [448, 6784, 1, 256, 448, 448, 256, 6784] + - [7, 9697.0] + - - [6784, 2368, 1, 3328, 6784, 6784, 3328, 2368] + - [35, 12139.0] + - - [4288, 1856, 1, 3328, 4288, 4288, 3328, 1856] + - [10, 11708.0] + - - [3584, 448, 1, 128, 3584, 3584, 128, 448] + - [20, 8806.0] + - - [3584, 1024, 1, 1280, 3584, 3584, 1280, 1024] + - [29, 11475.0] + - - [1856, 5056, 1, 256, 1856, 1856, 256, 5056] + - [17, 11408.0] + - - [1024, 4288, 1, 256, 1024, 1024, 256, 4288] + - [17, 10749.0] + - - [5888, 3584, 1, 3328, 5888, 5888, 3328, 3584] + - [29, 12549.0] + - - [5056, 3584, 1, 3328, 5056, 5056, 3328, 3584] + - [17, 12465.0] + - - [2368, 1408, 1, 1280, 2368, 2368, 1280, 1408] + - [29, 11178.0] + - - [5056, 2944, 1, 1280, 5056, 5056, 1280, 2944] + - [29, 12389.0] + - - [1024, 6784, 1, 256, 1024, 1024, 256, 6784] + - [3, 11419.0] + - - [2944, 1408, 1, 128, 2944, 2944, 128, 1408] + - [33, 10658.0] + - - [5056, 6784, 1, 3328, 5056, 5056, 3328, 6784] + - [29, 12496.0] + - - [3584, 4288, 1, 256, 3584, 3584, 256, 4288] + - [23, 11774.0] + - - [1856, 6784, 1, 3328, 1856, 1856, 3328, 6784] + - [17, 12098.0] + - - [5888, 4288, 1, 256, 5888, 5888, 256, 4288] + - [10, 12059.0] + - - [5056, 1408, 1, 256, 5056, 5056, 256, 1408] + - [29, 11513.0] + - - [3584, 1024, 1, 256, 3584, 3584, 256, 1024] + - [35, 11007.0] + - - [5888, 5888, 1, 256, 5888, 5888, 256, 5888] + - [35, 12384.0] + - - [4288, 1024, 1, 1280, 4288, 4288, 1280, 1024] + - [3, 11683.0] + - - [448, 6784, 1, 3328, 448, 448, 3328, 6784] + - [3, 10395.0] + - - [2944, 1408, 1, 1280, 2944, 2944, 1280, 1408] + - [17, 11927.0] + - - [2944, 1856, 1, 3328, 2944, 2944, 3328, 1856] + - [17, 11596.0] + - - [2944, 2944, 1, 128, 2944, 2944, 128, 2944] + - [34, 10482.0] + - - [3584, 5888, 1, 1280, 3584, 3584, 1280, 5888] + - [17, 12503.0] + - - [6784, 1856, 1, 1280, 6784, 6784, 1280, 1856] + - [35, 12035.0] + - - [2944, 5056, 1, 256, 2944, 2944, 256, 5056] + - [10, 11868.0] + - - [5888, 256, 1, 3328, 5888, 5888, 3328, 256] + - [23, 10877.0] + - - [1856, 5888, 1, 3328, 1856, 1856, 3328, 5888] + - [17, 12003.0] + - - [3584, 1408, 1, 256, 3584, 3584, 256, 1408] + - [23, 10898.0] + - - [704, 3584, 1, 3328, 704, 704, 3328, 3584] + - [3, 10492.0] + - - [5056, 448, 1, 1280, 5056, 5056, 1280, 448] + - [29, 10114.0] + - - [3584, 1856, 1, 3328, 3584, 3584, 3328, 1856] + - [29, 12065.0] + - - [2944, 1024, 1, 256, 2944, 2944, 256, 1024] + - [27, 10472.0] + - - [1024, 2368, 1, 128, 1024, 1024, 128, 2368] + - [6, 9428.0] + - - [2368, 4288, 1, 3328, 2368, 2368, 3328, 4288] + - [17, 11843.0] + - - [1024, 1408, 1, 1280, 1024, 1024, 1280, 1408] + - [3, 10136.0] + - - [6784, 5056, 1, 256, 6784, 6784, 256, 5056] + - [10, 12209.0] + - - [448, 6784, 1, 128, 448, 448, 128, 6784] + - [15, 8997.0] + - - [2944, 6784, 1, 256, 2944, 2944, 256, 6784] + - [29, 12348.0] + - - [2368, 2368, 1, 1280, 2368, 2368, 1280, 2368] + - [10, 11249.0] + - - [1856, 3584, 1, 1280, 1856, 1856, 1280, 3584] + - [29, 11990.0] + - - [3584, 1408, 1, 1280, 3584, 3584, 1280, 1408] + - [10, 11926.0] + - - [4288, 448, 1, 128, 4288, 4288, 128, 448] + - [25, 8883.0] + - - [5056, 256, 1, 1280, 5056, 5056, 1280, 256] + - [37, 11088.0] + - - [1856, 1408, 1, 3328, 1856, 1856, 3328, 1408] + - [10, 10809.0] + - - [1024, 4288, 1, 3328, 1024, 1024, 3328, 4288] + - [17, 11866.0] + - - [5056, 448, 1, 256, 5056, 5056, 256, 448] + - [20, 9658.0] + - - [2944, 2368, 1, 3328, 2944, 2944, 3328, 2368] + - [17, 12095.0] + - - [1024, 1856, 1, 1280, 1024, 1024, 1280, 1856] + - [3, 11096.0] + - - [6784, 1856, 1, 256, 6784, 6784, 256, 1856] + - [10, 11652.0] + - - [1024, 5888, 1, 256, 1024, 1024, 256, 5888] + - [10, 11569.0] + - - [1408, 2368, 1, 256, 1408, 1408, 256, 2368] + - [29, 10527.0] + - - [1408, 1408, 1, 256, 1408, 1408, 256, 1408] + - [15, 9794.0] + - - [2368, 2368, 1, 128, 2368, 2368, 128, 2368] + - [33, 10268.0] + - - [6784, 1408, 1, 128, 6784, 6784, 128, 1408] + - [21, 11493.0] + - - [4288, 5888, 1, 256, 4288, 4288, 256, 5888] + - [29, 12046.0] + - - [1408, 5056, 1, 256, 1408, 1408, 256, 5056] + - [17, 11605.0] + - - [4288, 3584, 1, 128, 4288, 4288, 128, 3584] + - [33, 11504.0] + - - [3584, 5056, 1, 1280, 3584, 3584, 1280, 5056] + - [10, 12426.0] + - - [1856, 1024, 1, 128, 1856, 1856, 128, 1024] + - [22, 8866.0] + - - [704, 4288, 1, 256, 704, 704, 256, 4288] + - [7, 9626.0] + - - [5888, 2368, 1, 1280, 5888, 5888, 1280, 2368] + - [17, 12149.0] + - - [2368, 5888, 1, 1280, 2368, 2368, 1280, 5888] + - [17, 12135.0] + - - [5888, 256, 1, 1280, 5888, 5888, 1280, 256] + - [17, 10683.0] + - - [2368, 1856, 1, 3328, 2368, 2368, 3328, 1856] + - [17, 11171.0] + - - [2944, 704, 1, 256, 2944, 2944, 256, 704] + - [27, 9305.0] + - - [704, 3584, 1, 256, 704, 704, 256, 3584] + - [1, 9206.0] + - - [704, 2944, 1, 3328, 704, 704, 3328, 2944] + - [3, 10935.0] + - - [6784, 1024, 1, 128, 6784, 6784, 128, 1024] + - [33, 10726.0] + - - [2944, 1024, 1, 3328, 2944, 2944, 3328, 1024] + - [5, 11342.0] + - - [2944, 5056, 1, 128, 2944, 2944, 128, 5056] + - [33, 11319.0] + - - [1408, 6784, 1, 256, 1408, 1408, 256, 6784] + - [23, 11828.0] + - - [6784, 1408, 1, 3328, 6784, 6784, 3328, 1408] + - [35, 12275.0] + - - [4288, 6784, 1, 128, 4288, 4288, 128, 6784] + - [26, 11682.0] + - - [6784, 2944, 1, 1280, 6784, 6784, 1280, 2944] + - [29, 12599.0] + - - [4288, 1856, 1, 128, 4288, 4288, 128, 1856] + - [33, 10552.0] + - - [1856, 2944, 1, 128, 1856, 1856, 128, 2944] + - [3, 10533.0] + - - [6784, 448, 1, 128, 6784, 6784, 128, 448] + - [25, 10341.0] + - - [448, 5056, 1, 1280, 448, 448, 1280, 5056] + - [5, 10431.0] + - - [2368, 1856, 1, 128, 2368, 2368, 128, 1856] + - [32, 9894.0] + - - [4288, 704, 1, 256, 4288, 4288, 256, 704] + - [9, 9701.0] + - - [5888, 704, 1, 256, 5888, 5888, 256, 704] + - [10, 10402.0] + - - [3584, 1024, 1, 128, 3584, 3584, 128, 1024] + - [25, 9961.0] + - - [256, 5888, 1, 3328, 256, 256, 3328, 5888] + - [29, 10946.0] + - - [1408, 4288, 1, 3328, 1408, 1408, 3328, 4288] + - [17, 12100.0] + - - [6784, 4288, 1, 256, 6784, 6784, 256, 4288] + - [10, 12104.0] + - - [5888, 256, 1, 256, 5888, 5888, 256, 256] + - [22, 9784.0] + - - [6784, 1024, 1, 1280, 6784, 6784, 1280, 1024] + - [29, 12063.0] + - - [5888, 1024, 1, 128, 5888, 5888, 128, 1024] + - [27, 10645.0] + - - [2944, 704, 1, 1280, 2944, 2944, 1280, 704] + - [17, 10683.0] + - - [6784, 3584, 1, 1280, 6784, 6784, 1280, 3584] + - [17, 12517.0] + - - [1024, 6784, 1, 1280, 1024, 1024, 1280, 6784] + - [3, 11988.0] + - - [1408, 2944, 1, 1280, 1408, 1408, 1280, 2944] + - [29, 12001.0] + - - [1408, 2368, 1, 3328, 1408, 1408, 3328, 2368] + - [3, 11450.0] + - - [2944, 1856, 1, 128, 2944, 2944, 128, 1856] + - [25, 10607.0] + - - [256, 6784, 1, 128, 256, 256, 128, 6784] + - [30, 9066.0] + - - [5056, 6784, 1, 128, 5056, 5056, 128, 6784] + - [26, 11735.0] + - - [4288, 5056, 1, 128, 4288, 4288, 128, 5056] + - [10, 11397.0] + - - [1856, 5888, 1, 128, 1856, 1856, 128, 5888] + - [21, 11002.0] + - - [2944, 5888, 1, 256, 2944, 2944, 256, 5888] + - [35, 12327.0] + - - [3584, 1856, 1, 256, 3584, 3584, 256, 1856] + - [10, 11412.0] + - - [4288, 3584, 1, 1280, 4288, 4288, 1280, 3584] + - [17, 12313.0] + - - [704, 4288, 1, 3328, 704, 704, 3328, 4288] + - [3, 10376.0] + - - [704, 5888, 1, 128, 704, 704, 128, 5888] + - [7, 9302.0] + - - [6784, 3584, 1, 128, 6784, 6784, 128, 3584] + - [29, 11894.0] + - - [4288, 5056, 1, 3328, 4288, 4288, 3328, 5056] + - [17, 12294.0] + - - [1408, 1408, 1, 128, 1408, 1408, 128, 1408] + - [13, 9148.0] + - - [5056, 2368, 1, 256, 5056, 5056, 256, 2368] + - [10, 11562.0] + - - [4288, 704, 1, 3328, 4288, 4288, 3328, 704] + - [3, 10332.0] + - - [448, 3584, 1, 256, 448, 448, 256, 3584] + - [17, 8437.0] + - - [2368, 1024, 1, 1280, 2368, 2368, 1280, 1024] + - [29, 10858.0] + - - [2944, 1408, 1, 3328, 2944, 2944, 3328, 1408] + - [17, 11941.0] + - - [1024, 1408, 1, 3328, 1024, 1024, 3328, 1408] + - [29, 10435.0] + - - [2944, 5888, 1, 1280, 2944, 2944, 1280, 5888] + - [17, 12554.0] + - - [5888, 3584, 1, 256, 5888, 5888, 256, 3584] + - [17, 12219.0] + - - [1408, 1856, 1, 3328, 1408, 1408, 3328, 1856] + - [3, 10893.0] + - - [6784, 1408, 1, 1280, 6784, 6784, 1280, 1408] + - [17, 12211.0] + - - [704, 2944, 1, 256, 704, 704, 256, 2944] + - [2, 9917.0] + - - [704, 4288, 1, 128, 704, 704, 128, 4288] + - [25, 9209.0] + - - [2368, 4288, 1, 128, 2368, 2368, 128, 4288] + - [33, 10867.0] + - - [1024, 6784, 1, 128, 1024, 1024, 128, 6784] + - [10, 10849.0] + - - [1408, 1408, 1, 1280, 1408, 1408, 1280, 1408] + - [3, 10456.0] + - - [448, 4288, 1, 3328, 448, 448, 3328, 4288] + - [3, 10108.0] + - - [2368, 1408, 1, 256, 2368, 2368, 256, 1408] + - [10, 9969.0] + - - [5888, 5056, 1, 128, 5888, 5888, 128, 5056] + - [10, 11672.0] + - - [704, 2368, 1, 256, 704, 704, 256, 2368] + - [15, 9107.0] + - - [5888, 2368, 1, 3328, 5888, 5888, 3328, 2368] + - [3, 12201.0] + - - [4288, 448, 1, 1280, 4288, 4288, 1280, 448] + - [9, 9968.0] + - - [5888, 704, 1, 3328, 5888, 5888, 3328, 704] + - [19, 11212.0] + - - [5056, 256, 1, 128, 5056, 5056, 128, 256] + - [8, 8822.0] + - - [1408, 5888, 1, 128, 1408, 1408, 128, 5888] + - [33, 11189.0] + - - [1408, 1024, 1, 256, 1408, 1408, 256, 1024] + - [27, 9498.0] + - - [1024, 1856, 1, 128, 1024, 1024, 128, 1856] + - [6, 9222.0] + - - [5056, 6784, 1, 1280, 5056, 5056, 1280, 6784] + - [17, 12458.0] + - - [704, 5056, 1, 3328, 704, 704, 3328, 5056] + - [31, 11196.0] + - - [3584, 5056, 1, 3328, 3584, 3584, 3328, 5056] + - [17, 12451.0] + - - [2368, 2944, 1, 3328, 2368, 2368, 3328, 2944] + - [29, 12098.0] + - - [2368, 3584, 1, 256, 2368, 2368, 256, 3584] + - [10, 11267.0] + - - [5056, 3584, 1, 1280, 5056, 5056, 1280, 3584] + - [29, 12403.0] + - - [1856, 2944, 1, 1280, 1856, 1856, 1280, 2944] + - [17, 11431.0] + - - [3584, 2368, 1, 1280, 3584, 3584, 1280, 2368] + - [29, 11987.0] + - - [2944, 1408, 1, 256, 2944, 2944, 256, 1408] + - [29, 11398.0] + - - [4288, 1408, 1, 3328, 4288, 4288, 3328, 1408] + - [17, 12098.0] + - - [2944, 1024, 1, 128, 2944, 2944, 128, 1024] + - [25, 9874.0] + - - [4288, 5056, 1, 1280, 4288, 4288, 1280, 5056] + - [29, 12244.0] + - - [5888, 6784, 1, 1280, 5888, 5888, 1280, 6784] + - [17, 12622.0] + - - [6784, 5056, 1, 128, 6784, 6784, 128, 5056] + - [29, 11737.0] + - - [5888, 1408, 1, 3328, 5888, 5888, 3328, 1408] + - [3, 12248.0] + - - [256, 5056, 1, 256, 256, 256, 256, 5056] + - [20, 9500.0] + - - [448, 3584, 1, 3328, 448, 448, 3328, 3584] + - [19, 9970.0] + - - [704, 2368, 1, 128, 704, 704, 128, 2368] + - [9, 7737.0] + - - [5888, 256, 1, 128, 5888, 5888, 128, 256] + - [36, 8148.0] + - - [3584, 1856, 1, 128, 3584, 3584, 128, 1856] + - [15, 10371.0] + - - [4288, 4288, 1, 128, 4288, 4288, 128, 4288] + - [10, 11233.0] + - - [1856, 1024, 1, 3328, 1856, 1856, 3328, 1024] + - [5, 11672.0] + - - [1024, 5056, 1, 256, 1024, 1024, 256, 5056] + - [23, 11095.0] + - - [2368, 1408, 1, 3328, 2368, 2368, 3328, 1408] + - [29, 11366.0] + - - [5888, 448, 1, 256, 5888, 5888, 256, 448] + - [28, 9506.0] + - - [5888, 6784, 1, 128, 5888, 5888, 128, 6784] + - [17, 12022.0] + - - [6784, 5056, 1, 1280, 6784, 6784, 1280, 5056] + - [3, 12468.0] + - - [5056, 704, 1, 1280, 5056, 5056, 1280, 704] + - [17, 10939.0] + - - [4288, 6784, 1, 1280, 4288, 4288, 1280, 6784] + - [29, 12352.0] + - - [6784, 1408, 1, 256, 6784, 6784, 256, 1408] + - [17, 11638.0] + - - [3584, 5888, 1, 128, 3584, 3584, 128, 5888] + - [29, 11794.0] + - - [5056, 5888, 1, 256, 5056, 5056, 256, 5888] + - [29, 12149.0] + - - [2368, 1024, 1, 256, 2368, 2368, 256, 1024] + - [4, 10041.0] + - - [2944, 1856, 1, 256, 2944, 2944, 256, 1856] + - [23, 10974.0] + - - [1856, 6784, 1, 1280, 1856, 1856, 1280, 6784] + - [35, 12058.0] + - - [4288, 3584, 1, 256, 4288, 4288, 256, 3584] + - [35, 11877.0] + - - [5056, 1856, 1, 1280, 5056, 5056, 1280, 1856] + - [3, 11895.0] + - - [1408, 1024, 1, 3328, 1408, 1408, 3328, 1024] + - [3, 10526.0] + - - [5888, 3584, 1, 1280, 5888, 5888, 1280, 3584] + - [3, 12491.0] + - - [1856, 3584, 1, 3328, 1856, 1856, 3328, 3584] + - [29, 12071.0] + - - [1024, 2944, 1, 256, 1024, 1024, 256, 2944] + - [3, 10274.0] + - - [448, 6784, 1, 1280, 448, 448, 1280, 6784] + - [1, 10338.0] + - - [704, 5056, 1, 256, 704, 704, 256, 5056] + - [7, 10320.0] + - - [2944, 1856, 1, 1280, 2944, 2944, 1280, 1856] + - [17, 11531.0] + - - [5056, 256, 1, 256, 5056, 5056, 256, 256] + - [11, 10127.0] + - - [2368, 3584, 1, 3328, 2368, 2368, 3328, 3584] + - [23, 12037.0] + - - [3584, 5888, 1, 3328, 3584, 3584, 3328, 5888] + - [29, 12552.0] + - - [2944, 3584, 1, 1280, 2944, 2944, 1280, 3584] + - [29, 12247.0] + - - [1856, 5888, 1, 1280, 1856, 1856, 1280, 5888] + - [3, 11962.0] + - - [5056, 448, 1, 3328, 5056, 5056, 3328, 448] + - [29, 10477.0] + - - [4288, 1408, 1, 256, 4288, 4288, 256, 1408] + - [17, 11194.0] + - - [5888, 1408, 1, 128, 5888, 5888, 128, 1408] + - [29, 11135.0] + - - [4288, 2368, 1, 1280, 4288, 4288, 1280, 2368] + - [29, 11799.0] + - - [6784, 2368, 1, 256, 6784, 6784, 256, 2368] + - [23, 11803.0] + - - [4288, 1856, 1, 256, 4288, 4288, 256, 1856] + - [10, 11258.0] + - - [1856, 2944, 1, 256, 1856, 1856, 256, 2944] + - [29, 10973.0] + - - [5056, 1024, 1, 128, 5056, 5056, 128, 1024] + - [7, 10893.0] + - - [6784, 256, 1, 128, 6784, 6784, 128, 256] + - [15, 9340.0] + - - [5888, 704, 1, 128, 5888, 5888, 128, 704] + - [25, 10307.0] + - - [1024, 4288, 1, 1280, 1024, 1024, 1280, 4288] + - [17, 11719.0] + - - [2368, 5056, 1, 3328, 2368, 2368, 3328, 5056] + - [29, 12098.0] + - - [4288, 1024, 1, 3328, 4288, 4288, 3328, 1024] + - [23, 11861.0] + - - [1024, 5056, 1, 3328, 1024, 1024, 3328, 5056] + - [19, 12251.0] + - - [1024, 1856, 1, 3328, 1024, 1024, 3328, 1856] + - [3, 11386.0] + - - [704, 6784, 1, 128, 704, 704, 128, 6784] + - [15, 9454.0] + - - [4288, 6784, 1, 256, 4288, 4288, 256, 6784] + - [29, 12082.0] + - - [3584, 2944, 1, 3328, 3584, 3584, 3328, 2944] + - [29, 12322.0] + - - [5888, 2944, 1, 256, 5888, 5888, 256, 2944] + - [17, 12224.0] + - - [2368, 6784, 1, 128, 2368, 2368, 128, 6784] + - [21, 11353.0] + - - [448, 4288, 1, 1280, 448, 448, 1280, 4288] + - [17, 10002.0] + - - [5056, 4288, 1, 256, 5056, 5056, 256, 4288] + - [10, 11876.0] + - - [1024, 3584, 1, 256, 1024, 1024, 256, 3584] + - [3, 10824.0] + - - [1856, 5056, 1, 128, 1856, 1856, 128, 5056] + - [17, 10825.0] + - - [6784, 6784, 1, 3328, 6784, 6784, 3328, 6784] + - [17, 12482.0] + - - [448, 5888, 1, 1280, 448, 448, 1280, 5888] + - [26, 9753.0] + - - [5056, 448, 1, 128, 5056, 5056, 128, 448] + - [25, 8517.0] + - - [3584, 2944, 1, 128, 3584, 3584, 128, 2944] + - [10, 11257.0] + - - [6784, 256, 1, 1280, 6784, 6784, 1280, 256] + - [17, 10492.0] + - - [2368, 5888, 1, 3328, 2368, 2368, 3328, 5888] + - [17, 12209.0] + - - [2368, 1856, 1, 1280, 2368, 2368, 1280, 1856] + - [10, 10986.0] + - - [3584, 4288, 1, 128, 3584, 3584, 128, 4288] + - [10, 11227.0] + - - [5888, 4288, 1, 3328, 5888, 5888, 3328, 4288] + - [10, 12368.0] + - - [2368, 704, 1, 256, 2368, 2368, 256, 704] + - [8, 8695.0] + - - [3584, 1408, 1, 128, 3584, 3584, 128, 1408] + - [36, 10520.0] + - - [1856, 5056, 1, 1280, 1856, 1856, 1280, 5056] + - [17, 11927.0] + - - [2944, 1024, 1, 1280, 2944, 2944, 1280, 1024] + - [17, 11279.0] + - - [2368, 4288, 1, 256, 2368, 2368, 256, 4288] + - [29, 11379.0] + - - [1024, 2368, 1, 3328, 1024, 1024, 3328, 2368] + - [5, 11299.0] + - - [6784, 1856, 1, 3328, 6784, 6784, 3328, 1856] + - [29, 12120.0] + - - [1024, 2944, 1, 128, 1024, 1024, 128, 2944] + - [6, 10091.0] + - - [1024, 3584, 1, 1280, 1024, 1024, 1280, 3584] + - [3, 11497.0] + - - [4288, 5888, 1, 3328, 4288, 4288, 3328, 5888] + - [17, 12375.0] + - - [1024, 2944, 1, 3328, 1024, 1024, 3328, 2944] + - [10, 11318.0] + - - [3584, 6784, 1, 256, 3584, 3584, 256, 6784] + - [29, 12272.0] + - - [256, 6784, 1, 1280, 256, 256, 1280, 6784] + - [3, 10488.0] + - - [1856, 3584, 1, 256, 1856, 1856, 256, 3584] + - [26, 11292.0] + - - [6784, 1856, 1, 128, 6784, 6784, 128, 1856] + - [25, 11113.0] + - - [2944, 704, 1, 128, 2944, 2944, 128, 704] + - [25, 9647.0] + - - [256, 5888, 1, 1280, 256, 256, 1280, 5888] + - [3, 10833.0] + - - [4288, 6784, 1, 3328, 4288, 4288, 3328, 6784] + - [17, 12405.0] + - - [2368, 1408, 1, 128, 2368, 2368, 128, 1408] + - [20, 9616.0] + - - [1408, 1024, 1, 128, 1408, 1408, 128, 1024] + - [22, 8847.0] + - - [6784, 3584, 1, 3328, 6784, 6784, 3328, 3584] + - [29, 12536.0] + - - [2368, 5056, 1, 1280, 2368, 2368, 1280, 5056] + - [17, 12027.0] + - - [1408, 2368, 1, 1280, 1408, 1408, 1280, 2368] + - [17, 11315.0] + - - [2944, 4288, 1, 128, 2944, 2944, 128, 4288] + - [17, 11284.0] + - - [2944, 2944, 1, 256, 2944, 2944, 256, 2944] + - [29, 11887.0] + - - [6784, 256, 1, 256, 6784, 6784, 256, 256] + - [10, 9729.0] + - - [256, 5056, 1, 3328, 256, 256, 3328, 5056] + - [5, 11598.0] + - - [5056, 1856, 1, 128, 5056, 5056, 128, 1856] + - [32, 10700.0] + - - [5888, 1408, 1, 256, 5888, 5888, 256, 1408] + - [17, 11779.0] + - - [4288, 3584, 1, 3328, 4288, 4288, 3328, 3584] + - [29, 12345.0] + - - [1024, 2368, 1, 1280, 1024, 1024, 1280, 2368] + - [17, 10951.0] + - - [5888, 6784, 1, 3328, 5888, 5888, 3328, 6784] + - [23, 12631.0] + - - [704, 4288, 1, 1280, 704, 704, 1280, 4288] + - [26, 10176.0] + - - [6784, 448, 1, 3328, 6784, 6784, 3328, 448] + - [17, 10392.0] + - - [4288, 1024, 1, 128, 4288, 4288, 128, 1024] + - [22, 10101.0] + - - [1920, 2048, 1, 2048, 1920, 1920, 2048, 2048] + - [29, 12308.0] + - - [2880, 3072, 1, 3072, 2880, 2880, 3072, 3072] + - [29, 12106.0] + - - [3840, 4096, 1, 4096, 3840, 3840, 4096, 4096] + - [29, 12599.0] + - - [7680, 8192, 1, 8192, 7680, 7680, 8192, 8192] + - [35, 12412.0] + - - [2048, 2048, 1, 2048, 2048, 2048, 2048, 2048] + - [29, 11776.0] + - - [3072, 3072, 1, 3072, 3072, 3072, 3072, 3072] + - [29, 12441.0] + - - [4096, 4096, 1, 4096, 4096, 4096, 4096, 4096] + - [35, 12465.0] + - - [8192, 8192, 1, 8192, 8192, 8192, 8192, 8192] + - [31, 12324.0] + - - [1152, 1152, 1, 1152, 1152, 1152, 1152, 1152] + - [25, 9271.0] + - - [1536, 1536, 1, 1536, 1536, 1536, 1536, 1536] + - [31, 10759.0] + - - [1920, 1920, 1, 1920, 1920, 1920, 1920, 1920] + - [10, 11600.0] + - - [2304, 2304, 1, 2304, 2304, 2304, 2304, 2304] + - [29, 11908.0] + - - [2688, 2688, 1, 2688, 2688, 2688, 2688, 2688] + - [3, 12056.0] + - - [3456, 3456, 1, 3456, 3456, 3456, 3456, 3456] + - [29, 12424.0] + - - [3840, 3840, 1, 3840, 3840, 3840, 3840, 3840] + - [17, 12593.0] + - - [4224, 4224, 1, 4224, 4224, 4224, 4224, 4224] + - [23, 12509.0] + - - [4608, 4608, 1, 4608, 4608, 4608, 4608, 4608] + - [17, 12602.0] + - - [4992, 4992, 1, 4992, 4992, 4992, 4992, 4992] + - [17, 12551.0] + - - [5376, 5376, 1, 5376, 5376, 5376, 5376, 5376] + - [35, 12587.0] + - - [5760, 5760, 1, 5760, 5760, 5760, 5760, 5760] + - [29, 12620.0] + - - [6144, 6144, 1, 6144, 6144, 6144, 6144, 6144] + - [37, 12398.0] + - - [6528, 6528, 1, 6528, 6528, 6528, 6528, 6528] + - [17, 12495.0] + - - [6912, 6912, 1, 6912, 6912, 6912, 6912, 6912] + - [19, 12542.0] + - - [7296, 7296, 1, 7296, 7296, 7296, 7296, 7296] + - [17, 12369.0] + - - [7680, 7680, 1, 7680, 7680, 7680, 7680, 7680] + - [23, 12368.0] + - - [1856, 448, 1, 3328, 1856, 1856, 3328, 448] + - [50, 9325.0] + - - [128, 6784, 1, 3328, 128, 128, 3328, 6784] + - [68, 9788.0] + - - [2368, 448, 1, 128, 2368, 2368, 128, 448] + - [46, 7759.0] + - - [256, 4288, 1, 3328, 256, 256, 3328, 4288] + - [57, 9978.0] + - - [704, 1856, 1, 3328, 704, 704, 3328, 1856] + - [72, 9099.0] + - - [448, 1024, 1, 1280, 448, 448, 1280, 1024] + - [69, 8415.0] + - - [256, 1408, 1, 3328, 256, 256, 3328, 1408] + - [57, 7231.0] + - - [704, 1856, 1, 1280, 704, 704, 1280, 1856] + - [58, 8897.0] + - - [128, 5056, 1, 128, 128, 128, 128, 5056] + - [73, 6713.0] + - - [2368, 128, 1, 256, 2368, 2368, 256, 128] + - [49, 6100.0] + - - [64, 5056, 1, 256, 64, 64, 256, 5056] + - [39, 6257.0] + - - [256, 2944, 1, 256, 256, 256, 256, 2944] + - [71, 7404.0] + - - [256, 1856, 1, 1280, 256, 256, 1280, 1856] + - [43, 8646.0] + - - [128, 3584, 1, 1280, 128, 128, 1280, 3584] + - [57, 8498.0] + - - [4288, 256, 1, 256, 4288, 4288, 256, 256] + - [79, 7759.0] + - - [2944, 128, 1, 128, 2944, 2944, 128, 128] + - [47, 4393.0] + - - [5888, 64, 1, 3328, 5888, 5888, 3328, 64] + - [80, 5486.0] + - - [2944, 256, 1, 3328, 2944, 2944, 3328, 256] + - [53, 8929.0] + - - [1408, 448, 1, 1280, 1408, 1408, 1280, 448] + - [74, 9007.0] + - - [1408, 704, 1, 3328, 1408, 1408, 3328, 704] + - [59, 8870.0] + - - [6784, 64, 1, 256, 6784, 6784, 256, 64] + - [51, 5893.0] + - - [2944, 256, 1, 256, 2944, 2944, 256, 256] + - [77, 7566.0] + - - [704, 1408, 1, 3328, 704, 704, 3328, 1408] + - [77, 8738.0] + - - [2944, 256, 1, 128, 2944, 2944, 128, 256] + - [64, 6440.0] + - - [448, 2944, 1, 128, 448, 448, 128, 2944] + - [76, 7773.0] + - - [2368, 128, 1, 3328, 2368, 2368, 3328, 128] + - [61, 8632.0] + - - [2944, 128, 1, 256, 2944, 2944, 256, 128] + - [48, 5365.0] + - - [448, 1408, 1, 256, 448, 448, 256, 1408] + - [73, 7133.0] + - - [64, 5056, 1, 3328, 64, 64, 3328, 5056] + - [59, 8555.0] + - - [1024, 448, 1, 128, 1024, 1024, 128, 448] + - [69, 5010.0] + - - [256, 3584, 1, 3328, 256, 256, 3328, 3584] + - [41, 10393.0] + - - [5056, 64, 1, 1280, 5056, 5056, 1280, 64] + - [51, 7736.0] + - - [1024, 704, 1, 256, 1024, 1024, 256, 704] + - [48, 7382.0] + - - [128, 4288, 1, 128, 128, 128, 128, 4288] + - [40, 5121.0] + - - [3584, 256, 1, 128, 3584, 3584, 128, 256] + - [64, 6941.0] + - - [4288, 128, 1, 1280, 4288, 4288, 1280, 128] + - [65, 8576.0] + - - [5888, 64, 1, 256, 5888, 5888, 256, 64] + - [52, 5215.0] + - - [1856, 256, 1, 1280, 1856, 1856, 1280, 256] + - [64, 8477.0] + - - [64, 5888, 1, 3328, 64, 64, 3328, 5888] + - [42, 7328.0] + - - [704, 1024, 1, 1280, 704, 704, 1280, 1024] + - [64, 8544.0] + - - [448, 1856, 1, 128, 448, 448, 128, 1856] + - [76, 6320.0] + - - [1024, 704, 1, 1280, 1024, 1024, 1280, 704] + - [60, 8309.0] + - - [128, 5888, 1, 256, 128, 128, 256, 5888] + - [57, 7519.0] + - - [704, 704, 1, 3328, 704, 704, 3328, 704] + - [72, 8222.0] + - - [704, 1408, 1, 1280, 704, 704, 1280, 1408] + - [64, 8681.0] + - - [3584, 256, 1, 3328, 3584, 3584, 3328, 256] + - [54, 10191.0] + - - [704, 1856, 1, 128, 704, 704, 128, 1856] + - [76, 7500.0] + - - [128, 3584, 1, 3328, 128, 128, 3328, 3584] + - [40, 9150.0] + - - [128, 2944, 1, 1280, 128, 128, 1280, 2944] + - [40, 7091.0] + - - [3584, 128, 1, 256, 3584, 3584, 256, 128] + - [62, 6439.0] + - - [448, 1408, 1, 3328, 448, 448, 3328, 1408] + - [59, 9158.0] + - - [256, 3584, 1, 256, 256, 256, 256, 3584] + - [72, 8383.0] + - - [256, 2944, 1, 3328, 256, 256, 3328, 2944] + - [48, 9232.0] + - - [448, 2368, 1, 128, 448, 448, 128, 2368] + - [69, 6886.0] + - - [1408, 704, 1, 256, 1408, 1408, 256, 704] + - [51, 7570.0] + - - [448, 2944, 1, 3328, 448, 448, 3328, 2944] + - [41, 9167.0] + - - [64, 5888, 1, 256, 64, 64, 256, 5888] + - [73, 5336.0] + - - [6784, 128, 1, 3328, 6784, 6784, 3328, 128] + - [83, 9677.0] + - - [704, 704, 1, 256, 704, 704, 256, 704] + - [50, 6836.0] + - - [128, 4288, 1, 3328, 128, 128, 3328, 4288] + - [50, 9222.0] + - - [448, 704, 1, 1280, 448, 448, 1280, 704] + - [73, 7634.0] + - - [1024, 448, 1, 3328, 1024, 1024, 3328, 448] + - [74, 8814.0] + - - [1856, 704, 1, 1280, 1856, 1856, 1280, 704] + - [53, 8891.0] + - - [448, 1408, 1, 1280, 448, 448, 1280, 1408] + - [59, 8770.0] + - - [1024, 1024, 1, 1280, 1024, 1024, 1280, 1024] + - [60, 9030.0] + - - [448, 1024, 1, 128, 448, 448, 128, 1024] + - [62, 5133.0] + - - [448, 2368, 1, 3328, 448, 448, 3328, 2368] + - [58, 9138.0] + - - [5056, 64, 1, 128, 5056, 5056, 128, 64] + - [47, 4226.0] + - - [704, 1024, 1, 256, 704, 704, 256, 1024] + - [48, 7220.0] + - - [128, 6784, 1, 1280, 128, 128, 1280, 6784] + - [72, 9523.0] + - - [1856, 256, 1, 256, 1856, 1856, 256, 256] + - [62, 6720.0] + - - [256, 4288, 1, 1280, 256, 256, 1280, 4288] + - [53, 9375.0] + - - [256, 1856, 1, 128, 256, 256, 128, 1856] + - [40, 5243.0] + - - [448, 1408, 1, 128, 448, 448, 128, 1408] + - [62, 6107.0] + - - [6784, 128, 1, 256, 6784, 6784, 256, 128] + - [54, 8060.0] + - - [704, 448, 1, 256, 704, 704, 256, 448] + - [62, 5448.0] + - - [704, 1408, 1, 128, 704, 704, 128, 1408] + - [55, 7064.0] + - - [2944, 448, 1, 128, 2944, 2944, 128, 448] + - [71, 7758.0] + - - [128, 2944, 1, 128, 128, 128, 128, 2944] + - [56, 4458.0] + - - [1024, 704, 1, 3328, 1024, 1024, 3328, 704] + - [74, 8605.0] + - - [128, 4288, 1, 256, 128, 128, 256, 4288] + - [57, 6445.0] + - - [704, 448, 1, 3328, 704, 704, 3328, 448] + - [42, 8225.0] + - - [1024, 1024, 1, 3328, 1024, 1024, 3328, 1024] + - [53, 9269.0] + - - [448, 2368, 1, 1280, 448, 448, 1280, 2368] + - [41, 8874.0] + - - [64, 6784, 1, 3328, 64, 64, 3328, 6784] + - [73, 8334.0] + - - [2944, 256, 1, 1280, 2944, 2944, 1280, 256] + - [53, 8703.0] + - - [256, 2368, 1, 128, 256, 256, 128, 2368] + - [57, 5535.0] + - - [1856, 704, 1, 256, 1856, 1856, 256, 704] + - [62, 7826.0] + - - [1408, 448, 1, 3328, 1408, 1408, 3328, 448] + - [74, 9410.0] + - - [2368, 256, 1, 256, 2368, 2368, 256, 256] + - [62, 7028.0] + - - [1856, 448, 1, 1280, 1856, 1856, 1280, 448] + - [72, 8909.0] + - - [128, 5888, 1, 128, 128, 128, 128, 5888] + - [69, 6184.0] + - - [1024, 1024, 1, 256, 1024, 1024, 256, 1024] + - [48, 8249.0] + - - [704, 1856, 1, 256, 704, 704, 256, 1856] + - [72, 7915.0] + - - [64, 6784, 1, 256, 64, 64, 256, 6784] + - [42, 5937.0] + - - [256, 2368, 1, 1280, 256, 256, 1280, 2368] + - [41, 9320.0] + - - [2944, 448, 1, 256, 2944, 2944, 256, 448] + - [74, 7959.0] + - - [1856, 448, 1, 128, 1856, 1856, 128, 448] + - [68, 6474.0] + - - [2368, 128, 1, 1280, 2368, 2368, 1280, 128] + - [50, 7722.0] + - - [2368, 256, 1, 128, 2368, 2368, 128, 256] + - [82, 6129.0] + - - [64, 5056, 1, 1280, 64, 64, 1280, 5056] + - [73, 7862.0] + - - [2368, 256, 1, 1280, 2368, 2368, 1280, 256] + - [50, 9463.0] + - - [2368, 448, 1, 1280, 2368, 2368, 1280, 448] + - [74, 8990.0] + - - [128, 3584, 1, 256, 128, 128, 256, 3584] + - [77, 7205.0] + - - [704, 448, 1, 1280, 704, 704, 1280, 448] + - [73, 7956.0] + - - [128, 5056, 1, 256, 128, 128, 256, 5056] + - [50, 8276.0] + - - [4288, 256, 1, 1280, 4288, 4288, 1280, 256] + - [60, 9257.0] + - - [4288, 128, 1, 3328, 4288, 4288, 3328, 128] + - [65, 8909.0] + - - [1408, 256, 1, 128, 1408, 1408, 128, 256] + - [56, 4233.0] + - - [256, 1408, 1, 1280, 256, 256, 1280, 1408] + - [71, 6829.0] + - - [128, 2368, 1, 256, 128, 128, 256, 2368] + - [77, 5449.0] + - - [6784, 64, 1, 3328, 6784, 6784, 3328, 64] + - [53, 8152.0] + - - [128, 2944, 1, 3328, 128, 128, 3328, 2944] + - [48, 7557.0] + - - [2944, 448, 1, 3328, 2944, 2944, 3328, 448] + - [81, 9182.0] + - - [256, 4288, 1, 256, 256, 256, 256, 4288] + - [60, 7997.0] + - - [5888, 128, 1, 256, 5888, 5888, 256, 128] + - [77, 7259.0] + - - [2368, 448, 1, 3328, 2368, 2368, 3328, 448] + - [60, 9320.0] + - - [5056, 64, 1, 256, 5056, 5056, 256, 64] + - [47, 5344.0] + - - [1024, 704, 1, 128, 1024, 1024, 128, 704] + - [47, 6462.0] + - - [128, 5056, 1, 3328, 128, 128, 3328, 5056] + - [41, 10635.0] + - - [704, 1024, 1, 128, 704, 704, 128, 1024] + - [69, 6277.0] + - - [4288, 128, 1, 256, 4288, 4288, 256, 128] + - [50, 6307.0] + - - [1408, 448, 1, 128, 1408, 1408, 128, 448] + - [71, 6034.0] + - - [128, 5888, 1, 1280, 128, 128, 1280, 5888] + - [57, 9009.0] + - - [704, 448, 1, 128, 704, 704, 128, 448] + - [70, 4295.0] + - - [3584, 256, 1, 256, 3584, 3584, 256, 256] + - [54, 8265.0] + - - [128, 2944, 1, 256, 128, 128, 256, 2944] + - [40, 5494.0] + - - [128, 6784, 1, 128, 128, 128, 128, 6784] + - [61, 6515.0] + - - [448, 1856, 1, 256, 448, 448, 256, 1856] + - [65, 7485.0] + - - [3584, 128, 1, 3328, 3584, 3584, 3328, 128] + - [60, 8742.0] + - - [1024, 448, 1, 1280, 1024, 1024, 1280, 448] + - [74, 8327.0] + - - [5888, 128, 1, 3328, 5888, 5888, 3328, 128] + - [54, 6584.0] + - - [1408, 704, 1, 1280, 1408, 1408, 1280, 704] + - [67, 8593.0] + - - [448, 2944, 1, 256, 448, 448, 256, 2944] + - [62, 8012.0] + - - [448, 2368, 1, 256, 448, 448, 256, 2368] + - [58, 7711.0] + - - [128, 2368, 1, 3328, 128, 128, 3328, 2368] + - [54, 8821.0] + - - [5056, 128, 1, 1280, 5056, 5056, 1280, 128] + - [79, 9921.0] + - - [5056, 64, 1, 3328, 5056, 5056, 3328, 64] + - [51, 8582.0] + - - [64, 5888, 1, 128, 64, 64, 128, 5888] + - [52, 4491.0] + - - [5056, 128, 1, 3328, 5056, 5056, 3328, 128] + - [50, 10504.0] + - - [448, 704, 1, 256, 448, 448, 256, 704] + - [47, 5347.0] + - - [2944, 128, 1, 3328, 2944, 2944, 3328, 128] + - [67, 7285.0] + - - [128, 5056, 1, 1280, 128, 128, 1280, 5056] + - [41, 9822.0] + - - [704, 704, 1, 128, 704, 704, 128, 704] + - [38, 4948.0] + - - [64, 6784, 1, 1280, 64, 64, 1280, 6784] + - [73, 7827.0] + - - [2368, 128, 1, 128, 2368, 2368, 128, 128] + - [78, 4119.0] + - - [5056, 128, 1, 128, 5056, 5056, 128, 128] + - [62, 6109.0] + - - [1024, 1024, 1, 1024, 1024, 1024, 1024, 1024] + - [53, 8848.0] + - - [448, 1024, 1, 3328, 448, 448, 3328, 1024] + - [59, 8697.0] + - - [256, 2368, 1, 3328, 256, 256, 3328, 2368] + - [41, 9958.0] + - - [256, 3584, 1, 128, 256, 256, 128, 3584] + - [44, 7871.0] + - - [4288, 256, 1, 128, 4288, 4288, 128, 256] + - [55, 8516.0] + - - [256, 1856, 1, 256, 256, 256, 256, 1856] + - [71, 7462.0] + - - [256, 2944, 1, 128, 256, 256, 128, 2944] + - [55, 6950.0] + - - [1408, 256, 1, 3328, 1408, 1408, 3328, 256] + - [55, 7128.0] + - - [2368, 448, 1, 256, 2368, 2368, 256, 448] + - [64, 7777.0] + - - [4288, 256, 1, 3328, 4288, 4288, 3328, 256] + - [53, 9620.0] + - - [1856, 704, 1, 128, 1856, 1856, 128, 704] + - [69, 7845.0] + - - [4288, 128, 1, 128, 4288, 4288, 128, 128] + - [76, 6141.0] + - - [1408, 448, 1, 256, 1408, 1408, 256, 448] + - [74, 8164.0] + - - [6784, 64, 1, 1280, 6784, 6784, 1280, 64] + - [53, 7797.0] + - - [3584, 128, 1, 128, 3584, 3584, 128, 128] + - [46, 5036.0] + - - [256, 2368, 1, 256, 256, 256, 256, 2368] + - [41, 6891.0] + - - [2944, 448, 1, 1280, 2944, 2944, 1280, 448] + - [60, 9035.0] + - - [448, 1856, 1, 1280, 448, 448, 1280, 1856] + - [58, 9094.0] + - - [1856, 256, 1, 128, 1856, 1856, 128, 256] + - [55, 5288.0] + - - [5056, 128, 1, 256, 5056, 5056, 256, 128] + - [65, 7476.0] + - - [448, 1024, 1, 256, 448, 448, 256, 1024] + - [73, 7144.0] + - - [64, 6784, 1, 128, 64, 64, 128, 6784] + - [73, 5513.0] + - - [5888, 64, 1, 1280, 5888, 5888, 1280, 64] + - [66, 7031.0] + - - [128, 3584, 1, 128, 128, 128, 128, 3584] + - [71, 6425.0] + - - [1408, 256, 1, 256, 1408, 1408, 256, 256] + - [64, 6119.0] + - - [128, 5888, 1, 3328, 128, 128, 3328, 5888] + - [43, 9071.0] + - - [1408, 256, 1, 1280, 1408, 1408, 1280, 256] + - [55, 6789.0] + - - [1024, 1024, 1, 128, 1024, 1024, 128, 1024] + - [58, 7239.0] + - - [64, 5056, 1, 128, 64, 64, 128, 5056] + - [45, 5436.0] + - - [5888, 64, 1, 128, 5888, 5888, 128, 64] + - [63, 5395.0] + - - [448, 704, 1, 128, 448, 448, 128, 704] + - [56, 5515.0] + - - [1408, 704, 1, 128, 1408, 1408, 128, 704] + - [71, 7018.0] + - - [2368, 256, 1, 3328, 2368, 2368, 3328, 256] + - [65, 9775.0] + - - [5888, 128, 1, 1280, 5888, 5888, 1280, 128] + - [53, 8569.0] + - - [256, 3584, 1, 1280, 256, 256, 1280, 3584] + - [41, 10098.0] + - - [256, 1408, 1, 128, 256, 256, 128, 1408] + - [75, 4479.0] + - - [256, 4288, 1, 128, 256, 256, 128, 4288] + - [69, 7235.0] + - - [5888, 128, 1, 128, 5888, 5888, 128, 128] + - [38, 6297.0] + - - [1856, 256, 1, 3328, 1856, 1856, 3328, 256] + - [74, 9160.0] + - - [64, 5888, 1, 1280, 64, 64, 1280, 5888] + - [66, 7144.0] + - - [6784, 64, 1, 128, 6784, 6784, 128, 64] + - [47, 5417.0] + - - [704, 704, 1, 1280, 704, 704, 1280, 704] + - [50, 7649.0] + - - [128, 2368, 1, 1280, 128, 128, 1280, 2368] + - [53, 7816.0] + - - [3584, 256, 1, 1280, 3584, 3584, 1280, 256] + - [54, 9939.0] + - - [3584, 128, 1, 1280, 3584, 3584, 1280, 128] + - [81, 8527.0] + - - [448, 1856, 1, 3328, 448, 448, 3328, 1856] + - [72, 9416.0] + - - [1024, 448, 1, 256, 1024, 1024, 256, 448] + - [48, 6510.0] + - - [2944, 128, 1, 1280, 2944, 2944, 1280, 128] + - [74, 6924.0] + - - [128, 2368, 1, 128, 128, 128, 128, 2368] + - [39, 4067.0] + - - [256, 2944, 1, 1280, 256, 256, 1280, 2944] + - [55, 8914.0] + - - [704, 1024, 1, 3328, 704, 704, 3328, 1024] + - [64, 8926.0] + - - [128, 6784, 1, 256, 128, 128, 256, 6784] + - [54, 8048.0] + - - [256, 1856, 1, 3328, 256, 256, 3328, 1856] + - [43, 9126.0] + - - [6784, 128, 1, 128, 6784, 6784, 128, 128] + - [69, 6753.0] + - - [704, 1408, 1, 256, 704, 704, 256, 1408] + - [77, 7905.0] + - - [256, 1408, 1, 256, 256, 256, 256, 1408] + - [39, 5870.0] + - - [448, 2944, 1, 1280, 448, 448, 1280, 2944] + - [72, 8967.0] + - - [6784, 128, 1, 1280, 6784, 6784, 1280, 128] + - [54, 9413.0] + - - [1856, 448, 1, 256, 1856, 1856, 256, 448] + - [58, 7275.0] + - - [128, 4288, 1, 1280, 128, 128, 1280, 4288] + - [72, 8536.0] + - - [448, 704, 1, 3328, 448, 448, 3328, 704] + - [42, 8214.0] + - - [1856, 704, 1, 3328, 1856, 1856, 3328, 704] + - [74, 9052.0] + - - [960, 1024, 1, 1024, 960, 960, 1024, 1024] + - [51, 8369.0] + - - [768, 768, 1, 768, 768, 768, 768, 768] + - [41, 8701.0] + - - [1024, 128, 1, 128, 1024, 1024, 128, 128] + - [106, 2769.0] + - - [2368, 64, 1, 3328, 2368, 2368, 3328, 64] + - [97, 4505.0] + - - [1408, 64, 1, 128, 1408, 1408, 128, 64] + - [105, 1786.0] + - - [1408, 64, 1, 1280, 1408, 1408, 1280, 64] + - [93, 2954.0] + - - [2944, 64, 1, 256, 2944, 2944, 256, 64] + - [133, 2959.0] + - - [1024, 256, 1, 3328, 1024, 1024, 3328, 256] + - [115, 3986.0] + - - [1856, 64, 1, 1280, 1856, 1856, 1280, 64] + - [95, 3561.0] + - - [704, 128, 1, 1280, 704, 704, 1280, 128] + - [119, 2997.0] + - - [4288, 64, 1, 3328, 4288, 4288, 3328, 64] + - [115, 4129.0] + - - [4288, 64, 1, 256, 4288, 4288, 256, 64] + - [115, 3346.0] + - - [64, 3584, 1, 3328, 64, 64, 3328, 3584] + - [106, 4541.0] + - - [704, 256, 1, 128, 704, 704, 128, 256] + - [129, 2408.0] + - - [128, 1408, 1, 128, 128, 128, 128, 1408] + - [122, 2364.0] + - - [4288, 64, 1, 1280, 4288, 4288, 1280, 64] + - [115, 4002.0] + - - [1024, 256, 1, 256, 1024, 1024, 256, 256] + - [87, 3286.0] + - - [448, 448, 1, 256, 448, 448, 256, 448] + - [115, 3172.0] + - - [128, 1024, 1, 3328, 128, 128, 3328, 1024] + - [122, 3794.0] + - - [64, 1856, 1, 1280, 64, 64, 1280, 1856] + - [86, 3596.0] + - - [256, 1024, 1, 256, 256, 256, 256, 1024] + - [122, 3306.0] + - - [1024, 128, 1, 1280, 1024, 1024, 1280, 128] + - [97, 3596.0] + - - [448, 256, 1, 3328, 448, 448, 3328, 256] + - [120, 3766.0] + - - [128, 1024, 1, 128, 128, 128, 128, 1024] + - [122, 2046.0] + - - [128, 704, 1, 1280, 128, 128, 1280, 704] + - [119, 2993.0] + - - [1856, 128, 1, 3328, 1856, 1856, 3328, 128] + - [133, 4689.0] + - - [64, 2944, 1, 128, 64, 64, 128, 2944] + - [89, 2599.0] + - - [448, 448, 1, 3328, 448, 448, 3328, 448] + - [106, 4021.0] + - - [1408, 128, 1, 1280, 1408, 1408, 1280, 128] + - [119, 3479.0] + - - [128, 1856, 1, 1280, 128, 128, 1280, 1856] + - [122, 4467.0] + - - [256, 448, 1, 256, 256, 256, 256, 448] + - [114, 2497.0] + - - [128, 1856, 1, 128, 128, 128, 128, 1856] + - [122, 2964.0] + - - [64, 1408, 1, 3328, 64, 64, 3328, 1408] + - [85, 3171.0] + - - [128, 1408, 1, 256, 128, 128, 256, 1408] + - [122, 2859.0] + - - [4288, 64, 1, 128, 4288, 4288, 128, 64] + - [133, 3584.0] + - - [256, 448, 1, 3328, 256, 256, 3328, 448] + - [104, 3758.0] + - - [64, 2368, 1, 1280, 64, 64, 1280, 2368] + - [87, 4142.0] + - - [2368, 64, 1, 256, 2368, 2368, 256, 64] + - [133, 3154.0] + - - [1408, 128, 1, 128, 1408, 1408, 128, 128] + - [107, 2428.0] + - - [1024, 256, 1, 128, 1024, 1024, 128, 256] + - [133, 2764.0] + - - [2944, 64, 1, 128, 2944, 2944, 128, 64] + - [97, 2355.0] + - - [1856, 64, 1, 256, 1856, 1856, 256, 64] + - [95, 2603.0] + - - [704, 128, 1, 256, 704, 704, 256, 128] + - [112, 2038.0] + - - [448, 256, 1, 1280, 448, 448, 1280, 256] + - [120, 3502.0] + - - [1856, 128, 1, 1280, 1856, 1856, 1280, 128] + - [133, 4501.0] + - - [64, 3584, 1, 256, 64, 64, 256, 3584] + - [87, 3529.0] + - - [3584, 64, 1, 128, 3584, 3584, 128, 64] + - [97, 2807.0] + - - [256, 1024, 1, 1280, 256, 256, 1280, 1024] + - [106, 3971.0] + - - [3584, 64, 1, 1280, 3584, 3584, 1280, 64] + - [97, 4316.0] + - - [128, 1856, 1, 3328, 128, 128, 3328, 1856] + - [87, 4641.0] + - - [64, 2944, 1, 3328, 64, 64, 3328, 2944] + - [120, 3710.0] + - - [64, 4288, 1, 3328, 64, 64, 3328, 4288] + - [97, 4124.0] + - - [64, 1856, 1, 256, 64, 64, 256, 1856] + - [131, 2644.0] + - - [256, 704, 1, 256, 256, 256, 256, 704] + - [115, 2891.0] + - - [2368, 64, 1, 128, 2368, 2368, 128, 64] + - [115, 2389.0] + - - [64, 1408, 1, 128, 64, 64, 128, 1408] + - [85, 1522.0] + - - [704, 256, 1, 3328, 704, 704, 3328, 256] + - [103, 3632.0] + - - [64, 2944, 1, 256, 64, 64, 256, 2944] + - [122, 2963.0] + - - [448, 256, 1, 128, 448, 448, 128, 256] + - [85, 1892.0] + - - [704, 128, 1, 3328, 704, 704, 3328, 128] + - [103, 3250.0] + - - [128, 704, 1, 128, 128, 128, 128, 704] + - [112, 1530.0] + - - [256, 448, 1, 1280, 256, 256, 1280, 448] + - [104, 3525.0] + - - [704, 256, 1, 1280, 704, 704, 1280, 256] + - [103, 3540.0] + - - [64, 2368, 1, 3328, 64, 64, 3328, 2368] + - [122, 4476.0] + - - [1856, 64, 1, 128, 1856, 1856, 128, 64] + - [95, 2715.0] + - - [704, 128, 1, 128, 704, 704, 128, 128] + - [112, 2160.0] + - - [256, 704, 1, 3328, 256, 256, 3328, 704] + - [103, 3634.0] + - - [256, 448, 1, 128, 256, 256, 128, 448] + - [95, 1849.0] + - - [64, 3584, 1, 128, 64, 64, 128, 3584] + - [106, 2856.0] + - - [1024, 128, 1, 256, 1024, 1024, 256, 128] + - [95, 2923.0] + - - [2944, 64, 1, 1280, 2944, 2944, 1280, 64] + - [106, 3620.0] + - - [128, 1408, 1, 3328, 128, 128, 3328, 1408] + - [85, 3580.0] + - - [1408, 64, 1, 256, 1408, 1408, 256, 64] + - [95, 2067.0] + - - [64, 1856, 1, 128, 64, 64, 128, 1856] + - [92, 1954.0] + - - [64, 2368, 1, 256, 64, 64, 256, 2368] + - [122, 3060.0] + - - [1024, 128, 1, 3328, 1024, 1024, 3328, 128] + - [122, 3886.0] + - - [1856, 128, 1, 128, 1856, 1856, 128, 128] + - [97, 2993.0] + - - [2368, 64, 1, 1280, 2368, 2368, 1280, 64] + - [97, 4143.0] + - - [128, 1024, 1, 1280, 128, 128, 1280, 1024] + - [115, 3743.0] + - - [64, 4288, 1, 1280, 64, 64, 1280, 4288] + - [106, 3989.0] + - - [1408, 64, 1, 3328, 1408, 1408, 3328, 64] + - [112, 3234.0] + - - [64, 2944, 1, 1280, 64, 64, 1280, 2944] + - [122, 3577.0] + - - [256, 704, 1, 128, 256, 256, 128, 704] + - [86, 3012.0] + - - [256, 1024, 1, 128, 256, 256, 128, 1024] + - [97, 3495.0] + - - [64, 1408, 1, 1280, 64, 64, 1280, 1408] + - [112, 3119.0] + - - [448, 448, 1, 1280, 448, 448, 1280, 448] + - [133, 3997.0] + - - [1024, 256, 1, 1280, 1024, 1024, 1280, 256] + - [122, 3886.0] + - - [128, 1024, 1, 256, 128, 128, 256, 1024] + - [133, 3349.0] + - - [3584, 64, 1, 3328, 3584, 3584, 3328, 64] + - [97, 4593.0] + - - [1408, 128, 1, 256, 1408, 1408, 256, 128] + - [115, 3334.0] + - - [256, 1024, 1, 3328, 256, 256, 3328, 1024] + - [97, 3992.0] + - - [1856, 64, 1, 3328, 1856, 1856, 3328, 64] + - [114, 3905.0] + - - [448, 256, 1, 256, 448, 448, 256, 256] + - [114, 3137.0] + - - [128, 704, 1, 256, 128, 128, 256, 704] + - [93, 2639.0] + - - [64, 3584, 1, 1280, 64, 64, 1280, 3584] + - [122, 4457.0] + - - [3584, 64, 1, 256, 3584, 3584, 256, 64] + - [97, 4005.0] + - - [64, 1856, 1, 3328, 64, 64, 3328, 1856] + - [86, 3841.0] + - - [1408, 128, 1, 3328, 1408, 1408, 3328, 128] + - [103, 3644.0] + - - [128, 704, 1, 3328, 128, 128, 3328, 704] + - [112, 3267.0] + - - [128, 1856, 1, 256, 128, 128, 256, 1856] + - [133, 3668.0] + - - [64, 4288, 1, 256, 64, 64, 256, 4288] + - [133, 3404.0] + - - [256, 704, 1, 1280, 256, 256, 1280, 704] + - [119, 3538.0] + - - [64, 2368, 1, 128, 64, 64, 128, 2368] + - [97, 2443.0] + - - [64, 4288, 1, 128, 64, 64, 128, 4288] + - [97, 2962.0] + - - [1856, 128, 1, 256, 1856, 1856, 256, 128] + - [115, 3722.0] + - - [64, 1408, 1, 256, 64, 64, 256, 1408] + - [93, 2105.0] + - - [2944, 64, 1, 3328, 2944, 2944, 3328, 64] + - [133, 3755.0] + - - [128, 1408, 1, 1280, 128, 128, 1280, 1408] + - [97, 3503.0] + - - [448, 448, 1, 128, 448, 448, 128, 448] + - [115, 2590.0] + - - [704, 256, 1, 256, 704, 704, 256, 256] + - [93, 2855.0] + - - [512, 512, 1, 512, 512, 512, 512, 512] + - [97, 3673.0] + - - [384, 384, 1, 384, 384, 384, 384, 384] + - [122, 3513.0] + - - [3584, 4, 1, 1280, 3584, 3584, 1280, 4] + - [147, 640.0] + - - [2944, 4, 1, 256, 2944, 2944, 256, 4] + - [139, 437.0] + - - [2368, 4, 1, 1280, 2368, 2368, 1280, 4] + - [139, 534.0] + - - [6784, 4, 1, 1280, 6784, 6784, 1280, 4] + - [147, 624.0] + - - [1856, 4, 1, 1280, 1856, 1856, 1280, 4] + - [147, 403.0] + - - [2944, 4, 1, 128, 2944, 2944, 128, 4] + - [142, 239.0] + - - [3584, 4, 1, 128, 3584, 3584, 128, 4] + - [147, 276.0] + - - [4288, 4, 1, 256, 4288, 4288, 256, 4] + - [147, 394.0] + - - [3584, 4, 1, 3328, 3584, 3584, 3328, 4] + - [142, 656.0] + - - [5888, 4, 1, 128, 5888, 5888, 128, 4] + - [139, 369.0] + - - [2368, 4, 1, 256, 2368, 2368, 256, 4] + - [139, 299.0] + - - [1408, 4, 1, 256, 1408, 1408, 256, 4] + - [139, 188.0] + - - [5056, 4, 1, 1280, 5056, 5056, 1280, 4] + - [143, 621.0] + - - [1408, 4, 1, 3328, 1408, 1408, 3328, 4] + - [113, 310.0] + - - [6784, 4, 1, 128, 6784, 6784, 128, 4] + - [142, 405.0] + - - [5888, 4, 1, 3328, 5888, 5888, 3328, 4] + - [145, 400.0] + - - [5056, 4, 1, 128, 5056, 5056, 128, 4] + - [142, 343.0] + - - [5888, 4, 1, 1280, 5888, 5888, 1280, 4] + - [139, 592.0] + - - [2944, 4, 1, 3328, 2944, 2944, 3328, 4] + - [139, 570.0] + - - [2368, 4, 1, 128, 2368, 2368, 128, 4] + - [139, 192.0] + - - [1856, 4, 1, 128, 1856, 1856, 128, 4] + - [139, 157.0] + - - [1408, 4, 1, 1280, 1408, 1408, 1280, 4] + - [139, 324.0] + - - [6784, 4, 1, 256, 6784, 6784, 256, 4] + - [140, 452.0] + - - [4288, 4, 1, 128, 4288, 4288, 128, 4] + - [146, 376.0] + - - [1856, 4, 1, 3328, 1856, 1856, 3328, 4] + - [130, 417.0] + - - [3584, 4, 1, 256, 3584, 3584, 256, 4] + - [144, 436.0] + - - [2368, 4, 1, 3328, 2368, 2368, 3328, 4] + - [139, 518.0] + - - [6784, 4, 1, 3328, 6784, 6784, 3328, 4] + - [141, 551.0] + - - [4288, 4, 1, 1280, 4288, 4288, 1280, 4] + - [147, 615.0] + - - [1856, 4, 1, 256, 1856, 1856, 256, 4] + - [147, 335.0] + - - [1408, 4, 1, 128, 1408, 1408, 128, 4] + - [108, 167.0] + - - [5056, 4, 1, 256, 5056, 5056, 256, 4] + - [148, 481.0] + - - [4288, 4, 1, 3328, 4288, 4288, 3328, 4] + - [142, 637.0] + - - [2944, 4, 1, 1280, 2944, 2944, 1280, 4] + - [142, 552.0] + - - [5888, 4, 1, 256, 5888, 5888, 256, 4] + - [140, 502.0] + - - [5056, 4, 1, 3328, 5056, 5056, 3328, 4] + - [143, 711.0] + - - [4, 1856, 1, 3328, 4, 4, 3328, 1856] + - [155, 706.0] + - - [4, 2944, 1, 1280, 4, 4, 1280, 2944] + - [155, 766.0] + - - [4, 1408, 1, 128, 4, 4, 128, 1408] + - [111, 215.0] + - - [4, 2368, 1, 1280, 4, 4, 1280, 2368] + - [155, 748.0] + - - [4, 3584, 1, 128, 4, 4, 128, 3584] + - [150, 485.0] + - - [4, 5888, 1, 3328, 4, 4, 3328, 5888] + - [153, 535.0] + - - [4, 1408, 1, 3328, 4, 4, 3328, 1408] + - [158, 545.0] + - - [4, 6784, 1, 3328, 4, 4, 3328, 6784] + - [152, 910.0] + - - [4, 4288, 1, 128, 4, 4, 128, 4288] + - [156, 543.0] + - - [4, 5056, 1, 3328, 4, 4, 3328, 5056] + - [153, 971.0] + - - [4, 6784, 1, 1280, 4, 4, 1280, 6784] + - [154, 875.0] + - - [4, 2944, 1, 3328, 4, 4, 3328, 2944] + - [155, 825.0] + - - [4, 5056, 1, 256, 4, 4, 256, 5056] + - [160, 591.0] + - - [4, 5056, 1, 1280, 4, 4, 1280, 5056] + - [149, 893.0] + - - [4, 2368, 1, 3328, 4, 4, 3328, 2368] + - [155, 784.0] + - - [4, 1856, 1, 256, 4, 4, 256, 1856] + - [123, 249.0] + - - [4, 2368, 1, 256, 4, 4, 256, 2368] + - [150, 305.0] + - - [4, 2944, 1, 256, 4, 4, 256, 2944] + - [163, 376.0] + - - [4, 4288, 1, 1280, 4, 4, 1280, 4288] + - [157, 798.0] + - - [4, 6784, 1, 128, 4, 4, 128, 6784] + - [156, 461.0] + - - [4, 3584, 1, 1280, 4, 4, 1280, 3584] + - [150, 786.0] + - - [4, 5888, 1, 256, 4, 4, 256, 5888] + - [149, 616.0] + - - [4, 6784, 1, 256, 4, 4, 256, 6784] + - [162, 774.0] + - - [4, 1408, 1, 1280, 4, 4, 1280, 1408] + - [151, 423.0] + - - [4, 3584, 1, 256, 4, 4, 256, 3584] + - [150, 455.0] + - - [4, 1408, 1, 256, 4, 4, 256, 1408] + - [116, 195.0] + - - [4, 4288, 1, 3328, 4, 4, 3328, 4288] + - [161, 873.0] + - - [4, 5888, 1, 1280, 4, 4, 1280, 5888] + - [156, 890.0] + - - [4, 1856, 1, 1280, 4, 4, 1280, 1856] + - [151, 555.0] + - - [4, 1856, 1, 128, 4, 4, 128, 1856] + - [164, 219.0] + - - [4, 2944, 1, 128, 4, 4, 128, 2944] + - [159, 260.0] + - - [4, 5056, 1, 128, 4, 4, 128, 5056] + - [158, 533.0] + - - [4, 4288, 1, 256, 4, 4, 256, 4288] + - [160, 699.0] + - - [4, 3584, 1, 3328, 4, 4, 3328, 3584] + - [154, 916.0] + - - [4, 5888, 1, 128, 4, 4, 128, 5888] + - [153, 691.0] + - - [4, 2368, 1, 128, 4, 4, 128, 2368] + - [163, 317.0] + - - [4, 704, 1, 1280, 4, 4, 1280, 704] + - [88, 217.0] + - - [128, 64, 1, 256, 128, 128, 256, 64] + - [130, 502.0] + - - [64, 4, 1, 256, 64, 64, 256, 4] + - [98, 14.0] + - - [64, 704, 1, 128, 64, 64, 128, 704] + - [84, 952.0] + - - [448, 64, 1, 1280, 448, 448, 1280, 64] + - [92, 1638.0] + - - [128, 4, 1, 1280, 128, 128, 1280, 4] + - [91, 39.0] + - - [64, 1024, 1, 1280, 64, 64, 1280, 1024] + - [129, 2727.0] + - - [64, 704, 1, 1280, 64, 64, 1280, 704] + - [118, 1991.0] + - - [1024, 64, 1, 128, 1024, 1024, 128, 64] + - [114, 1216.0] + - - [64, 1024, 1, 3328, 64, 64, 3328, 1024] + - [126, 3117.0] + - - [1024, 64, 1, 1280, 1024, 1024, 1280, 64] + - [93, 2792.0] + - - [4, 704, 1, 256, 4, 4, 256, 704] + - [134, 101.0] + - - [704, 4, 1, 1280, 704, 704, 1280, 4] + - [98, 204.0] + - - [64, 448, 1, 256, 64, 64, 256, 448] + - [128, 1046.0] + - - [64, 1024, 1, 128, 64, 64, 128, 1024] + - [97, 1192.0] + - - [4, 64, 1, 1280, 4, 4, 1280, 64] + - [91, 20.0] + - - [128, 256, 1, 3328, 128, 128, 3328, 256] + - [121, 1864.0] + - - [64, 448, 1, 1280, 64, 64, 1280, 448] + - [134, 1625.0] + - - [448, 4, 1, 256, 448, 448, 256, 4] + - [111, 65.0] + - - [448, 4, 1, 1280, 448, 448, 1280, 4] + - [134, 137.0] + - - [128, 4, 1, 128, 128, 128, 128, 4] + - [84, 11.0] + - - [256, 4, 1, 128, 256, 256, 128, 4] + - [96, 23.0] + - - [704, 64, 1, 3328, 704, 704, 3328, 64] + - [93, 2162.0] + - - [64, 128, 1, 256, 64, 64, 256, 128] + - [116, 312.0] + - - [704, 64, 1, 128, 704, 704, 128, 64] + - [134, 901.0] + - - [1024, 4, 1, 256, 1024, 1024, 256, 4] + - [134, 148.0] + - - [256, 256, 1, 128, 256, 256, 128, 256] + - [95, 1198.0] + - - [64, 256, 1, 128, 64, 64, 128, 256] + - [100, 624.0] + - - [704, 64, 1, 1280, 704, 704, 1280, 64] + - [118, 1954.0] + - - [128, 448, 1, 256, 128, 128, 256, 448] + - [129, 1572.0] + - - [128, 256, 1, 1280, 128, 128, 1280, 256] + - [132, 1704.0] + - - [448, 64, 1, 3328, 448, 448, 3328, 64] + - [123, 1804.0] + - - [256, 128, 1, 128, 256, 256, 128, 128] + - [103, 900.0] + - - [64, 128, 1, 3328, 64, 64, 3328, 128] + - [101, 729.0] + - - [128, 128, 1, 3328, 128, 128, 3328, 128] + - [90, 1301.0] + - - [256, 128, 1, 256, 256, 256, 256, 128] + - [111, 1081.0] + - - [64, 448, 1, 3328, 64, 64, 3328, 448] + - [88, 1803.0] + - - [256, 256, 1, 3328, 256, 256, 3328, 256] + - [85, 3121.0] + - - [1024, 4, 1, 3328, 1024, 1024, 3328, 4] + - [116, 310.0] + - - [4, 4, 1, 256, 4, 4, 256, 4] + - [84, 1.0] + - - [256, 64, 1, 256, 256, 256, 256, 64] + - [117, 587.0] + - - [256, 128, 1, 1280, 256, 256, 1280, 128] + - [92, 1729.0] + - - [128, 64, 1, 1280, 128, 128, 1280, 64] + - [101, 611.0] + - - [4, 448, 1, 3328, 4, 4, 3328, 448] + - [91, 164.0] + - - [64, 1024, 1, 256, 64, 64, 256, 1024] + - [129, 1708.0] + - - [256, 4, 1, 1280, 256, 256, 1280, 4] + - [110, 78.0] + - - [64, 704, 1, 256, 64, 64, 256, 704] + - [118, 1293.0] + - - [4, 704, 1, 128, 4, 4, 128, 704] + - [92, 62.0] + - - [448, 128, 1, 256, 448, 448, 256, 128] + - [99, 1803.0] + - - [448, 64, 1, 128, 448, 448, 128, 64] + - [128, 642.0] + - - [4, 448, 1, 1280, 4, 4, 1280, 448] + - [101, 129.0] + - - [256, 256, 1, 256, 256, 256, 256, 256] + - [133, 1820.0] + - - [256, 64, 1, 128, 256, 256, 128, 64] + - [84, 576.0] + - - [4, 1024, 1, 3328, 4, 4, 3328, 1024] + - [123, 309.0] + - - [704, 4, 1, 128, 704, 704, 128, 4] + - [92, 62.0] + - - [256, 4, 1, 256, 256, 256, 256, 4] + - [111, 37.0] + - - [256, 4, 1, 3328, 256, 256, 3328, 4] + - [91, 97.0] + - - [4, 256, 1, 256, 4, 4, 256, 256] + - [85, 37.0] + - - [4, 4, 1, 128, 4, 4, 128, 4] + - [88, 1.0] + - - [4, 128, 1, 256, 4, 4, 256, 128] + - [88, 18.0] + - - [64, 64, 1, 1280, 64, 64, 1280, 64] + - [127, 318.0] + - - [448, 128, 1, 3328, 448, 448, 3328, 128] + - [119, 2906.0] + - - [4, 448, 1, 128, 4, 4, 128, 448] + - [94, 41.0] + - - [64, 256, 1, 1280, 64, 64, 1280, 256] + - [136, 1125.0] + - - [4, 128, 1, 3328, 4, 4, 3328, 128] + - [91, 49.0] + - - [64, 4, 1, 128, 64, 64, 128, 4] + - [88, 6.0] + - - [64, 64, 1, 256, 64, 64, 256, 64] + - [128, 153.0] + - - [4, 704, 1, 3328, 4, 4, 3328, 704] + - [88, 216.0] + - - [4, 4, 1, 1280, 4, 4, 1280, 4] + - [84, 1.0] + - - [128, 128, 1, 128, 128, 128, 128, 128] + - [128, 387.0] + - - [1024, 4, 1, 128, 1024, 1024, 128, 4] + - [128, 91.0] + - - [64, 64, 1, 3328, 64, 64, 3328, 64] + - [91, 393.0] + - - [4, 64, 1, 128, 4, 4, 128, 64] + - [92, 6.0] + - - [64, 128, 1, 1280, 64, 64, 1280, 128] + - [127, 610.0] + - - [128, 128, 1, 1280, 128, 128, 1280, 128] + - [125, 1120.0] + - - [128, 256, 1, 256, 128, 128, 256, 256] + - [111, 1056.0] + - - [256, 64, 1, 1280, 256, 256, 1280, 64] + - [109, 1114.0] + - - [1024, 4, 1, 1280, 1024, 1024, 1280, 4] + - [116, 270.0] + - - [704, 64, 1, 256, 704, 704, 256, 64] + - [128, 1290.0] + - - [128, 448, 1, 1280, 128, 128, 1280, 448] + - [129, 2457.0] + - - [128, 64, 1, 3328, 128, 128, 3328, 64] + - [110, 752.0] + - - [448, 64, 1, 256, 448, 448, 256, 64] + - [84, 1192.0] + - - [4, 256, 1, 128, 4, 4, 128, 256] + - [84, 35.0] + - - [1024, 64, 1, 256, 1024, 1024, 256, 64] + - [135, 1856.0] + - - [64, 128, 1, 128, 64, 64, 128, 128] + - [111, 196.0] + - - [4, 4, 1, 3328, 4, 4, 3328, 4] + - [91, 2.0] + - - [4, 1024, 1, 1280, 4, 4, 1280, 1024] + - [88, 268.0] + - - [704, 4, 1, 256, 704, 704, 256, 4] + - [128, 104.0] + - - [128, 4, 1, 3328, 128, 128, 3328, 4] + - [91, 49.0] + - - [448, 4, 1, 3328, 448, 448, 3328, 4] + - [110, 163.0] + - - [704, 4, 1, 3328, 704, 704, 3328, 4] + - [116, 230.0] + - - [448, 128, 1, 1280, 448, 448, 1280, 128] + - [103, 2594.0] + - - [1024, 64, 1, 3328, 1024, 1024, 3328, 64] + - [129, 3012.0] + - - [4, 1024, 1, 128, 4, 4, 128, 1024] + - [116, 89.0] + - - [64, 256, 1, 3328, 64, 64, 3328, 256] + - [90, 1299.0] + - - [448, 128, 1, 128, 448, 448, 128, 128] + - [124, 1385.0] + - - [128, 256, 1, 128, 128, 128, 128, 256] + - [132, 721.0] + - - [128, 4, 1, 256, 128, 128, 256, 4] + - [90, 18.0] + - - [256, 256, 1, 1280, 256, 256, 1280, 256] + - [85, 2789.0] + - - [256, 128, 1, 3328, 256, 256, 3328, 128] + - [121, 1831.0] + - - [448, 4, 1, 128, 448, 448, 128, 4] + - [92, 40.0] + - - [4, 256, 1, 3328, 4, 4, 3328, 256] + - [110, 98.0] + - - [4, 128, 1, 128, 4, 4, 128, 128] + - [91, 15.0] + - - [4, 256, 1, 1280, 4, 4, 1280, 256] + - [101, 79.0] + - - [64, 4, 1, 3328, 64, 64, 3328, 4] + - [137, 26.0] + - - [4, 64, 1, 3328, 4, 4, 3328, 64] + - [91, 24.0] + - - [4, 1024, 1, 256, 4, 4, 256, 1024] + - [127, 184.0] + - - [64, 256, 1, 256, 64, 64, 256, 256] + - [88, 870.0] + - - [4, 64, 1, 256, 4, 4, 256, 64] + - [137, 12.0] + - - [128, 448, 1, 128, 128, 128, 128, 448] + - [85, 1498.0] + - - [64, 448, 1, 128, 64, 64, 128, 448] + - [138, 725.0] + - - [64, 704, 1, 3328, 64, 64, 3328, 704] + - [118, 2200.0] + - - [128, 448, 1, 3328, 128, 128, 3328, 448] + - [119, 2719.0] + - - [4, 448, 1, 256, 4, 4, 256, 448] + - [91, 87.0] + - - [4, 128, 1, 1280, 4, 4, 1280, 128] + - [91, 39.0] + - - [128, 64, 1, 128, 128, 128, 128, 64] + - [102, 252.0] + - - [64, 64, 1, 128, 64, 64, 128, 64] + - [123, 161.0] + - - [64, 4, 1, 1280, 64, 64, 1280, 4] + - [101, 20.0] + - - [256, 64, 1, 3328, 256, 256, 3328, 64] + - [90, 1294.0] + - - [128, 128, 1, 256, 128, 128, 256, 128] + - [92, 597.0] +- null +- null +- DeviceEfficiency +... diff --git a/library/src/blas3/Tensile/Logic/asm_full/navi22_Cijk_Alik_Bljk_HB.yaml b/library/src/blas3/Tensile/Logic/asm_full/navi22_Cijk_Alik_Bljk_HB.yaml new file mode 100644 index 000000000..dfc2c88b5 --- /dev/null +++ b/library/src/blas3/Tensile/Logic/asm_full/navi22_Cijk_Alik_Bljk_HB.yaml @@ -0,0 +1,23271 @@ +--- +- {MinimumRequiredVersion: 4.28.0} +- navi22 +- gfx1031 +- [Device 73df] +- AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] +- - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 2 + LVCB: 2 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 0 + SolutionNameMin: Cijk_Alik_Bljk_HB_MT128x128x16_SN_SU0_SUM0_TT8_16_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 2 + LVCB: 2 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 256 + MacroTile1: 128 + MacroTileA: 256 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 256 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 1 + SolutionNameMin: Cijk_Alik_Bljk_HB_MT256x128x16_SN_SU0_SUM0_TT16_16_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [16, 16] + ThreadTile0: 16 + ThreadTile1: 16 + ThreadTileA: 16 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 128 + LSPB: 128 + LVCA: 2 + LVCB: 2 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 2 + SolutionNameMin: Cijk_Alik_Bljk_HB_MT128x128x16_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 128 + LSPB: 128 + LVCA: 2 + LVCB: 2 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 256 + MacroTileA: 128 + MacroTileB: 256 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 3 + SolutionNameMin: Cijk_Alik_Bljk_HB_MT128x256x16_SN_SU0_SUM0_TT8_16_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 2 + LVCB: 2 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 4 + SolutionNameMin: Cijk_Alik_Bljk_HB_MT128x128x16_SN_SU32_SUM3_TT8_16_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 2 + LVCB: 2 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 256 + MacroTile1: 128 + MacroTileA: 256 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 256 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 5 + SolutionNameMin: Cijk_Alik_Bljk_HB_MT256x128x16_SN_SU32_SUM3_TT16_16_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [16, 16] + ThreadTile0: 16 + ThreadTile1: 16 + ThreadTileA: 16 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 128 + LSPB: 128 + LVCA: 2 + LVCB: 2 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 6 + SolutionNameMin: Cijk_Alik_Bljk_HB_MT128x128x16_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 128 + LSPB: 128 + LVCA: 2 + LVCB: 2 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 256 + MacroTileA: 128 + MacroTileB: 256 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 7 + SolutionNameMin: Cijk_Alik_Bljk_HB_MT128x256x16_SN_SU32_SUM3_TT8_16_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 128 + LSPB: 128 + LVCA: 2 + LVCB: 2 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 256 + MacroTile1: 256 + MacroTileA: 256 + MacroTileB: 256 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 256 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 8 + SolutionNameMin: Cijk_Alik_Bljk_HB_MT256x256x16_SN_SU32_SUM3_TT16_16_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [16, 16] + ThreadTile0: 16 + ThreadTile1: 16 + ThreadTileA: 16 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 9 + SolutionNameMin: Cijk_Alik_Bljk_HB_MT128x128x32_SN_SU32_SUM3_TT8_16_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 2 + LVCB: 2 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 10 + SolutionNameMin: Cijk_Alik_Bljk_HB_MT128x128x16_SN_SU0_SUM0_TT8_16_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 2 + LVCB: 2 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 256 + MacroTile1: 128 + MacroTileA: 256 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 256 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 11 + SolutionNameMin: Cijk_Alik_Bljk_HB_MT256x128x16_SN_SU0_SUM0_TT16_16_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [16, 16] + ThreadTile0: 16 + ThreadTile1: 16 + ThreadTileA: 16 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 128 + LSPB: 128 + LVCA: 2 + LVCB: 2 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 12 + SolutionNameMin: Cijk_Alik_Bljk_HB_MT128x128x16_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 128 + LSPB: 128 + LVCA: 2 + LVCB: 2 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 256 + MacroTileA: 128 + MacroTileB: 256 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 13 + SolutionNameMin: Cijk_Alik_Bljk_HB_MT128x256x16_SN_SU0_SUM0_TT8_16_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 128 + LSPB: 128 + LVCA: 2 + LVCB: 2 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 256 + MacroTile1: 256 + MacroTileA: 256 + MacroTileB: 256 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 256 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 14 + SolutionNameMin: Cijk_Alik_Bljk_HB_MT256x256x16_SN_SU0_SUM0_TT16_16_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [16, 16] + ThreadTile0: 16 + ThreadTile1: 16 + ThreadTileA: 16 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 2 + LVCB: 2 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 15 + SolutionNameMin: Cijk_Alik_Bljk_HB_MT128x64x16_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 2 + LVCB: 2 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 16 + SolutionNameMin: Cijk_Alik_Bljk_HB_MT128x128x16_SN_SU32_SUM3_TT8_16_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 2 + LVCB: 2 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 256 + MacroTile1: 128 + MacroTileA: 256 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 256 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 17 + SolutionNameMin: Cijk_Alik_Bljk_HB_MT256x128x16_SN_SU32_SUM3_TT16_16_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [16, 16] + ThreadTile0: 16 + ThreadTile1: 16 + ThreadTileA: 16 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 128 + LSPB: 128 + LVCA: 2 + LVCB: 2 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 18 + SolutionNameMin: Cijk_Alik_Bljk_HB_MT128x128x16_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 128 + LSPB: 128 + LVCA: 2 + LVCB: 2 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 256 + MacroTileA: 128 + MacroTileB: 256 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 19 + SolutionNameMin: Cijk_Alik_Bljk_HB_MT128x256x16_SN_SU32_SUM3_TT8_16_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 128 + LSPB: 128 + LVCA: 2 + LVCB: 2 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 256 + MacroTile1: 256 + MacroTileA: 256 + MacroTileB: 256 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 256 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 20 + SolutionNameMin: Cijk_Alik_Bljk_HB_MT256x256x16_SN_SU32_SUM3_TT16_16_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [16, 16] + ThreadTile0: 16 + ThreadTile1: 16 + ThreadTileA: 16 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 21 + SolutionNameMin: Cijk_Alik_Bljk_HB_MT128x128x32_SN_SU32_SUM3_TT8_16_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 2 + LVCB: 2 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 22 + SolutionNameMin: Cijk_Alik_Bljk_HB_MT128x64x16_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 2 + LVCB: 2 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 23 + SolutionNameMin: Cijk_Alik_Bljk_HB_MT128x128x16_SN_SU0_SUM0_TT8_16_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 2 + LVCB: 2 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 256 + MacroTile1: 128 + MacroTileA: 256 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 256 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 24 + SolutionNameMin: Cijk_Alik_Bljk_HB_MT256x128x16_SN_SU0_SUM0_TT16_16_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [16, 16] + ThreadTile0: 16 + ThreadTile1: 16 + ThreadTileA: 16 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 128 + LSPB: 128 + LVCA: 2 + LVCB: 2 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 25 + SolutionNameMin: Cijk_Alik_Bljk_HB_MT128x128x16_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 128 + LSPB: 128 + LVCA: 2 + LVCB: 2 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 256 + MacroTileA: 128 + MacroTileB: 256 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 26 + SolutionNameMin: Cijk_Alik_Bljk_HB_MT128x256x16_SN_SU0_SUM0_TT8_16_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 128 + LSPB: 128 + LVCA: 2 + LVCB: 2 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 256 + MacroTile1: 256 + MacroTileA: 256 + MacroTileB: 256 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 256 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 27 + SolutionNameMin: Cijk_Alik_Bljk_HB_MT256x256x16_SN_SU0_SUM0_TT16_16_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [16, 16] + ThreadTile0: 16 + ThreadTile1: 16 + ThreadTileA: 16 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 2 + LVCB: 2 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 28 + SolutionNameMin: Cijk_Alik_Bljk_HB_MT128x64x16_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 2 + LVCB: 2 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 29 + SolutionNameMin: Cijk_Alik_Bljk_HB_MT128x128x16_SN_SU32_SUM3_TT8_16_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 2 + LVCB: 2 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 256 + MacroTile1: 128 + MacroTileA: 256 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 256 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 30 + SolutionNameMin: Cijk_Alik_Bljk_HB_MT256x128x16_SN_SU32_SUM3_TT16_16_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [16, 16] + ThreadTile0: 16 + ThreadTile1: 16 + ThreadTileA: 16 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 128 + LSPB: 128 + LVCA: 2 + LVCB: 2 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 31 + SolutionNameMin: Cijk_Alik_Bljk_HB_MT128x128x16_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 128 + LSPB: 128 + LVCA: 2 + LVCB: 2 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 256 + MacroTileA: 128 + MacroTileB: 256 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 32 + SolutionNameMin: Cijk_Alik_Bljk_HB_MT128x256x16_SN_SU32_SUM3_TT8_16_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 128 + LSPB: 128 + LVCA: 2 + LVCB: 2 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 256 + MacroTile1: 256 + MacroTileA: 256 + MacroTileB: 256 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 256 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 33 + SolutionNameMin: Cijk_Alik_Bljk_HB_MT256x256x16_SN_SU32_SUM3_TT16_16_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [16, 16] + ThreadTile0: 16 + ThreadTile1: 16 + ThreadTileA: 16 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 34 + SolutionNameMin: Cijk_Alik_Bljk_HB_MT128x128x32_SN_SU32_SUM3_TT8_16_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 2 + LVCB: 2 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 35 + SolutionNameMin: Cijk_Alik_Bljk_HB_MT64x64x16_SN_SU0_SUM0_TT8_8_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 2 + LVCB: 2 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 36 + SolutionNameMin: Cijk_Alik_Bljk_HB_MT128x64x16_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 128 + LSPB: 128 + LVCA: 2 + LVCB: 2 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 37 + SolutionNameMin: Cijk_Alik_Bljk_HB_MT128x128x16_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 128 + LSPB: 128 + LVCA: 2 + LVCB: 2 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 38 + SolutionNameMin: Cijk_Alik_Bljk_HB_MT128x128x8_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 2 + LVCB: 2 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 39 + SolutionNameMin: Cijk_Alik_Bljk_HB_MT64x64x16_SN_SU32_SUM3_TT8_8_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 2 + LVCB: 2 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 40 + SolutionNameMin: Cijk_Alik_Bljk_HB_MT128x64x16_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 128 + LSPB: 128 + LVCA: 2 + LVCB: 2 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 41 + SolutionNameMin: Cijk_Alik_Bljk_HB_MT128x128x16_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 42 + SolutionNameMin: Cijk_Alik_Bljk_HB_MT128x64x32_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 2 + LVCB: 2 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 43 + SolutionNameMin: Cijk_Alik_Bljk_HB_MT64x64x16_SN_SU0_SUM0_TT8_8_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 2 + LVCB: 2 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 44 + SolutionNameMin: Cijk_Alik_Bljk_HB_MT128x64x16_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 128 + LSPB: 128 + LVCA: 2 + LVCB: 2 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 45 + SolutionNameMin: Cijk_Alik_Bljk_HB_MT128x128x16_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 128 + LSPB: 128 + LVCA: 2 + LVCB: 2 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 46 + SolutionNameMin: Cijk_Alik_Bljk_HB_MT128x128x8_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 2 + LVCB: 2 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 47 + SolutionNameMin: Cijk_Alik_Bljk_HB_MT64x64x16_SN_SU32_SUM3_TT8_8_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 2 + LVCB: 2 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 48 + SolutionNameMin: Cijk_Alik_Bljk_HB_MT128x64x16_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 128 + LSPB: 128 + LVCA: 2 + LVCB: 2 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 49 + SolutionNameMin: Cijk_Alik_Bljk_HB_MT128x128x16_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 4 + LVCB: 4 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 50 + SolutionNameMin: Cijk_Alik_Bljk_HB_MT64x64x32_SN_SU32_SUM3_TT8_8_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 2 + LVCB: 2 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 51 + SolutionNameMin: Cijk_Alik_Bljk_HB_MT64x64x16_SN_SU0_SUM0_TT8_8_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 2 + LVCB: 2 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 52 + SolutionNameMin: Cijk_Alik_Bljk_HB_MT128x64x16_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 128 + LSPB: 128 + LVCA: 2 + LVCB: 2 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 53 + SolutionNameMin: Cijk_Alik_Bljk_HB_MT128x128x16_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 2 + LVCB: 2 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 54 + SolutionNameMin: Cijk_Alik_Bljk_HB_MT64x64x16_SN_SU32_SUM3_TT8_8_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 2 + LVCB: 2 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 55 + SolutionNameMin: Cijk_Alik_Bljk_HB_MT128x64x16_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 128 + LSPB: 128 + LVCA: 2 + LVCB: 2 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 56 + SolutionNameMin: Cijk_Alik_Bljk_HB_MT128x128x16_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 16 + LSPB: 16 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 819 + LdsNumElementsAlignedA: 128 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 256 + LdsOffsetB: 128 + LdsOffsetB_Blk: 384 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 57 + SolutionNameMin: Cijk_Alik_Bljk_HB_MT16x16x8_SN_SU0_SUM0_TT2_2_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 58 + SolutionNameMin: Cijk_Alik_Bljk_HB_MT64x32x8_SN_SU0_SUM0_TT4_4_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 59 + SolutionNameMin: Cijk_Alik_Bljk_HB_MT64x64x8_SN_SU0_SUM0_TT4_4_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 60 + SolutionNameMin: Cijk_Alik_Bljk_HB_MT16x16x16_SN_SU0_SUM0_TT2_2_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 61 + SolutionNameMin: Cijk_Alik_Bljk_HB_MT32x16x16_SN_SU0_SUM0_TT2_2_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 62 + SolutionNameMin: Cijk_Alik_Bljk_HB_MT16x16x32_SN_SU0_SUM0_TT2_2_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 16 + LSPB: 16 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 819 + LdsNumElementsAlignedA: 128 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 256 + LdsOffsetB: 128 + LdsOffsetB_Blk: 384 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 63 + SolutionNameMin: Cijk_Alik_Bljk_HB_MT16x16x8_SN_SU32_SUM3_TT2_2_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 64 + SolutionNameMin: Cijk_Alik_Bljk_HB_MT64x32x8_SN_SU32_SUM3_TT4_4_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 65 + SolutionNameMin: Cijk_Alik_Bljk_HB_MT32x32x8_SN_SU32_SUM3_TT2_2_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 66 + SolutionNameMin: Cijk_Alik_Bljk_HB_MT64x64x8_SN_SU32_SUM3_TT4_4_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 67 + SolutionNameMin: Cijk_Alik_Bljk_HB_MT16x16x16_SN_SU32_SUM3_TT2_2_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 68 + SolutionNameMin: Cijk_Alik_Bljk_HB_MT32x16x16_SN_SU32_SUM3_TT2_2_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 69 + SolutionNameMin: Cijk_Alik_Bljk_HB_MT64x32x16_SN_SU32_SUM3_TT4_4_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 70 + SolutionNameMin: Cijk_Alik_Bljk_HB_MT64x32x8_SN_SU0_SUM0_TT4_4_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 71 + SolutionNameMin: Cijk_Alik_Bljk_HB_MT64x64x8_SN_SU0_SUM0_TT4_4_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 72 + SolutionNameMin: Cijk_Alik_Bljk_HB_MT16x16x16_SN_SU0_SUM0_TT2_2_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 73 + SolutionNameMin: Cijk_Alik_Bljk_HB_MT32x32x16_SN_SU0_SUM0_TT4_4_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 16 + LSPB: 16 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 819 + LdsNumElementsAlignedA: 128 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 256 + LdsOffsetB: 128 + LdsOffsetB_Blk: 384 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 74 + SolutionNameMin: Cijk_Alik_Bljk_HB_MT16x16x8_SN_SU32_SUM3_TT2_2_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 16 + LSPB: 16 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 75 + SolutionNameMin: Cijk_Alik_Bljk_HB_MT32x32x8_SN_SU32_SUM3_TT4_4_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 32 + LSPB: 16 + LVCA: 4 + LVCB: 8 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 896 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 76 + SolutionNameMin: Cijk_Alik_Bljk_HB_MT32x16x8_SN_SU32_SUM3_TT2_2_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 77 + SolutionNameMin: Cijk_Alik_Bljk_HB_MT64x32x8_SN_SU32_SUM3_TT4_4_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 78 + SolutionNameMin: Cijk_Alik_Bljk_HB_MT32x32x8_SN_SU32_SUM3_TT2_2_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 79 + SolutionNameMin: Cijk_Alik_Bljk_HB_MT64x64x8_SN_SU32_SUM3_TT4_4_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 80 + SolutionNameMin: Cijk_Alik_Bljk_HB_MT16x16x16_SN_SU32_SUM3_TT2_2_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 81 + SolutionNameMin: Cijk_Alik_Bljk_HB_MT64x64x16_SN_SU32_SUM3_TT4_4_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 82 + SolutionNameMin: Cijk_Alik_Bljk_HB_MT16x16x32_SN_SU32_SUM3_TT2_2_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 16 + LSPB: 16 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 819 + LdsNumElementsAlignedA: 128 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 256 + LdsOffsetB: 128 + LdsOffsetB_Blk: 384 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 83 + SolutionNameMin: Cijk_Alik_Bljk_HB_MT16x16x8_SN_SU0_SUM0_TT2_2_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 84 + SolutionNameMin: Cijk_Alik_Bljk_HB_MT64x32x8_SN_SU0_SUM0_TT4_4_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 85 + SolutionNameMin: Cijk_Alik_Bljk_HB_MT64x64x8_SN_SU0_SUM0_TT4_4_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 86 + SolutionNameMin: Cijk_Alik_Bljk_HB_MT16x16x16_SN_SU0_SUM0_TT2_2_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 87 + SolutionNameMin: Cijk_Alik_Bljk_HB_MT16x16x32_SN_SU0_SUM0_TT2_2_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 16 + LSPB: 16 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 819 + LdsNumElementsAlignedA: 128 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 256 + LdsOffsetB: 128 + LdsOffsetB_Blk: 384 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 88 + SolutionNameMin: Cijk_Alik_Bljk_HB_MT16x16x8_SN_SU32_SUM3_TT2_2_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 32 + LSPB: 16 + LVCA: 4 + LVCB: 8 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 896 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 89 + SolutionNameMin: Cijk_Alik_Bljk_HB_MT32x16x8_SN_SU32_SUM3_TT2_2_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 90 + SolutionNameMin: Cijk_Alik_Bljk_HB_MT64x32x8_SN_SU32_SUM3_TT4_4_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 91 + SolutionNameMin: Cijk_Alik_Bljk_HB_MT64x64x8_SN_SU32_SUM3_TT4_4_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 92 + SolutionNameMin: Cijk_Alik_Bljk_HB_MT16x16x16_SN_SU32_SUM3_TT2_2_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 93 + SolutionNameMin: Cijk_Alik_Bljk_HB_MT64x32x16_SN_SU32_SUM3_TT4_4_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 32 + LSPB: 16 + LVCA: 4 + LVCB: 8 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1664 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 94 + SolutionNameMin: Cijk_Alik_Bljk_HB_MT64x16x8_SN_SU0_SUM0_TT4_2_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3328 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 95 + SolutionNameMin: Cijk_Alik_Bljk_HB_MT64x16x16_SN_SU32_SUM3_TT4_2_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3328 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 96 + SolutionNameMin: Cijk_Alik_Bljk_HB_MT64x16x16_SN_SU32_SUM3_TT4_2_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 +- [2, 3, 0, 1] +- - - [2944, 4288, 1, 1280, 2944, 2944, 1280, 1280] + - [3, 23592.0] + - - [2368, 5888, 1, 256, 2368, 2368, 256, 256] + - [3, 22773.0] + - - [5888, 1856, 1, 256, 5888, 5888, 256, 256] + - [10, 22008.0] + - - [704, 6784, 1, 256, 704, 704, 256, 256] + - [4, 19376.0] + - - [512, 24000, 1, 1536, 512, 512, 1536, 1536] + - [17, 23369.0] + - - [5888, 1856, 1, 3328, 5888, 5888, 3328, 3328] + - [24, 22230.0] + - - [5056, 704, 1, 256, 5056, 5056, 256, 256] + - [16, 16561.0] + - - [5888, 2944, 1, 3328, 5888, 5888, 3328, 3328] + - [11, 24376.0] + - - [1856, 4288, 1, 256, 1856, 1856, 256, 256] + - [13, 18421.0] + - - [5056, 5056, 1, 3328, 5056, 5056, 3328, 3328] + - [3, 24373.0] + - - [1408, 5888, 1, 1280, 1408, 1408, 1280, 1280] + - [19, 23547.0] + - - [1024, 3584, 1, 3328, 1024, 1024, 3328, 3328] + - [3, 22344.0] + - - [512, 48000, 1, 2048, 512, 512, 2048, 2048] + - [5, 24601.0] + - - [448, 3584, 1, 3328, 448, 448, 3328, 3328] + - [3, 18846.0] + - - [5888, 1408, 1, 1280, 5888, 5888, 1280, 1280] + - [30, 23187.0] + - - [1024, 2368, 1, 256, 1024, 1024, 256, 256] + - [16, 17797.0] + - - [5056, 6784, 1, 1280, 5056, 5056, 1280, 1280] + - [11, 24342.0] + - - [5056, 5056, 1, 1280, 5056, 5056, 1280, 1280] + - [19, 24250.0] + - - [448, 5056, 1, 256, 448, 448, 256, 256] + - [16, 16615.0] + - - [6784, 448, 1, 256, 6784, 6784, 256, 256] + - [12, 15827.0] + - - [1760, 6400, 1, 1760, 1760, 1760, 1760, 1760] + - [10, 24141.0] + - - [5888, 704, 1, 1280, 5888, 5888, 1280, 1280] + - [26, 21288.0] + - - [6784, 4288, 1, 3328, 6784, 6784, 3328, 3328] + - [3, 24271.0] + - - [1856, 2368, 1, 3328, 1856, 1856, 3328, 3328] + - [3, 19924.0] + - - [5888, 2944, 1, 1280, 5888, 5888, 1280, 1280] + - [30, 24253.0] + - - [5888, 1024, 1, 256, 5888, 5888, 256, 256] + - [10, 22183.0] + - - [16384, 3200, 1, 4096, 16384, 16384, 4096, 4096] + - [24, 21245.0] + - - [1408, 2944, 1, 256, 1408, 1408, 256, 256] + - [12, 16073.0] + - - [6784, 5056, 1, 3328, 6784, 6784, 3328, 3328] + - [3, 24731.0] + - - [5056, 5056, 1, 256, 5056, 5056, 256, 256] + - [29, 22952.0] + - - [1024, 3584, 1, 1280, 1024, 1024, 1280, 1280] + - [3, 22152.0] + - - [2368, 2944, 1, 1280, 2368, 2368, 1280, 1280] + - [10, 22332.0] + - - [6784, 6784, 1, 1280, 6784, 6784, 1280, 1280] + - [10, 24577.0] + - - [1408, 4288, 1, 1280, 1408, 1408, 1280, 1280] + - [19, 22037.0] + - - [3584, 4288, 1, 1280, 3584, 3584, 1280, 1280] + - [3, 24082.0] + - - [2368, 704, 1, 1280, 2368, 2368, 1280, 1280] + - [7, 18307.0] + - - [5056, 4288, 1, 3328, 5056, 5056, 3328, 3328] + - [3, 24297.0] + - - [3584, 2368, 1, 3328, 3584, 3584, 3328, 3328] + - [13, 22388.0] + - - [6784, 448, 1, 1280, 6784, 6784, 1280, 1280] + - [23, 19230.0] + - - [4288, 2944, 1, 256, 4288, 4288, 256, 256] + - [29, 22711.0] + - - [6144, 24000, 1, 2560, 6144, 6144, 2560, 2560] + - [8, 23729.0] + - - [5056, 2368, 1, 1280, 5056, 5056, 1280, 1280] + - [30, 22993.0] + - - [448, 3584, 1, 1280, 448, 448, 1280, 1280] + - [3, 18331.0] + - - [6784, 5888, 1, 256, 6784, 6784, 256, 256] + - [13, 24541.0] + - - [1024, 1408, 1, 256, 1024, 1024, 256, 256] + - [15, 16947.0] + - - [2368, 2368, 1, 3328, 2368, 2368, 3328, 3328] + - [26, 20906.0] + - - [5056, 704, 1, 3328, 5056, 5056, 3328, 3328] + - [13, 21512.0] + - - [1408, 1856, 1, 256, 1408, 1408, 256, 256] + - [28, 16650.0] + - - [3584, 2368, 1, 1280, 3584, 3584, 1280, 1280] + - [3, 22585.0] + - - [704, 5888, 1, 256, 704, 704, 256, 256] + - [29, 19315.0] + - - [2560, 1600, 1, 2560, 2560, 2560, 2560, 2560] + - [32, 20423.0] + - - [6144, 5984, 1, 2048, 6144, 6144, 2048, 2048] + - [7, 23887.0] + - - [3584, 704, 1, 3328, 3584, 3584, 3328, 3328] + - [26, 17624.0] + - - [1408, 1408, 1, 256, 1408, 1408, 256, 256] + - [10, 16081.0] + - - [448, 4288, 1, 256, 448, 448, 256, 256] + - [29, 14610.0] + - - [704, 2368, 1, 1280, 704, 704, 1280, 1280] + - [29, 18219.0] + - - [1856, 2368, 1, 1280, 1856, 1856, 1280, 1280] + - [10, 20023.0] + - - [1408, 1024, 1, 1280, 1408, 1408, 1280, 1280] + - [29, 18522.0] + - - [6784, 704, 1, 256, 6784, 6784, 256, 256] + - [10, 18724.0] + - - [2048, 1600, 1, 512, 2048, 2048, 512, 512] + - [29, 18248.0] + - - [2048, 7000, 1, 2048, 2048, 2048, 2048, 2048] + - [7, 21874.0] + - - [1408, 3584, 1, 256, 1408, 1408, 256, 256] + - [26, 20486.0] + - - [3584, 4288, 1, 3328, 3584, 3584, 3328, 3328] + - [13, 24267.0] + - - [5888, 1856, 1, 1280, 5888, 5888, 1280, 1280] + - [10, 22684.0] + - - [5056, 1024, 1, 3328, 5056, 5056, 3328, 3328] + - [3, 23876.0] + - - [2368, 3584, 1, 1280, 2368, 2368, 1280, 1280] + - [3, 22790.0] + - - [2944, 3584, 1, 3328, 2944, 2944, 3328, 3328] + - [26, 23537.0] + - - [6784, 2944, 1, 256, 6784, 6784, 256, 256] + - [10, 23555.0] + - - [4288, 2368, 1, 3328, 4288, 4288, 3328, 3328] + - [24, 22283.0] + - - [1856, 2368, 1, 256, 1856, 1856, 256, 256] + - [26, 17814.0] + - - [3584, 6784, 1, 3328, 3584, 3584, 3328, 3328] + - [11, 24475.0] + - - [5056, 4288, 1, 1280, 5056, 5056, 1280, 1280] + - [3, 24131.0] + - - [6784, 1856, 1, 3328, 6784, 6784, 3328, 3328] + - [11, 22090.0] + - - [1408, 5056, 1, 1280, 1408, 1408, 1280, 1280] + - [19, 23700.0] + - - [6784, 5888, 1, 3328, 6784, 6784, 3328, 3328] + - [13, 25047.0] + - - [8448, 12000, 1, 2816, 8448, 8448, 2816, 2816] + - [11, 24523.0] + - - [4096, 800, 1, 1024, 4096, 4096, 1024, 1024] + - [18, 16432.0] + - - [8192, 3200, 1, 2048, 8192, 8192, 2048, 2048] + - [9, 21027.0] + - - [2368, 5056, 1, 1280, 2368, 2368, 1280, 1280] + - [3, 23496.0] + - - [1024, 5056, 1, 1280, 1024, 1024, 1280, 1280] + - [3, 22883.0] + - - [4288, 1024, 1, 256, 4288, 4288, 256, 256] + - [7, 19987.0] + - - [2368, 1408, 1, 256, 2368, 2368, 256, 256] + - [2, 17174.0] + - - [5888, 448, 1, 1280, 5888, 5888, 1280, 1280] + - [23, 18358.0] + - - [704, 5888, 1, 3328, 704, 704, 3328, 3328] + - [3, 21692.0] + - - [1024, 6784, 1, 1280, 1024, 1024, 1280, 1280] + - [7, 22603.0] + - - [3584, 2944, 1, 1280, 3584, 3584, 1280, 1280] + - [3, 23183.0] + - - [512, 24000, 1, 2048, 512, 512, 2048, 2048] + - [5, 23651.0] + - - [1408, 5056, 1, 3328, 1408, 1408, 3328, 3328] + - [13, 23687.0] + - - [1856, 1856, 1, 3328, 1856, 1856, 3328, 3328] + - [13, 20944.0] + - - [2560, 800, 1, 2560, 2560, 2560, 2560, 2560] + - [19, 17920.0] + - - [2368, 2368, 1, 256, 2368, 2368, 256, 256] + - [10, 19222.0] + - - [4288, 4288, 1, 1280, 4288, 4288, 1280, 1280] + - [3, 24009.0] + - - [5888, 1024, 1, 1280, 5888, 5888, 1280, 1280] + - [10, 22665.0] + - - [512, 48000, 1, 2560, 512, 512, 2560, 2560] + - [33, 24353.0] + - - [704, 6784, 1, 3328, 704, 704, 3328, 3328] + - [1, 21461.0] + - - [2560, 6400, 1, 2560, 2560, 2560, 2560, 2560] + - [20, 23995.0] + - - [5056, 1024, 1, 1280, 5056, 5056, 1280, 1280] + - [3, 23437.0] + - - [448, 5888, 1, 3328, 448, 448, 3328, 3328] + - [13, 19157.0] + - - [1024, 2944, 1, 1280, 1024, 1024, 1280, 1280] + - [29, 20840.0] + - - [5056, 5888, 1, 1280, 5056, 5056, 1280, 1280] + - [19, 24566.0] + - - [448, 6784, 1, 256, 448, 448, 256, 256] + - [16, 16855.0] + - - [3584, 5888, 1, 256, 3584, 3584, 256, 256] + - [16, 23769.0] + - - [2944, 3584, 1, 256, 2944, 2944, 256, 256] + - [10, 22756.0] + - - [6784, 1024, 1, 3328, 6784, 6784, 3328, 3328] + - [3, 23761.0] + - - [6784, 2944, 1, 3328, 6784, 6784, 3328, 3328] + - [13, 23723.0] + - - [2944, 5056, 1, 3328, 2944, 2944, 3328, 3328] + - [13, 24624.0] + - - [6784, 2368, 1, 1280, 6784, 6784, 1280, 1280] + - [10, 23284.0] + - - [4288, 5888, 1, 1280, 4288, 4288, 1280, 1280] + - [19, 24033.0] + - - [4288, 4288, 1, 256, 4288, 4288, 256, 256] + - [4, 22832.0] + - - [4288, 1856, 1, 1280, 4288, 4288, 1280, 1280] + - [17, 22297.0] + - - [1856, 2944, 1, 3328, 1856, 1856, 3328, 3328] + - [3, 22445.0] + - - [256, 6784, 1, 3328, 256, 256, 3328, 3328] + - [19, 19995.0] + - - [5056, 1024, 1, 256, 5056, 5056, 256, 256] + - [13, 19700.0] + - - [5056, 1856, 1, 3328, 5056, 5056, 3328, 3328] + - [24, 22995.0] + - - [1856, 1408, 1, 256, 1856, 1856, 256, 256] + - [25, 16151.0] + - - [4096, 7000, 1, 4096, 4096, 4096, 4096, 4096] + - [34, 20675.0] + - - [5056, 256, 1, 3328, 5056, 5056, 3328, 3328] + - [0, 20252.0] + - - [1024, 5888, 1, 1280, 1024, 1024, 1280, 1280] + - [3, 21889.0] + - - [6144, 24000, 1, 2048, 6144, 6144, 2048, 2048] + - [5, 21635.0] + - - [5056, 3584, 1, 256, 5056, 5056, 256, 256] + - [26, 22729.0] + - - [1856, 1024, 1, 1280, 1856, 1856, 1280, 1280] + - [7, 20968.0] + - - [1856, 1856, 1, 1280, 1856, 1856, 1280, 1280] + - [3, 20274.0] + - - [4096, 400, 1, 1024, 4096, 4096, 1024, 1024] + - [4, 15543.0] + - - [3072, 24000, 1, 1024, 3072, 3072, 1024, 1024] + - [7, 24421.0] + - - [1856, 1024, 1, 3328, 1856, 1856, 3328, 3328] + - [3, 21764.0] + - - [5888, 5888, 1, 3328, 5888, 5888, 3328, 3328] + - [3, 25035.0] + - - [6784, 1024, 1, 256, 6784, 6784, 256, 256] + - [7, 21406.0] + - - [5056, 5888, 1, 3328, 5056, 5056, 3328, 3328] + - [3, 24713.0] + - - [1856, 1024, 1, 256, 1856, 1856, 256, 256] + - [12, 16882.0] + - - [512, 48000, 1, 1536, 512, 512, 1536, 1536] + - [5, 24557.0] + - - [5056, 1408, 1, 3328, 5056, 5056, 3328, 3328] + - [1, 23702.0] + - - [8448, 5984, 1, 2816, 8448, 8448, 2816, 2816] + - [17, 24959.0] + - - [4288, 1024, 1, 3328, 4288, 4288, 3328, 3328] + - [3, 22468.0] + - - [1024, 24000, 1, 2560, 1024, 1024, 2560, 2560] + - [8, 24176.0] + - - [2944, 1408, 1, 3328, 2944, 2944, 3328, 3328] + - [13, 21275.0] + - - [2944, 4288, 1, 3328, 2944, 2944, 3328, 3328] + - [3, 23958.0] + - - [5056, 2944, 1, 256, 5056, 5056, 256, 256] + - [16, 23008.0] + - - [2368, 1856, 1, 256, 2368, 2368, 256, 256] + - [10, 19286.0] + - - [1408, 3584, 1, 3328, 1408, 1408, 3328, 3328] + - [13, 23658.0] + - - [2368, 6784, 1, 256, 2368, 2368, 256, 256] + - [23, 22544.0] + - - [5056, 1408, 1, 1280, 5056, 5056, 1280, 1280] + - [30, 22934.0] + - - [704, 3584, 1, 1280, 704, 704, 1280, 1280] + - [3, 18288.0] + - - [1408, 5888, 1, 3328, 1408, 1408, 3328, 3328] + - [3, 24019.0] + - - [1856, 5056, 1, 256, 1856, 1856, 256, 256] + - [3, 20133.0] + - - [6784, 6784, 1, 256, 6784, 6784, 256, 256] + - [10, 24292.0] + - - [2368, 4288, 1, 1280, 2368, 2368, 1280, 1280] + - [3, 22403.0] + - - [3584, 1856, 1, 1280, 3584, 3584, 1280, 1280] + - [10, 21133.0] + - - [5888, 5056, 1, 256, 5888, 5888, 256, 256] + - [4, 23545.0] + - - [8448, 48000, 1, 2816, 8448, 8448, 2816, 2816] + - [24, 25232.0] + - - [3584, 448, 1, 256, 3584, 3584, 256, 256] + - [28, 10915.0] + - - [3584, 3584, 1, 1280, 3584, 3584, 1280, 1280] + - [7, 24217.0] + - - [256, 6784, 1, 256, 256, 256, 256, 256] + - [4, 17518.0] + - - [1856, 3584, 1, 3328, 1856, 1856, 3328, 3328] + - [3, 22596.0] + - - [5056, 256, 1, 1280, 5056, 5056, 1280, 1280] + - [4, 19391.0] + - - [3584, 3584, 1, 256, 3584, 3584, 256, 256] + - [16, 23216.0] + - - [6784, 4288, 1, 1280, 6784, 6784, 1280, 1280] + - [3, 24182.0] + - - [704, 5056, 1, 256, 704, 704, 256, 256] + - [12, 17523.0] + - - [2944, 2368, 1, 1280, 2944, 2944, 1280, 1280] + - [10, 22111.0] + - - [6784, 3584, 1, 256, 6784, 6784, 256, 256] + - [26, 23861.0] + - - [2944, 2944, 1, 3328, 2944, 2944, 3328, 3328] + - [13, 23172.0] + - - [5056, 6784, 1, 256, 5056, 5056, 256, 256] + - [10, 23835.0] + - - [1408, 4288, 1, 3328, 1408, 1408, 3328, 3328] + - [3, 22670.0] + - - [6784, 256, 1, 1280, 6784, 6784, 1280, 1280] + - [32, 19676.0] + - - [2368, 704, 1, 3328, 2368, 2368, 3328, 3328] + - [19, 19363.0] + - - [3584, 6784, 1, 256, 3584, 3584, 256, 256] + - [16, 23854.0] + - - [5056, 1856, 1, 256, 5056, 5056, 256, 256] + - [16, 21787.0] + - - [4608, 5984, 1, 1536, 4608, 4608, 1536, 1536] + - [7, 23830.0] + - - [1760, 3200, 1, 1760, 1760, 1760, 1760, 1760] + - [23, 22939.0] + - - [4096, 1600, 1, 1024, 4096, 4096, 1024, 1024] + - [18, 19454.0] + - - [704, 4288, 1, 256, 704, 704, 256, 256] + - [29, 16591.0] + - - [1408, 6784, 1, 1280, 1408, 1408, 1280, 1280] + - [26, 23702.0] + - - [7680, 24000, 1, 2560, 7680, 7680, 2560, 2560] + - [20, 24393.0] + - - [4608, 48000, 1, 1536, 4608, 4608, 1536, 1536] + - [14, 24779.0] + - - [6144, 48000, 1, 2048, 6144, 6144, 2048, 2048] + - [11, 23963.0] + - - [1024, 24000, 1, 1536, 1024, 1024, 1536, 1536] + - [5, 23851.0] + - - [5056, 2368, 1, 3328, 5056, 5056, 3328, 3328] + - [11, 23159.0] + - - [2944, 4288, 1, 256, 2944, 2944, 256, 256] + - [29, 22153.0] + - - [1408, 3584, 1, 1280, 1408, 1408, 1280, 1280] + - [3, 23102.0] + - - [8192, 1600, 1, 2048, 8192, 8192, 2048, 2048] + - [7, 21000.0] + - - [512, 24000, 1, 2560, 512, 512, 2560, 2560] + - [20, 22930.0] + - - [2368, 6784, 1, 3328, 2368, 2368, 3328, 3328] + - [3, 23493.0] + - - [5056, 704, 1, 1280, 5056, 5056, 1280, 1280] + - [19, 20398.0] + - - [1856, 4288, 1, 3328, 1856, 1856, 3328, 3328] + - [13, 22859.0] + - - [1408, 5888, 1, 256, 1408, 1408, 256, 256] + - [26, 22865.0] + - - [3584, 704, 1, 1280, 3584, 3584, 1280, 1280] + - [10, 18453.0] + - - [3584, 448, 1, 3328, 3584, 3584, 3328, 3328] + - [19, 18368.0] + - - [704, 2368, 1, 3328, 704, 704, 3328, 3328] + - [7, 19330.0] + - - [448, 5056, 1, 3328, 448, 448, 3328, 3328] + - [13, 20206.0] + - - [4288, 448, 1, 256, 4288, 4288, 256, 256] + - [15, 14777.0] + - - [448, 5888, 1, 256, 448, 448, 256, 256] + - [13, 15552.0] + - - [2048, 3200, 1, 512, 2048, 2048, 512, 512] + - [5, 21345.0] + - - [5888, 2368, 1, 256, 5888, 5888, 256, 256] + - [10, 22591.0] + - - [6784, 704, 1, 3328, 6784, 6784, 3328, 3328] + - [3, 22137.0] + - - [1408, 2944, 1, 3328, 1408, 1408, 3328, 3328] + - [13, 21805.0] + - - [4608, 12000, 1, 1536, 4608, 4608, 1536, 1536] + - [7, 24603.0] + - - [2368, 704, 1, 256, 2368, 2368, 256, 256] + - [2, 14585.0] + - - [3584, 2368, 1, 256, 3584, 3584, 256, 256] + - [23, 20915.0] + - - [5888, 5056, 1, 1280, 5888, 5888, 1280, 1280] + - [3, 24556.0] + - - [8448, 24000, 1, 2816, 8448, 8448, 2816, 2816] + - [14, 25029.0] + - - [3584, 3584, 1, 3328, 3584, 3584, 3328, 3328] + - [27, 24238.0] + - - [5888, 6784, 1, 256, 5888, 5888, 256, 256] + - [10, 24327.0] + - - [4288, 2944, 1, 3328, 4288, 4288, 3328, 3328] + - [30, 23812.0] + - - [256, 5056, 1, 1280, 256, 256, 1280, 1280] + - [19, 19432.0] + - - [6784, 5888, 1, 1280, 6784, 6784, 1280, 1280] + - [3, 24977.0] + - - [2048, 800, 1, 512, 2048, 2048, 512, 512] + - [6, 15888.0] + - - [5888, 4288, 1, 1280, 5888, 5888, 1280, 1280] + - [3, 24020.0] + - - [1024, 24000, 1, 2048, 1024, 1024, 2048, 2048] + - [33, 23393.0] + - - [1408, 1856, 1, 1280, 1408, 1408, 1280, 1280] + - [19, 18532.0] + - - [5888, 448, 1, 3328, 5888, 5888, 3328, 3328] + - [19, 19058.0] + - - [704, 5888, 1, 1280, 704, 704, 1280, 1280] + - [26, 20519.0] + - - [1024, 6784, 1, 3328, 1024, 1024, 3328, 3328] + - [3, 23319.0] + - - [704, 2944, 1, 1280, 704, 704, 1280, 1280] + - [23, 17969.0] + - - [5056, 2944, 1, 3328, 5056, 5056, 3328, 3328] + - [24, 24402.0] + - - [1408, 1408, 1, 3328, 1408, 1408, 3328, 3328] + - [26, 17943.0] + - - [448, 4288, 1, 1280, 448, 448, 1280, 1280] + - [29, 18025.0] + - - [3584, 704, 1, 256, 3584, 3584, 256, 256] + - [28, 16084.0] + - - [3584, 1408, 1, 3328, 3584, 3584, 3328, 3328] + - [11, 22590.0] + - - [2368, 1024, 1, 1280, 2368, 2368, 1280, 1280] + - [3, 20582.0] + - - [2944, 6784, 1, 1280, 2944, 2944, 1280, 1280] + - [23, 23850.0] + - - [1856, 6784, 1, 256, 1856, 1856, 256, 256] + - [4, 22093.0] + - - [4288, 448, 1, 3328, 4288, 4288, 3328, 3328] + - [23, 18480.0] + - - [4288, 3584, 1, 1280, 4288, 4288, 1280, 1280] + - [3, 24305.0] + - - [6144, 12000, 1, 2048, 6144, 6144, 2048, 2048] + - [7, 24401.0] + - - [8192, 800, 1, 2048, 8192, 8192, 2048, 2048] + - [7, 17833.0] + - - [5888, 1024, 1, 3328, 5888, 5888, 3328, 3328] + - [3, 22702.0] + - - [704, 6784, 1, 1280, 704, 704, 1280, 1280] + - [24, 20913.0] + - - [704, 5056, 1, 1280, 704, 704, 1280, 1280] + - [3, 20774.0] + - - [2944, 1856, 1, 256, 2944, 2944, 256, 256] + - [10, 19428.0] + - - [3584, 5056, 1, 256, 3584, 3584, 256, 256] + - [4, 22904.0] + - - [5888, 5056, 1, 3328, 5888, 5888, 3328, 3328] + - [3, 24698.0] + - - [3584, 6784, 1, 1280, 3584, 3584, 1280, 1280] + - [17, 24254.0] + - - [1856, 5888, 1, 256, 1856, 1856, 256, 256] + - [29, 21597.0] + - - [4288, 4288, 1, 3328, 4288, 4288, 3328, 3328] + - [3, 24088.0] + - - [4288, 1408, 1, 1280, 4288, 4288, 1280, 1280] + - [10, 22279.0] + - - [4288, 2368, 1, 256, 4288, 4288, 256, 256] + - [23, 21752.0] + - - [2944, 5056, 1, 1280, 2944, 2944, 1280, 1280] + - [3, 24416.0] + - - [6784, 2368, 1, 3328, 6784, 6784, 3328, 3328] + - [13, 22714.0] + - - [4288, 1856, 1, 3328, 4288, 4288, 3328, 3328] + - [17, 22573.0] + - - [1856, 2944, 1, 1280, 1856, 1856, 1280, 1280] + - [19, 22148.0] + - - [2048, 1600, 1, 2048, 2048, 2048, 2048, 2048] + - [20, 19069.0] + - - [4288, 6784, 1, 3328, 4288, 4288, 3328, 3328] + - [11, 24265.0] + - - [3584, 1024, 1, 1280, 3584, 3584, 1280, 1280] + - [26, 20528.0] + - - [1024, 4288, 1, 256, 1024, 1024, 256, 256] + - [15, 17553.0] + - - [5888, 3584, 1, 3328, 5888, 5888, 3328, 3328] + - [7, 24424.0] + - - [5056, 3584, 1, 3328, 5056, 5056, 3328, 3328] + - [13, 24645.0] + - - [2368, 1408, 1, 1280, 2368, 2368, 1280, 1280] + - [3, 19831.0] + - - [5056, 2944, 1, 1280, 5056, 5056, 1280, 1280] + - [30, 23906.0] + - - [1024, 6784, 1, 256, 1024, 1024, 256, 256] + - [4, 22219.0] + - - [3584, 2944, 1, 256, 3584, 3584, 256, 256] + - [4, 22953.0] + - - [3584, 1408, 1, 1280, 3584, 3584, 1280, 1280] + - [24, 22106.0] + - - [5056, 6784, 1, 3328, 5056, 5056, 3328, 3328] + - [11, 24678.0] + - - [3584, 4288, 1, 256, 3584, 3584, 256, 256] + - [29, 22331.0] + - - [1856, 6784, 1, 3328, 1856, 1856, 3328, 3328] + - [3, 22742.0] + - - [5056, 1408, 1, 256, 5056, 5056, 256, 256] + - [16, 19192.0] + - - [3584, 1024, 1, 256, 3584, 3584, 256, 256] + - [16, 19356.0] + - - [5888, 5888, 1, 256, 5888, 5888, 256, 256] + - [26, 24386.0] + - - [4288, 1024, 1, 1280, 4288, 4288, 1280, 1280] + - [3, 22989.0] + - - [448, 6784, 1, 3328, 448, 448, 3328, 3328] + - [13, 18487.0] + - - [2944, 1408, 1, 1280, 2944, 2944, 1280, 1280] + - [23, 21386.0] + - - [2944, 1856, 1, 3328, 2944, 2944, 3328, 3328] + - [24, 21899.0] + - - [3584, 5888, 1, 1280, 3584, 3584, 1280, 1280] + - [3, 24327.0] + - - [6784, 1856, 1, 1280, 6784, 6784, 1280, 1280] + - [10, 23018.0] + - - [5888, 256, 1, 3328, 5888, 5888, 3328, 3328] + - [0, 19976.0] + - - [1856, 5888, 1, 3328, 1856, 1856, 3328, 3328] + - [3, 22855.0] + - - [3584, 1408, 1, 256, 3584, 3584, 256, 256] + - [16, 20103.0] + - - [704, 3584, 1, 3328, 704, 704, 3328, 3328] + - [32, 18350.0] + - - [4096, 3200, 1, 1024, 4096, 4096, 1024, 1024] + - [7, 22587.0] + - - [5056, 448, 1, 1280, 5056, 5056, 1280, 1280] + - [10, 18945.0] + - - [3584, 1856, 1, 3328, 3584, 3584, 3328, 3328] + - [11, 21975.0] + - - [2944, 1024, 1, 256, 2944, 2944, 256, 256] + - [13, 18769.0] + - - [2368, 4288, 1, 3328, 2368, 2368, 3328, 3328] + - [26, 22603.0] + - - [1024, 1408, 1, 1280, 1024, 1024, 1280, 1280] + - [0, 18770.0] + - - [6784, 5056, 1, 256, 6784, 6784, 256, 256] + - [4, 23724.0] + - - [3584, 5056, 1, 3328, 3584, 3584, 3328, 3328] + - [26, 24483.0] + - - [4288, 5888, 1, 256, 4288, 4288, 256, 256] + - [7, 23298.0] + - - [2944, 6784, 1, 256, 2944, 2944, 256, 256] + - [10, 23774.0] + - - [2368, 2368, 1, 1280, 2368, 2368, 1280, 1280] + - [10, 21186.0] + - - [1856, 3584, 1, 1280, 1856, 1856, 1280, 1280] + - [3, 22566.0] + - - [5056, 3584, 1, 1280, 5056, 5056, 1280, 1280] + - [19, 24544.0] + - - [256, 5888, 1, 256, 256, 256, 256, 256] + - [25, 17257.0] + - - [1856, 1408, 1, 3328, 1856, 1856, 3328, 3328] + - [19, 18951.0] + - - [1024, 4288, 1, 3328, 1024, 1024, 3328, 3328] + - [3, 22654.0] + - - [2944, 2368, 1, 3328, 2944, 2944, 3328, 3328] + - [3, 21764.0] + - - [704, 4288, 1, 3328, 704, 704, 3328, 3328] + - [3, 18504.0] + - - [1024, 1856, 1, 1280, 1024, 1024, 1280, 1280] + - [29, 20215.0] + - - [2048, 6400, 1, 2048, 2048, 2048, 2048, 2048] + - [7, 22513.0] + - - [512, 48000, 1, 2816, 512, 512, 2816, 2816] + - [30, 24621.0] + - - [5124, 9124, 1, 2560, 5124, 5124, 2560, 2560] + - [7, 23837.0] + - - [1024, 5888, 1, 256, 1024, 1024, 256, 256] + - [4, 20702.0] + - - [1408, 2368, 1, 256, 1408, 1408, 256, 256] + - [12, 18007.0] + - - [2944, 704, 1, 3328, 2944, 2944, 3328, 3328] + - [26, 18754.0] + - - [2944, 2944, 1, 1280, 2944, 2944, 1280, 1280] + - [3, 23153.0] + - - [6784, 256, 1, 3328, 6784, 6784, 3328, 3328] + - [3, 20467.0] + - - [1408, 5056, 1, 256, 1408, 1408, 256, 256] + - [26, 20928.0] + - - [4608, 24000, 1, 1536, 4608, 4608, 1536, 1536] + - [20, 24863.0] + - - [1408, 4288, 1, 256, 1408, 1408, 256, 256] + - [3, 19105.0] + - - [5888, 2368, 1, 1280, 5888, 5888, 1280, 1280] + - [30, 23593.0] + - - [2368, 5888, 1, 1280, 2368, 2368, 1280, 1280] + - [3, 23997.0] + - - [5888, 256, 1, 1280, 5888, 5888, 1280, 1280] + - [4, 20060.0] + - - [2368, 1856, 1, 3328, 2368, 2368, 3328, 3328] + - [3, 20385.0] + - - [2944, 704, 1, 256, 2944, 2944, 256, 256] + - [10, 16049.0] + - - [2368, 6784, 1, 1280, 2368, 2368, 1280, 1280] + - [3, 23435.0] + - - [2368, 1024, 1, 3328, 2368, 2368, 3328, 3328] + - [3, 21686.0] + - - [1856, 4288, 1, 1280, 1856, 1856, 1280, 1280] + - [3, 22716.0] + - - [704, 3584, 1, 256, 704, 704, 256, 256] + - [2, 16830.0] + - - [704, 2944, 1, 3328, 704, 704, 3328, 3328] + - [3, 18694.0] + - - [1856, 5056, 1, 3328, 1856, 1856, 3328, 3328] + - [26, 23372.0] + - - [3584, 5056, 1, 1280, 3584, 3584, 1280, 1280] + - [3, 24308.0] + - - [2944, 1024, 1, 3328, 2944, 2944, 3328, 3328] + - [13, 21826.0] + - - [1408, 6784, 1, 256, 1408, 1408, 256, 256] + - [26, 22413.0] + - - [6784, 1408, 1, 3328, 6784, 6784, 3328, 3328] + - [30, 23446.0] + - - [1024, 2368, 1, 1280, 1024, 1024, 1280, 1280] + - [29, 19418.0] + - - [6784, 2944, 1, 1280, 6784, 6784, 1280, 1280] + - [10, 24115.0] + - - [3584, 448, 1, 1280, 3584, 3584, 1280, 1280] + - [23, 17590.0] + - - [2944, 6784, 1, 3328, 2944, 2944, 3328, 3328] + - [3, 23908.0] + - - [448, 5056, 1, 1280, 448, 448, 1280, 1280] + - [3, 19178.0] + - - [4288, 704, 1, 256, 4288, 4288, 256, 256] + - [29, 16276.0] + - - [5888, 704, 1, 256, 5888, 5888, 256, 256] + - [16, 17859.0] + - - [256, 5888, 1, 3328, 256, 256, 3328, 3328] + - [10, 19714.0] + - - [6784, 4288, 1, 256, 6784, 6784, 256, 256] + - [10, 23295.0] + - - [5888, 256, 1, 256, 5888, 5888, 256, 256] + - [31, 17242.0] + - - [6784, 1024, 1, 1280, 6784, 6784, 1280, 1280] + - [3, 23648.0] + - - [2944, 704, 1, 1280, 2944, 2944, 1280, 1280] + - [10, 19210.0] + - - [6784, 3584, 1, 1280, 6784, 6784, 1280, 1280] + - [3, 24522.0] + - - [1408, 2944, 1, 1280, 1408, 1408, 1280, 1280] + - [32, 21551.0] + - - [2048, 800, 1, 2048, 2048, 2048, 2048, 2048] + - [16, 17364.0] + - - [1408, 2368, 1, 3328, 1408, 1408, 3328, 3328] + - [7, 20572.0] + - - [2368, 2944, 1, 256, 2368, 2368, 256, 256] + - [4, 20504.0] + - - [2944, 5888, 1, 256, 2944, 2944, 256, 256] + - [13, 23619.0] + - - [3584, 1856, 1, 256, 3584, 3584, 256, 256] + - [16, 21425.0] + - - [4288, 2944, 1, 1280, 4288, 4288, 1280, 1280] + - [17, 23497.0] + - - [5056, 448, 1, 3328, 5056, 5056, 3328, 3328] + - [3, 20365.0] + - - [4288, 5056, 1, 3328, 4288, 4288, 3328, 3328] + - [3, 24383.0] + - - [256, 5056, 1, 3328, 256, 256, 3328, 3328] + - [7, 21268.0] + - - [5056, 2368, 1, 256, 5056, 5056, 256, 256] + - [29, 22543.0] + - - [4288, 704, 1, 3328, 4288, 4288, 3328, 3328] + - [3, 18773.0] + - - [448, 3584, 1, 256, 448, 448, 256, 256] + - [23, 15773.0] + - - [1024, 1408, 1, 3328, 1024, 1024, 3328, 3328] + - [4, 19539.0] + - - [2944, 5888, 1, 1280, 2944, 2944, 1280, 1280] + - [7, 24434.0] + - - [5888, 3584, 1, 256, 5888, 5888, 256, 256] + - [10, 23946.0] + - - [1408, 1856, 1, 3328, 1408, 1408, 3328, 3328] + - [13, 19410.0] + - - [6784, 1408, 1, 1280, 6784, 6784, 1280, 1280] + - [5, 23550.0] + - - [704, 2944, 1, 256, 704, 704, 256, 256] + - [4, 17877.0] + - - [2944, 5888, 1, 3328, 2944, 2944, 3328, 3328] + - [19, 24482.0] + - - [1408, 6784, 1, 3328, 1408, 1408, 3328, 3328] + - [26, 24052.0] + - - [1408, 1408, 1, 1280, 1408, 1408, 1280, 1280] + - [23, 18883.0] + - - [16384, 400, 1, 4096, 16384, 16384, 4096, 4096] + - [7, 17340.0] + - - [448, 4288, 1, 3328, 448, 448, 3328, 3328] + - [10, 18534.0] + - - [704, 2368, 1, 256, 704, 704, 256, 256] + - [29, 16093.0] + - - [5888, 2368, 1, 3328, 5888, 5888, 3328, 3328] + - [24, 24031.0] + - - [5124, 9124, 1, 1760, 5124, 5124, 1760, 1760] + - [26, 24295.0] + - - [4288, 5056, 1, 256, 4288, 4288, 256, 256] + - [16, 23129.0] + - - [4288, 448, 1, 1280, 4288, 4288, 1280, 1280] + - [23, 18637.0] + - - [5888, 704, 1, 3328, 5888, 5888, 3328, 3328] + - [32, 21967.0] + - - [4288, 3584, 1, 3328, 4288, 4288, 3328, 3328] + - [3, 24419.0] + - - [1408, 1024, 1, 256, 1408, 1408, 256, 256] + - [12, 16419.0] + - - [8192, 400, 1, 2048, 8192, 8192, 2048, 2048] + - [7, 17194.0] + - - [2560, 7000, 1, 2560, 2560, 2560, 2560, 2560] + - [20, 24322.0] + - - [6784, 6784, 1, 3328, 6784, 6784, 3328, 3328] + - [11, 24694.0] + - - [704, 5056, 1, 3328, 704, 704, 3328, 3328] + - [32, 21901.0] + - - [2368, 2944, 1, 3328, 2368, 2368, 3328, 3328] + - [32, 22049.0] + - - [2368, 3584, 1, 256, 2368, 2368, 256, 256] + - [10, 22120.0] + - - [5124, 9124, 1, 4096, 5124, 5124, 4096, 4096] + - [33, 23507.0] + - - [7680, 48000, 1, 2560, 7680, 7680, 2560, 2560] + - [8, 25196.0] + - - [1024, 48000, 1, 2816, 1024, 1024, 2816, 2816] + - [24, 25264.0] + - - [1856, 1856, 1, 256, 1856, 1856, 256, 256] + - [13, 18138.0] + - - [4288, 1408, 1, 3328, 4288, 4288, 3328, 3328] + - [30, 22536.0] + - - [5124, 9124, 1, 2048, 5124, 5124, 2048, 2048] + - [7, 24095.0] + - - [4288, 5056, 1, 1280, 4288, 4288, 1280, 1280] + - [3, 24233.0] + - - [5888, 6784, 1, 1280, 5888, 5888, 1280, 1280] + - [30, 25041.0] + - - [1760, 1600, 1, 1760, 1760, 1760, 1760, 1760] + - [3, 20369.0] + - - [5888, 1408, 1, 3328, 5888, 5888, 3328, 3328] + - [24, 23919.0] + - - [256, 5056, 1, 256, 256, 256, 256, 256] + - [22, 17010.0] + - - [7680, 12000, 1, 2560, 7680, 7680, 2560, 2560] + - [20, 25007.0] + - - [2368, 5056, 1, 256, 2368, 2368, 256, 256] + - [13, 21971.0] + - - [1024, 5056, 1, 256, 1024, 1024, 256, 256] + - [16, 21481.0] + - - [2368, 1408, 1, 3328, 2368, 2368, 3328, 3328] + - [3, 20704.0] + - - [1024, 48000, 1, 1536, 1024, 1024, 1536, 1536] + - [5, 24904.0] + - - [5888, 448, 1, 256, 5888, 5888, 256, 256] + - [28, 17183.0] + - - [2560, 3200, 1, 2560, 2560, 2560, 2560, 2560] + - [7, 22828.0] + - - [6784, 5056, 1, 1280, 6784, 6784, 1280, 1280] + - [3, 24727.0] + - - [1024, 48000, 1, 2560, 1024, 1024, 2560, 2560] + - [5, 24769.0] + - - [4288, 6784, 1, 1280, 4288, 4288, 1280, 1280] + - [3, 24210.0] + - - [16384, 800, 1, 4096, 16384, 16384, 4096, 4096] + - [21, 18622.0] + - - [3072, 48000, 1, 1024, 3072, 3072, 1024, 1024] + - [17, 25151.0] + - - [6784, 1408, 1, 256, 6784, 6784, 256, 256] + - [10, 22646.0] + - - [5888, 4288, 1, 256, 5888, 5888, 256, 256] + - [16, 23526.0] + - - [5056, 5888, 1, 256, 5056, 5056, 256, 256] + - [19, 23887.0] + - - [2368, 1024, 1, 256, 2368, 2368, 256, 256] + - [7, 18586.0] + - - [1856, 6784, 1, 1280, 1856, 1856, 1280, 1280] + - [23, 23043.0] + - - [6784, 1856, 1, 256, 6784, 6784, 256, 256] + - [10, 22509.0] + - - [4288, 3584, 1, 256, 4288, 4288, 256, 256] + - [3, 23460.0] + - - [6784, 448, 1, 3328, 6784, 6784, 3328, 3328] + - [23, 20034.0] + - - [5056, 1856, 1, 1280, 5056, 5056, 1280, 1280] + - [30, 22992.0] + - - [1408, 1024, 1, 3328, 1408, 1408, 3328, 3328] + - [16, 19676.0] + - - [5888, 3584, 1, 1280, 5888, 5888, 1280, 1280] + - [3, 24502.0] + - - [1024, 2944, 1, 256, 1024, 1024, 256, 256] + - [4, 19901.0] + - - [448, 6784, 1, 1280, 448, 448, 1280, 1280] + - [29, 19689.0] + - - [3584, 1024, 1, 3328, 3584, 3584, 3328, 3328] + - [32, 22779.0] + - - [2944, 1856, 1, 1280, 2944, 2944, 1280, 1280] + - [30, 21690.0] + - - [5056, 256, 1, 256, 5056, 5056, 256, 256] + - [6, 17700.0] + - - [2368, 3584, 1, 3328, 2368, 2368, 3328, 3328] + - [3, 23085.0] + - - [3584, 5888, 1, 3328, 3584, 3584, 3328, 3328] + - [17, 24526.0] + - - [2944, 3584, 1, 1280, 2944, 2944, 1280, 1280] + - [3, 23545.0] + - - [1856, 5888, 1, 1280, 1856, 1856, 1280, 1280] + - [3, 23047.0] + - - [2048, 3200, 1, 2048, 2048, 2048, 2048, 2048] + - [9, 20685.0] + - - [4288, 1408, 1, 256, 4288, 4288, 256, 256] + - [16, 21515.0] + - - [4288, 2368, 1, 1280, 4288, 4288, 1280, 1280] + - [3, 22613.0] + - - [2944, 5056, 1, 256, 2944, 2944, 256, 256] + - [16, 23184.0] + - - [6784, 2368, 1, 256, 6784, 6784, 256, 256] + - [10, 22863.0] + - - [1024, 24000, 1, 2816, 1024, 1024, 2816, 2816] + - [24, 24747.0] + - - [7680, 5984, 1, 2560, 7680, 7680, 2560, 2560] + - [17, 24410.0] + - - [4288, 1856, 1, 256, 4288, 4288, 256, 256] + - [10, 21187.0] + - - [1856, 2944, 1, 256, 1856, 1856, 256, 256] + - [13, 20571.0] + - - [6144, 48000, 1, 2560, 6144, 6144, 2560, 2560] + - [8, 25149.0] + - - [1760, 800, 1, 1760, 1760, 1760, 1760, 1760] + - [10, 18565.0] + - - [1856, 1408, 1, 1280, 1856, 1856, 1280, 1280] + - [10, 19371.0] + - - [1024, 4288, 1, 1280, 1024, 1024, 1280, 1280] + - [7, 22601.0] + - - [2368, 5056, 1, 3328, 2368, 2368, 3328, 3328] + - [19, 23973.0] + - - [1024, 5056, 1, 3328, 1024, 1024, 3328, 3328] + - [7, 24042.0] + - - [1024, 1856, 1, 3328, 1024, 1024, 3328, 3328] + - [24, 21119.0] + - - [4288, 6784, 1, 256, 4288, 4288, 256, 256] + - [4, 23664.0] + - - [3584, 2944, 1, 3328, 3584, 3584, 3328, 3328] + - [30, 23612.0] + - - [5888, 2944, 1, 256, 5888, 5888, 256, 256] + - [10, 23938.0] + - - [5056, 4288, 1, 256, 5056, 5056, 256, 256] + - [29, 23281.0] + - - [1024, 3584, 1, 256, 1024, 1024, 256, 256] + - [4, 20860.0] + - - [5888, 5888, 1, 1280, 5888, 5888, 1280, 1280] + - [3, 25052.0] + - - [448, 5888, 1, 1280, 448, 448, 1280, 1280] + - [13, 18808.0] + - - [4288, 704, 1, 1280, 4288, 4288, 1280, 1280] + - [10, 19245.0] + - - [2944, 1408, 1, 256, 2944, 2944, 256, 256] + - [16, 21490.0] + - - [2368, 5888, 1, 3328, 2368, 2368, 3328, 3328] + - [3, 24146.0] + - - [2368, 1856, 1, 1280, 2368, 2368, 1280, 1280] + - [10, 20286.0] + - - [5888, 4288, 1, 3328, 5888, 5888, 3328, 3328] + - [30, 24262.0] + - - [6784, 704, 1, 1280, 6784, 6784, 1280, 1280] + - [13, 21704.0] + - - [5056, 448, 1, 256, 5056, 5056, 256, 256] + - [15, 17075.0] + - - [1856, 5056, 1, 1280, 1856, 1856, 1280, 1280] + - [3, 23395.0] + - - [2944, 1024, 1, 1280, 2944, 2944, 1280, 1280] + - [7, 21982.0] + - - [2368, 4288, 1, 256, 2368, 2368, 256, 256] + - [10, 21936.0] + - - [1024, 2368, 1, 3328, 1024, 1024, 3328, 3328] + - [13, 22147.0] + - - [4288, 5888, 1, 3328, 4288, 4288, 3328, 3328] + - [30, 24271.0] + - - [1024, 2944, 1, 3328, 1024, 1024, 3328, 3328] + - [19, 22081.0] + - - [256, 6784, 1, 1280, 256, 256, 1280, 1280] + - [29, 20030.0] + - - [1856, 3584, 1, 256, 1856, 1856, 256, 256] + - [3, 21164.0] + - - [512, 24000, 1, 2816, 512, 512, 2816, 2816] + - [30, 24311.0] + - - [256, 5888, 1, 1280, 256, 256, 1280, 1280] + - [4, 19965.0] + - - [16384, 1600, 1, 4096, 16384, 16384, 4096, 4096] + - [27, 20673.0] + - - [2944, 2368, 1, 256, 2944, 2944, 256, 256] + - [10, 21348.0] + - - [1024, 1856, 1, 256, 1024, 1024, 256, 256] + - [12, 17628.0] + - - [6784, 3584, 1, 3328, 6784, 6784, 3328, 3328] + - [3, 24598.0] + - - [1760, 7000, 1, 1760, 1760, 1760, 1760, 1760] + - [10, 23788.0] + - - [1024, 5888, 1, 3328, 1024, 1024, 3328, 3328] + - [7, 22825.0] + - - [1408, 2368, 1, 1280, 1408, 1408, 1280, 1280] + - [19, 20196.0] + - - [2944, 2944, 1, 256, 2944, 2944, 256, 256] + - [29, 22836.0] + - - [6784, 256, 1, 256, 6784, 6784, 256, 256] + - [4, 18117.0] + - - [5888, 1408, 1, 256, 5888, 5888, 256, 256] + - [10, 22969.0] + - - [5888, 6784, 1, 3328, 5888, 5888, 3328, 3328] + - [24, 25254.0] + - - [704, 4288, 1, 1280, 704, 704, 1280, 1280] + - [10, 18472.0] + - - [1024, 48000, 1, 2048, 1024, 1024, 2048, 2048] + - [33, 24614.0] + - - [1024, 1024, 1, 3328, 1024, 1024, 3328, 3328] + - [41, 16632.0] + - - [64, 6784, 1, 256, 64, 64, 256, 256] + - [54, 10050.0] + - - [128, 6784, 1, 3328, 128, 128, 3328, 3328] + - [49, 17890.0] + - - [2048, 400, 1, 512, 2048, 2048, 512, 512] + - [48, 13788.0] + - - [256, 4288, 1, 3328, 256, 256, 3328, 3328] + - [45, 17388.0] + - - [704, 1856, 1, 3328, 704, 704, 3328, 3328] + - [45, 16721.0] + - - [448, 1024, 1, 1280, 448, 448, 1280, 1280] + - [35, 13407.0] + - - [2368, 128, 1, 256, 2368, 2368, 256, 256] + - [47, 8603.0] + - - [64, 5056, 1, 256, 64, 64, 256, 256] + - [54, 8869.0] + - - [256, 1856, 1, 1280, 256, 256, 1280, 1280] + - [55, 15863.0] + - - [448, 704, 1, 1280, 448, 448, 1280, 1280] + - [51, 13056.0] + - - [4288, 256, 1, 256, 4288, 4288, 256, 256] + - [40, 14606.0] + - - [128, 3584, 1, 1280, 128, 128, 1280, 1280] + - [48, 15340.0] + - - [5888, 64, 1, 3328, 5888, 5888, 3328, 3328] + - [55, 13034.0] + - - [2944, 256, 1, 3328, 2944, 2944, 3328, 3328] + - [52, 15931.0] + - - [256, 4288, 1, 1280, 256, 256, 1280, 1280] + - [48, 17647.0] + - - [1408, 448, 1, 1280, 1408, 1408, 1280, 1280] + - [48, 15856.0] + - - [1408, 256, 1, 1280, 1408, 1408, 1280, 1280] + - [52, 12193.0] + - - [3072, 128, 1, 1024, 3072, 3072, 1024, 1024] + - [55, 11725.0] + - - [6784, 128, 1, 1280, 6784, 6784, 1280, 1280] + - [56, 17465.0] + - - [6784, 64, 1, 256, 6784, 6784, 256, 256] + - [40, 10545.0] + - - [2368, 128, 1, 3328, 2368, 2368, 3328, 3328] + - [41, 15015.0] + - - [2944, 128, 1, 256, 2944, 2944, 256, 256] + - [55, 9330.0] + - - [448, 1408, 1, 256, 448, 448, 256, 256] + - [39, 12252.0] + - - [64, 5056, 1, 3328, 64, 64, 3328, 3328] + - [54, 13397.0] + - - [2368, 256, 1, 1280, 2368, 2368, 1280, 1280] + - [45, 16594.0] + - - [256, 3584, 1, 3328, 256, 256, 3328, 3328] + - [41, 18935.0] + - - [5056, 64, 1, 1280, 5056, 5056, 1280, 1280] + - [40, 13208.0] + - - [1024, 704, 1, 256, 1024, 1024, 256, 256] + - [48, 13070.0] + - - [4288, 128, 1, 1280, 4288, 4288, 1280, 1280] + - [41, 15122.0] + - - [5888, 64, 1, 256, 5888, 5888, 256, 256] + - [48, 9312.0] + - - [1856, 256, 1, 1280, 1856, 1856, 1280, 1280] + - [44, 14980.0] + - - [64, 5888, 1, 3328, 64, 64, 3328, 3328] + - [50, 12774.0] + - - [256, 1408, 1, 3328, 256, 256, 3328, 3328] + - [48, 12860.0] + - - [6784, 128, 1, 3328, 6784, 6784, 3328, 3328] + - [49, 17865.0] + - - [704, 704, 1, 3328, 704, 704, 3328, 3328] + - [53, 14806.0] + - - [3584, 256, 1, 3328, 3584, 3584, 3328, 3328] + - [41, 18963.0] + - - [128, 3584, 1, 3328, 128, 128, 3328, 3328] + - [48, 16146.0] + - - [128, 2944, 1, 1280, 128, 128, 1280, 1280] + - [48, 12760.0] + - - [448, 1856, 1, 1280, 448, 448, 1280, 1280] + - [37, 16274.0] + - - [3584, 128, 1, 256, 3584, 3584, 256, 256] + - [55, 11142.0] + - - [448, 1408, 1, 3328, 448, 448, 3328, 3328] + - [36, 14044.0] + - - [256, 3584, 1, 256, 256, 256, 256, 256] + - [49, 15514.0] + - - [256, 2944, 1, 3328, 256, 256, 3328, 3328] + - [52, 16124.0] + - - [1408, 704, 1, 256, 1408, 1408, 256, 256] + - [55, 14336.0] + - - [448, 2944, 1, 3328, 448, 448, 3328, 3328] + - [37, 16915.0] + - - [64, 5888, 1, 256, 64, 64, 256, 256] + - [47, 8999.0] + - - [448, 2368, 1, 1280, 448, 448, 1280, 1280] + - [45, 16216.0] + - - [128, 4288, 1, 3328, 128, 128, 3328, 3328] + - [37, 16240.0] + - - [256, 2368, 1, 256, 256, 256, 256, 256] + - [40, 12679.0] + - - [1024, 448, 1, 3328, 1024, 1024, 3328, 3328] + - [55, 15579.0] + - - [1856, 704, 1, 1280, 1856, 1856, 1280, 1280] + - [45, 16326.0] + - - [1024, 1024, 1, 1280, 1024, 1024, 1280, 1280] + - [41, 16384.0] + - - [256, 2944, 1, 256, 256, 256, 256, 256] + - [40, 13743.0] + - - [1024, 700, 1, 512, 1024, 1024, 512, 512] + - [48, 13563.0] + - - [128, 6784, 1, 1280, 128, 128, 1280, 1280] + - [41, 17460.0] + - - [1408, 704, 1, 3328, 1408, 1408, 3328, 3328] + - [53, 15831.0] + - - [128, 5888, 1, 1280, 128, 128, 1280, 1280] + - [48, 16440.0] + - - [704, 1408, 1, 3328, 704, 704, 3328, 3328] + - [37, 15807.0] + - - [7680, 64, 1, 2560, 7680, 7680, 2560, 2560] + - [42, 13919.0] + - - [6784, 128, 1, 256, 6784, 6784, 256, 256] + - [41, 14722.0] + - - [704, 448, 1, 256, 704, 704, 256, 256] + - [47, 8776.0] + - - [256, 1856, 1, 3328, 256, 256, 3328, 3328] + - [52, 15876.0] + - - [128, 4288, 1, 256, 128, 128, 256, 256] + - [40, 11690.0] + - - [64, 6784, 1, 3328, 64, 64, 3328, 3328] + - [43, 13015.0] + - - [2944, 256, 1, 1280, 2944, 2944, 1280, 1280] + - [52, 15940.0] + - - [1856, 704, 1, 256, 1856, 1856, 256, 256] + - [47, 14840.0] + - - [1408, 448, 1, 3328, 1408, 1408, 3328, 3328] + - [52, 16103.0] + - - [704, 1856, 1, 256, 704, 704, 256, 256] + - [55, 15081.0] + - - [256, 2368, 1, 1280, 256, 256, 1280, 1280] + - [49, 16545.0] + - - [2944, 448, 1, 256, 2944, 2944, 256, 256] + - [48, 15141.0] + - - [2368, 128, 1, 1280, 2368, 2368, 1280, 1280] + - [41, 13232.0] + - - [64, 5056, 1, 1280, 64, 64, 1280, 1280] + - [54, 12968.0] + - - [704, 448, 1, 3328, 704, 704, 3328, 3328] + - [35, 14131.0] + - - [2368, 448, 1, 1280, 2368, 2368, 1280, 1280] + - [53, 16146.0] + - - [128, 3584, 1, 256, 128, 128, 256, 256] + - [55, 11271.0] + - - [1856, 448, 1, 3328, 1856, 1856, 3328, 3328] + - [41, 17094.0] + - - [128, 5056, 1, 256, 128, 128, 256, 256] + - [55, 13491.0] + - - [4288, 256, 1, 1280, 4288, 4288, 1280, 1280] + - [45, 16755.0] + - - [704, 704, 1, 256, 704, 704, 256, 256] + - [54, 10249.0] + - - [4288, 128, 1, 3328, 4288, 4288, 3328, 3328] + - [45, 16306.0] + - - [7680, 128, 1, 2560, 7680, 7680, 2560, 2560] + - [49, 19448.0] + - - [256, 1408, 1, 1280, 256, 256, 1280, 1280] + - [36, 12193.0] + - - [6784, 64, 1, 3328, 6784, 6784, 3328, 3328] + - [55, 14329.0] + - - [128, 2944, 1, 3328, 128, 128, 3328, 3328] + - [55, 13401.0] + - - [2944, 448, 1, 3328, 2944, 2944, 3328, 3328] + - [45, 16924.0] + - - [5888, 128, 1, 256, 5888, 5888, 256, 256] + - [40, 13645.0] + - - [5056, 64, 1, 256, 5056, 5056, 256, 256] + - [39, 8926.0] + - - [128, 5056, 1, 3328, 128, 128, 3328, 3328] + - [56, 18933.0] + - - [256, 4288, 1, 256, 256, 256, 256, 256] + - [36, 15913.0] + - - [3584, 256, 1, 256, 3584, 3584, 256, 256] + - [49, 15453.0] + - - [128, 2944, 1, 256, 128, 128, 256, 256] + - [55, 9421.0] + - - [3584, 128, 1, 3328, 3584, 3584, 3328, 3328] + - [48, 15759.0] + - - [1024, 448, 1, 1280, 1024, 1024, 1280, 1280] + - [52, 15158.0] + - - [5888, 128, 1, 3328, 5888, 5888, 3328, 3328] + - [44, 16234.0] + - - [1408, 704, 1, 1280, 1408, 1408, 1280, 1280] + - [53, 15428.0] + - - [448, 1408, 1, 1280, 448, 448, 1280, 1280] + - [35, 14130.0] + - - [704, 1408, 1, 1280, 704, 704, 1280, 1280] + - [48, 15793.0] + - - [448, 2944, 1, 256, 448, 448, 256, 256] + - [45, 14822.0] + - - [448, 2368, 1, 256, 448, 448, 256, 256] + - [47, 13634.0] + - - [64, 6784, 1, 1280, 64, 64, 1280, 1280] + - [54, 12081.0] + - - [128, 2368, 1, 3328, 128, 128, 3328, 3328] + - [56, 14773.0] + - - [5056, 64, 1, 3328, 5056, 5056, 3328, 3328] + - [55, 14780.0] + - - [5056, 128, 1, 3328, 5056, 5056, 3328, 3328] + - [49, 19090.0] + - - [448, 704, 1, 256, 448, 448, 256, 256] + - [47, 8757.0] + - - [1856, 256, 1, 3328, 1856, 1856, 3328, 3328] + - [55, 16195.0] + - - [2944, 128, 1, 3328, 2944, 2944, 3328, 3328] + - [48, 13276.0] + - - [448, 1024, 1, 3328, 448, 448, 3328, 3328] + - [35, 14108.0] + - - [704, 1024, 1, 1280, 704, 704, 1280, 1280] + - [48, 15152.0] + - - [2368, 256, 1, 256, 2368, 2368, 256, 256] + - [46, 12278.0] + - - [256, 2368, 1, 3328, 256, 256, 3328, 3328] + - [41, 17841.0] + - - [704, 448, 1, 1280, 704, 704, 1280, 1280] + - [43, 13031.0] + - - [1024, 704, 1, 1280, 1024, 1024, 1280, 1280] + - [36, 15550.0] + - - [256, 1856, 1, 256, 256, 256, 256, 256] + - [48, 11584.0] + - - [704, 1856, 1, 1280, 704, 704, 1280, 1280] + - [55, 16481.0] + - - [1408, 256, 1, 3328, 1408, 1408, 3328, 3328] + - [40, 12786.0] + - - [2368, 448, 1, 256, 2368, 2368, 256, 256] + - [48, 14189.0] + - - [4288, 256, 1, 3328, 4288, 4288, 3328, 3328] + - [45, 17426.0] + - - [2944, 256, 1, 256, 2944, 2944, 256, 256] + - [40, 13606.0] + - - [1408, 448, 1, 256, 1408, 1408, 256, 256] + - [48, 12981.0] + - - [6784, 64, 1, 1280, 6784, 6784, 1280, 1280] + - [40, 13866.0] + - - [2944, 448, 1, 1280, 2944, 2944, 1280, 1280] + - [45, 16551.0] + - - [128, 2368, 1, 256, 128, 128, 256, 256] + - [55, 8679.0] + - - [448, 1024, 1, 256, 448, 448, 256, 256] + - [47, 10599.0] + - - [1856, 448, 1, 256, 1856, 1856, 256, 256] + - [40, 13075.0] + - - [128, 5056, 1, 1280, 128, 128, 1280, 1280] + - [41, 17625.0] + - - [1408, 256, 1, 256, 1408, 1408, 256, 256] + - [44, 8976.0] + - - [256, 1408, 1, 256, 256, 256, 256, 256] + - [40, 8994.0] + - - [2368, 448, 1, 3328, 2368, 2368, 3328, 3328] + - [53, 16707.0] + - - [128, 5888, 1, 3328, 128, 128, 3328, 3328] + - [52, 16675.0] + - - [3584, 128, 1, 1280, 3584, 3584, 1280, 1280] + - [48, 15197.0] + - - [4288, 128, 1, 256, 4288, 4288, 256, 256] + - [47, 10977.0] + - - [2368, 256, 1, 3328, 2368, 2368, 3328, 3328] + - [45, 17882.0] + - - [5888, 128, 1, 1280, 5888, 5888, 1280, 1280] + - [52, 16219.0] + - - [256, 3584, 1, 1280, 256, 256, 1280, 1280] + - [56, 18390.0] + - - [128, 5888, 1, 256, 128, 128, 256, 256] + - [40, 13841.0] + - - [1024, 1024, 1, 256, 1024, 1024, 256, 256] + - [40, 15098.0] + - - [1024, 1024, 1, 1024, 1024, 1024, 1024, 1024] + - [41, 16244.0] + - - [64, 5888, 1, 1280, 64, 64, 1280, 1280] + - [50, 11617.0] + - - [704, 1024, 1, 256, 704, 704, 256, 256] + - [40, 12763.0] + - - [704, 704, 1, 1280, 704, 704, 1280, 1280] + - [37, 13906.0] + - - [128, 2368, 1, 1280, 128, 128, 1280, 1280] + - [48, 13360.0] + - - [3584, 256, 1, 1280, 3584, 3584, 1280, 1280] + - [41, 18333.0] + - - [5888, 64, 1, 1280, 5888, 5888, 1280, 1280] + - [48, 12509.0] + - - [5056, 128, 1, 1280, 5056, 5056, 1280, 1280] + - [49, 17610.0] + - - [448, 1856, 1, 3328, 448, 448, 3328, 3328] + - [41, 17130.0] + - - [1024, 448, 1, 256, 1024, 1024, 256, 256] + - [47, 11228.0] + - - [2944, 128, 1, 1280, 2944, 2944, 1280, 1280] + - [44, 12647.0] + - - [256, 2944, 1, 1280, 256, 256, 1280, 1280] + - [55, 16384.0] + - - [2560, 128, 1, 2560, 2560, 2560, 2560, 2560] + - [41, 15142.0] + - - [704, 1024, 1, 3328, 704, 704, 3328, 3328] + - [52, 15156.0] + - - [1856, 448, 1, 1280, 1856, 1856, 1280, 1280] + - [45, 16274.0] + - - [128, 6784, 1, 256, 128, 128, 256, 256] + - [41, 14741.0] + - - [704, 1408, 1, 256, 704, 704, 256, 256] + - [40, 14035.0] + - - [4096, 128, 1, 4096, 4096, 4096, 4096, 4096] + - [56, 15550.0] + - - [448, 2944, 1, 1280, 448, 448, 1280, 1280] + - [37, 16571.0] + - - [1856, 256, 1, 256, 1856, 1856, 256, 256] + - [54, 10998.0] + - - [5056, 128, 1, 256, 5056, 5056, 256, 256] + - [38, 12923.0] + - - [448, 2368, 1, 3328, 448, 448, 3328, 3328] + - [37, 16744.0] + - - [1024, 704, 1, 3328, 1024, 1024, 3328, 3328] + - [52, 15097.0] + - - [128, 4288, 1, 1280, 128, 128, 1280, 1280] + - [41, 15018.0] + - - [448, 704, 1, 3328, 448, 448, 3328, 3328] + - [51, 13818.0] + - - [448, 1856, 1, 256, 448, 448, 256, 256] + - [46, 12901.0] + - - [1856, 704, 1, 3328, 1856, 1856, 3328, 3328] + - [45, 16721.0] + - - [1408, 64, 1, 1280, 1408, 1408, 1280, 1280] + - [77, 4719.0] + - - [4096, 32, 1, 4096, 4096, 4096, 4096, 4096] + - [93, 4726.0] + - - [3072, 64, 1, 1024, 3072, 3072, 1024, 1024] + - [79, 5992.0] + - - [2944, 64, 1, 256, 2944, 2944, 256, 256] + - [91, 5999.0] + - - [448, 448, 1, 3328, 448, 448, 3328, 3328] + - [73, 7756.0] + - - [1024, 256, 1, 3328, 1024, 1024, 3328, 3328] + - [85, 7264.0] + - - [6144, 32, 1, 2560, 6144, 6144, 2560, 2560] + - [69, 5550.0] + - - [1856, 64, 1, 1280, 1856, 1856, 1280, 1280] + - [91, 6111.0] + - - [704, 128, 1, 1280, 704, 704, 1280, 1280] + - [77, 4938.0] + - - [1856, 128, 1, 256, 1856, 1856, 256, 256] + - [91, 7508.0] + - - [2944, 64, 1, 1280, 2944, 2944, 1280, 1280] + - [59, 6543.0] + - - [64, 3584, 1, 3328, 64, 64, 3328, 3328] + - [71, 8180.0] + - - [1024, 256, 1, 256, 1024, 1024, 256, 256] + - [66, 6644.0] + - - [448, 448, 1, 256, 448, 448, 256, 256] + - [79, 6281.0] + - - [7680, 32, 1, 2560, 7680, 7680, 2560, 2560] + - [69, 5796.0] + - - [128, 1024, 1, 3328, 128, 128, 3328, 3328] + - [59, 7086.0] + - - [64, 1856, 1, 1280, 64, 64, 1280, 1280] + - [91, 6038.0] + - - [448, 256, 1, 256, 448, 448, 256, 256] + - [91, 4943.0] + - - [256, 1024, 1, 256, 256, 256, 256, 256] + - [91, 6631.0] + - - [1024, 128, 1, 1280, 1024, 1024, 1280, 1280] + - [79, 6904.0] + - - [3072, 32, 1, 1024, 3072, 3072, 1024, 1024] + - [69, 4350.0] + - - [448, 256, 1, 3328, 448, 448, 3328, 3328] + - [85, 6447.0] + - - [128, 704, 1, 1280, 128, 128, 1280, 1280] + - [77, 5140.0] + - - [1856, 128, 1, 3328, 1856, 1856, 3328, 3328] + - [71, 8219.0] + - - [256, 448, 1, 256, 256, 256, 256, 256] + - [90, 5169.0] + - - [8448, 32, 1, 2816, 8448, 8448, 2816, 2816] + - [69, 5514.0] + - - [1408, 128, 1, 1280, 1408, 1408, 1280, 1280] + - [59, 6569.0] + - - [128, 1856, 1, 1280, 128, 128, 1280, 1280] + - [71, 8040.0] + - - [2048, 128, 1, 2048, 2048, 2048, 2048, 2048] + - [91, 6481.0] + - - [2560, 32, 1, 2560, 2560, 2560, 2560, 2560] + - [93, 5061.0] + - - [64, 1408, 1, 3328, 64, 64, 3328, 3328] + - [70, 5348.0] + - - [128, 1408, 1, 256, 128, 128, 256, 256] + - [77, 5885.0] + - - [256, 448, 1, 3328, 256, 256, 3328, 3328] + - [58, 6736.0] + - - [64, 2368, 1, 1280, 64, 64, 1280, 1280] + - [91, 7387.0] + - - [2368, 64, 1, 256, 2368, 2368, 256, 256] + - [91, 6238.0] + - - [704, 128, 1, 3328, 704, 704, 3328, 3328] + - [70, 5310.0] + - - [4288, 64, 1, 1280, 4288, 4288, 1280, 1280] + - [71, 7380.0] + - - [2560, 64, 1, 2560, 2560, 2560, 2560, 2560] + - [81, 6977.0] + - - [128, 1024, 1, 1280, 128, 128, 1280, 1280] + - [71, 6722.0] + - - [1856, 64, 1, 256, 1856, 1856, 256, 256] + - [90, 5189.0] + - - [704, 128, 1, 256, 704, 704, 256, 256] + - [90, 4149.0] + - - [448, 256, 1, 1280, 448, 448, 1280, 1280] + - [91, 6173.0] + - - [1024, 256, 1, 1280, 1024, 1024, 1280, 1280] + - [85, 7381.0] + - - [1856, 128, 1, 1280, 1856, 1856, 1280, 1280] + - [79, 8032.0] + - - [64, 3584, 1, 256, 64, 64, 256, 256] + - [91, 7179.0] + - - [64, 1856, 1, 256, 64, 64, 256, 256] + - [91, 5035.0] + - - [256, 1024, 1, 1280, 256, 256, 1280, 1280] + - [59, 7362.0] + - - [3584, 64, 1, 1280, 3584, 3584, 1280, 1280] + - [59, 7602.0] + - - [1408, 128, 1, 3328, 1408, 1408, 3328, 3328] + - [59, 6751.0] + - - [64, 4288, 1, 3328, 64, 64, 3328, 3328] + - [59, 7308.0] + - - [2368, 64, 1, 3328, 2368, 2368, 3328, 3328] + - [85, 7905.0] + - - [256, 704, 1, 256, 256, 256, 256, 256] + - [77, 5900.0] + - - [128, 1024, 1, 256, 128, 128, 256, 256] + - [66, 5412.0] + - - [64, 2944, 1, 256, 64, 64, 256, 256] + - [91, 5940.0] + - - [64, 1408, 1, 1280, 64, 64, 1280, 1280] + - [84, 4806.0] + - - [1408, 128, 1, 256, 1408, 1408, 256, 256] + - [91, 5796.0] + - - [64, 2944, 1, 1280, 64, 64, 1280, 1280] + - [59, 6543.0] + - - [256, 448, 1, 1280, 256, 256, 1280, 1280] + - [59, 6086.0] + - - [704, 256, 1, 1280, 704, 704, 1280, 1280] + - [85, 6683.0] + - - [64, 2368, 1, 3328, 64, 64, 3328, 3328] + - [59, 7837.0] + - - [256, 704, 1, 3328, 256, 256, 3328, 3328] + - [59, 6907.0] + - - [4096, 64, 1, 4096, 4096, 4096, 4096, 4096] + - [91, 6145.0] + - - [1760, 128, 1, 1760, 1760, 1760, 1760, 1760] + - [59, 8400.0] + - - [64, 2944, 1, 3328, 64, 64, 3328, 3328] + - [71, 6779.0] + - - [128, 1408, 1, 3328, 128, 128, 3328, 3328] + - [58, 6813.0] + - - [1408, 64, 1, 256, 1408, 1408, 256, 256] + - [77, 4005.0] + - - [64, 2368, 1, 256, 64, 64, 256, 256] + - [91, 6278.0] + - - [1024, 128, 1, 3328, 1024, 1024, 3328, 3328] + - [59, 7205.0] + - - [2368, 64, 1, 1280, 2368, 2368, 1280, 1280] + - [91, 7399.0] + - - [4288, 64, 1, 256, 4288, 4288, 256, 256] + - [91, 6755.0] + - - [64, 4288, 1, 1280, 64, 64, 1280, 1280] + - [59, 7380.0] + - - [1408, 64, 1, 3328, 1408, 1408, 3328, 3328] + - [70, 5234.0] + - - [448, 448, 1, 1280, 448, 448, 1280, 1280] + - [79, 6973.0] + - - [3584, 64, 1, 3328, 3584, 3584, 3328, 3328] + - [85, 7993.0] + - - [256, 1024, 1, 3328, 256, 256, 3328, 3328] + - [59, 7366.0] + - - [1856, 64, 1, 3328, 1856, 1856, 3328, 3328] + - [71, 6382.0] + - - [1024, 128, 1, 256, 1024, 1024, 256, 256] + - [66, 5649.0] + - - [4608, 32, 1, 1536, 4608, 4608, 1536, 1536] + - [69, 5143.0] + - - [128, 704, 1, 256, 128, 128, 256, 256] + - [75, 4489.0] + - - [64, 3584, 1, 1280, 64, 64, 1280, 1280] + - [59, 7490.0] + - - [3584, 64, 1, 256, 3584, 3584, 256, 256] + - [66, 6974.0] + - - [64, 1856, 1, 3328, 64, 64, 3328, 3328] + - [58, 6669.0] + - - [2944, 64, 1, 3328, 2944, 2944, 3328, 3328] + - [59, 6692.0] + - - [128, 704, 1, 3328, 128, 128, 3328, 3328] + - [84, 5541.0] + - - [128, 1856, 1, 256, 128, 128, 256, 256] + - [66, 7435.0] + - - [64, 4288, 1, 256, 64, 64, 256, 256] + - [91, 6794.0] + - - [704, 256, 1, 3328, 704, 704, 3328, 3328] + - [71, 6875.0] + - - [256, 704, 1, 1280, 256, 256, 1280, 1280] + - [59, 6675.0] + - - [4288, 64, 1, 3328, 4288, 4288, 3328, 3328] + - [59, 7291.0] + - - [2048, 64, 1, 2048, 2048, 2048, 2048, 2048] + - [81, 5578.0] + - - [64, 1408, 1, 256, 64, 64, 256, 256] + - [90, 3977.0] + - - [128, 1408, 1, 1280, 128, 128, 1280, 1280] + - [59, 6561.0] + - - [128, 1856, 1, 3328, 128, 128, 3328, 3328] + - [59, 8121.0] + - - [1760, 64, 1, 1760, 1760, 1760, 1760, 1760] + - [58, 6462.0] + - - [704, 256, 1, 256, 704, 704, 256, 256] + - [91, 5767.0] + - - [1024, 256, 1, 196, 1024, 1024, 196, 196] + - [59, 6359.0] + - - [256, 1024, 1, 196, 256, 256, 196, 196] + - [59, 6375.0] + - - [1760, 32, 1, 1760, 1760, 1760, 1760, 1760] + - [94, 3775.0] + - - [1760, 16, 1, 1760, 1760, 1760, 1760, 1760] + - [61, 2446.0] + - - [7680, 16, 1, 2560, 7680, 7680, 2560, 2560] + - [95, 3809.0] + - - [8448, 16, 1, 2816, 8448, 8448, 2816, 2816] + - [95, 3631.0] + - - [6144, 16, 1, 2560, 6144, 6144, 2560, 2560] + - [95, 3662.0] + - - [2048, 16, 1, 2048, 2048, 2048, 2048, 2048] + - [96, 2567.0] + - - [3072, 16, 1, 1024, 3072, 3072, 1024, 1024] + - [68, 2940.0] + - - [4096, 16, 1, 4096, 4096, 4096, 4096, 4096] + - [95, 3164.0] + - - [2560, 16, 1, 2560, 2560, 2560, 2560, 2560] + - [95, 3228.0] + - - [2048, 32, 1, 2048, 2048, 2048, 2048, 2048] + - [76, 3633.0] + - - [4608, 16, 1, 1536, 4608, 4608, 1536, 1536] + - [95, 3446.0] + - - [1024, 16, 1, 500000, 1024, 1024, 500000, 500000] + - [82, 1499.0] + - - [1024, 8, 1, 500000, 1024, 1024, 500000, 500000] + - [82, 751.0] + - - [512, 16, 1, 500000, 512, 512, 500000, 500000] + - [62, 968.0] + - - [512, 8, 1, 500000, 512, 512, 500000, 500000] + - [87, 484.0] + - - [128, 256, 1, 1280, 128, 128, 1280, 1280] + - [88, 2958.0] + - - [448, 64, 1, 1280, 448, 448, 1280, 1280] + - [80, 2637.0] + - - [64, 1024, 1, 1280, 64, 64, 1280, 1280] + - [84, 4401.0] + - - [64, 704, 1, 1280, 64, 64, 1280, 1280] + - [88, 3600.0] + - - [128, 448, 1, 256, 128, 128, 256, 256] + - [65, 3291.0] + - - [256, 256, 1, 3328, 256, 256, 3328, 3328] + - [70, 5117.0] + - - [64, 448, 1, 1280, 64, 64, 1280, 1280] + - [74, 2695.0] + - - [64, 64, 1, 3328, 64, 64, 3328, 3328] + - [62, 537.0] + - - [704, 64, 1, 3328, 704, 704, 3328, 3328] + - [83, 3608.0] + - - [64, 128, 1, 256, 64, 64, 256, 256] + - [80, 613.0] + - - [128, 448, 1, 1280, 128, 128, 1280, 1280] + - [78, 4073.0] + - - [704, 64, 1, 1280, 704, 704, 1280, 1280] + - [89, 3381.0] + - - [448, 64, 1, 3328, 448, 448, 3328, 3328] + - [60, 2853.0] + - - [128, 64, 1, 1280, 128, 128, 1280, 1280] + - [92, 940.0] + - - [64, 128, 1, 3328, 64, 64, 3328, 3328] + - [60, 1023.0] + - - [128, 128, 1, 3328, 128, 128, 3328, 3328] + - [60, 1897.0] + - - [256, 128, 1, 256, 256, 256, 256, 256] + - [88, 2255.0] + - - [256, 256, 1, 256, 256, 256, 256, 256] + - [77, 3410.0] + - - [256, 64, 1, 256, 256, 256, 256, 256] + - [88, 1165.0] + - - [64, 1024, 1, 256, 64, 64, 256, 256] + - [75, 3369.0] + - - [64, 704, 1, 256, 64, 64, 256, 256] + - [74, 2708.0] + - - [448, 128, 1, 256, 448, 448, 256, 256] + - [78, 3205.0] + - - [64, 704, 1, 3328, 64, 64, 3328, 3328] + - [88, 3701.0] + - - [64, 448, 1, 3328, 64, 64, 3328, 3328] + - [86, 2850.0] + - - [448, 128, 1, 3328, 448, 448, 3328, 3328] + - [70, 4345.0] + - - [64, 256, 1, 1280, 64, 64, 1280, 1280] + - [60, 1742.0] + - - [64, 64, 1, 256, 64, 64, 256, 256] + - [80, 307.0] + - - [64, 448, 1, 256, 64, 64, 256, 256] + - [63, 2005.0] + - - [256, 128, 1, 1280, 256, 256, 1280, 1280] + - [74, 2937.0] + - - [128, 256, 1, 3328, 128, 128, 3328, 3328] + - [88, 3172.0] + - - [64, 128, 1, 1280, 64, 64, 1280, 1280] + - [67, 936.0] + - - [128, 128, 1, 1280, 128, 128, 1280, 1280] + - [80, 1768.0] + - - [128, 256, 1, 256, 128, 128, 256, 256] + - [88, 2267.0] + - - [256, 64, 1, 1280, 256, 256, 1280, 1280] + - [80, 1748.0] + - - [704, 64, 1, 256, 704, 704, 256, 256] + - [76, 2695.0] + - - [64, 64, 1, 1280, 64, 64, 1280, 1280] + - [80, 478.0] + - - [128, 64, 1, 3328, 128, 128, 3328, 3328] + - [60, 1021.0] + - - [448, 64, 1, 256, 448, 448, 256, 256] + - [89, 1942.0] + - - [1024, 64, 1, 256, 1024, 1024, 256, 256] + - [64, 3355.0] + - - [128, 64, 1, 256, 128, 128, 256, 256] + - [80, 610.0] + - - [1024, 64, 1, 1280, 1024, 1024, 1280, 1280] + - [90, 4420.0] + - - [64, 1024, 1, 3328, 64, 64, 3328, 3328] + - [58, 4934.0] + - - [448, 128, 1, 1280, 448, 448, 1280, 1280] + - [65, 4051.0] + - - [1024, 64, 1, 3328, 1024, 1024, 3328, 3328] + - [58, 4853.0] + - - [64, 256, 1, 3328, 64, 64, 3328, 3328] + - [60, 1891.0] + - - [256, 256, 1, 1280, 256, 256, 1280, 1280] + - [58, 4415.0] + - - [256, 128, 1, 3328, 256, 256, 3328, 3328] + - [88, 3095.0] + - - [64, 256, 1, 256, 64, 64, 256, 256] + - [57, 1146.0] + - - [128, 448, 1, 3328, 128, 128, 3328, 3328] + - [70, 4505.0] + - - [256, 64, 1, 3328, 256, 256, 3328, 3328] + - [60, 1900.0] + - - [128, 128, 1, 256, 128, 128, 256, 256] + - [88, 1159.0] + - - [512, 128, 1, 784, 512, 512, 784, 784] + - [58, 4225.0] + - - [256, 64, 1, 3136, 256, 256, 3136, 3136] + - [86, 1885.0] + - - [64, 256, 1, 3136, 64, 64, 3136, 3136] + - [72, 1872.0] + - - [128, 512, 1, 784, 128, 128, 784, 784] + - [84, 4177.0] + - - [64, 64, 1, 3136, 64, 64, 3136, 3136] + - [62, 536.0] +- null +- null +- DeviceEfficiency +... diff --git a/library/src/blas3/Tensile/Logic/asm_full/navi22_Cijk_Alik_Bljk_HBH.yaml b/library/src/blas3/Tensile/Logic/asm_full/navi22_Cijk_Alik_Bljk_HBH.yaml new file mode 100644 index 000000000..5d52acf61 --- /dev/null +++ b/library/src/blas3/Tensile/Logic/asm_full/navi22_Cijk_Alik_Bljk_HBH.yaml @@ -0,0 +1,60142 @@ +--- +- {MinimumRequiredVersion: 4.28.0} +- navi22 +- gfx1031 +- [Device 73df] +- AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] +- - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 128 + LSPB: 64 + LVCA: 1 + LVCB: 2 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 0 + SolutionNameMin: Cijk_Alik_Bljk_HBH_MT128x64x8_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 128 + LSPB: 128 + LVCA: 1 + LVCB: 1 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 1 + SolutionNameMin: Cijk_Alik_Bljk_HBH_MT128x128x8_SN_SU0_SUM0_TT8_16_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 128 + LSPB: 128 + LVCA: 2 + LVCB: 2 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 2 + SolutionNameMin: Cijk_Alik_Bljk_HBH_MT128x128x8_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 128 + LSPB: 256 + LVCA: 2 + LVCB: 1 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 256 + MacroTileA: 128 + MacroTileB: 256 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 3 + SolutionNameMin: Cijk_Alik_Bljk_HBH_MT128x256x8_SN_SU0_SUM0_TT8_16_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 2 + LVCB: 2 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 4 + SolutionNameMin: Cijk_Alik_Bljk_HBH_MT128x64x16_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 2 + LVCB: 2 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 5 + SolutionNameMin: Cijk_Alik_Bljk_HBH_MT128x128x16_SN_SU0_SUM0_TT8_16_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 128 + LSPB: 128 + LVCA: 2 + LVCB: 2 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 6 + SolutionNameMin: Cijk_Alik_Bljk_HBH_MT128x128x16_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 128 + LSPB: 128 + LVCA: 2 + LVCB: 2 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 256 + MacroTileA: 128 + MacroTileB: 256 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 7 + SolutionNameMin: Cijk_Alik_Bljk_HBH_MT128x256x16_SN_SU0_SUM0_TT8_16_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 8 + SolutionNameMin: Cijk_Alik_Bljk_HBH_MT128x128x32_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 128 + LSPB: 64 + LVCA: 1 + LVCB: 2 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 9 + SolutionNameMin: Cijk_Alik_Bljk_HBH_MT128x64x8_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 128 + LSPB: 128 + LVCA: 1 + LVCB: 1 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 10 + SolutionNameMin: Cijk_Alik_Bljk_HBH_MT128x128x8_SN_SU32_SUM3_TT8_16_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 128 + LSPB: 128 + LVCA: 2 + LVCB: 2 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 11 + SolutionNameMin: Cijk_Alik_Bljk_HBH_MT128x128x8_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 2 + LVCB: 2 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 12 + SolutionNameMin: Cijk_Alik_Bljk_HBH_MT128x64x16_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 2 + LVCB: 2 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 13 + SolutionNameMin: Cijk_Alik_Bljk_HBH_MT128x128x16_SN_SU32_SUM3_TT8_16_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 128 + LSPB: 128 + LVCA: 2 + LVCB: 2 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 14 + SolutionNameMin: Cijk_Alik_Bljk_HBH_MT128x128x16_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 128 + LSPB: 128 + LVCA: 2 + LVCB: 2 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 256 + MacroTileA: 128 + MacroTileB: 256 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 15 + SolutionNameMin: Cijk_Alik_Bljk_HBH_MT128x256x16_SN_SU32_SUM3_TT8_16_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 16 + SolutionNameMin: Cijk_Alik_Bljk_HBH_MT128x128x32_SN_SU32_SUM3_TT8_16_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 28672 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 8192 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 20480 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 256 + MacroTileA: 128 + MacroTileB: 256 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 17 + SolutionNameMin: Cijk_Alik_Bljk_HBH_MT128x256x32_SN_SU32_SUM3_TT8_16_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 128 + LSPB: 64 + LVCA: 1 + LVCB: 2 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 18 + SolutionNameMin: Cijk_Alik_Bljk_HBH_MT128x64x8_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 128 + LSPB: 128 + LVCA: 1 + LVCB: 1 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 19 + SolutionNameMin: Cijk_Alik_Bljk_HBH_MT128x128x8_SN_SU0_SUM0_TT8_16_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 128 + LSPB: 128 + LVCA: 2 + LVCB: 2 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 20 + SolutionNameMin: Cijk_Alik_Bljk_HBH_MT128x128x8_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 128 + LSPB: 256 + LVCA: 2 + LVCB: 1 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 256 + MacroTileA: 128 + MacroTileB: 256 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 21 + SolutionNameMin: Cijk_Alik_Bljk_HBH_MT128x256x8_SN_SU0_SUM0_TT8_16_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 2 + LVCB: 2 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 22 + SolutionNameMin: Cijk_Alik_Bljk_HBH_MT128x64x16_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 2 + LVCB: 2 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 23 + SolutionNameMin: Cijk_Alik_Bljk_HBH_MT128x128x16_SN_SU0_SUM0_TT8_16_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 128 + LSPB: 128 + LVCA: 2 + LVCB: 2 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 24 + SolutionNameMin: Cijk_Alik_Bljk_HBH_MT128x128x16_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 128 + LSPB: 128 + LVCA: 2 + LVCB: 2 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 256 + MacroTileA: 128 + MacroTileB: 256 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 25 + SolutionNameMin: Cijk_Alik_Bljk_HBH_MT128x256x16_SN_SU0_SUM0_TT8_16_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 26 + SolutionNameMin: Cijk_Alik_Bljk_HBH_MT128x128x32_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 28672 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 8192 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 20480 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 256 + MacroTileA: 128 + MacroTileB: 256 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 27 + SolutionNameMin: Cijk_Alik_Bljk_HBH_MT128x256x32_SN_SU0_SUM0_TT8_16_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 128 + LSPB: 64 + LVCA: 1 + LVCB: 2 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 28 + SolutionNameMin: Cijk_Alik_Bljk_HBH_MT128x64x8_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 128 + LSPB: 128 + LVCA: 1 + LVCB: 1 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 29 + SolutionNameMin: Cijk_Alik_Bljk_HBH_MT128x128x8_SN_SU32_SUM3_TT8_16_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 128 + LSPB: 128 + LVCA: 2 + LVCB: 2 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 30 + SolutionNameMin: Cijk_Alik_Bljk_HBH_MT128x128x8_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 2 + LVCB: 2 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 31 + SolutionNameMin: Cijk_Alik_Bljk_HBH_MT128x64x16_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 2 + LVCB: 2 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 32 + SolutionNameMin: Cijk_Alik_Bljk_HBH_MT128x128x16_SN_SU32_SUM3_TT8_16_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 128 + LSPB: 128 + LVCA: 2 + LVCB: 2 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 33 + SolutionNameMin: Cijk_Alik_Bljk_HBH_MT128x128x16_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 128 + LSPB: 128 + LVCA: 2 + LVCB: 2 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 256 + MacroTileA: 128 + MacroTileB: 256 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 34 + SolutionNameMin: Cijk_Alik_Bljk_HBH_MT128x256x16_SN_SU32_SUM3_TT8_16_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 35 + SolutionNameMin: Cijk_Alik_Bljk_HBH_MT128x128x32_SN_SU32_SUM3_TT8_16_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 28672 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 8192 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 20480 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 256 + MacroTileA: 128 + MacroTileB: 256 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 36 + SolutionNameMin: Cijk_Alik_Bljk_HBH_MT128x256x32_SN_SU32_SUM3_TT8_16_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 128 + LSPB: 64 + LVCA: 1 + LVCB: 2 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 37 + SolutionNameMin: Cijk_Alik_Bljk_HBH_MT128x64x8_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 128 + LSPB: 128 + LVCA: 1 + LVCB: 1 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 38 + SolutionNameMin: Cijk_Alik_Bljk_HBH_MT128x128x8_SN_SU0_SUM0_TT8_16_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 128 + LSPB: 128 + LVCA: 2 + LVCB: 2 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 39 + SolutionNameMin: Cijk_Alik_Bljk_HBH_MT128x128x8_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 2 + LVCB: 2 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 40 + SolutionNameMin: Cijk_Alik_Bljk_HBH_MT128x64x16_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 2 + LVCB: 2 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 41 + SolutionNameMin: Cijk_Alik_Bljk_HBH_MT128x128x16_SN_SU0_SUM0_TT8_16_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 128 + LSPB: 128 + LVCA: 2 + LVCB: 2 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 42 + SolutionNameMin: Cijk_Alik_Bljk_HBH_MT128x128x16_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 128 + LSPB: 128 + LVCA: 2 + LVCB: 2 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 256 + MacroTileA: 128 + MacroTileB: 256 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 43 + SolutionNameMin: Cijk_Alik_Bljk_HBH_MT128x256x16_SN_SU0_SUM0_TT8_16_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 44 + SolutionNameMin: Cijk_Alik_Bljk_HBH_MT128x128x32_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 128 + LSPB: 64 + LVCA: 1 + LVCB: 2 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 45 + SolutionNameMin: Cijk_Alik_Bljk_HBH_MT128x64x8_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 128 + LSPB: 128 + LVCA: 1 + LVCB: 1 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 46 + SolutionNameMin: Cijk_Alik_Bljk_HBH_MT128x128x8_SN_SU32_SUM3_TT8_16_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 128 + LSPB: 128 + LVCA: 2 + LVCB: 2 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 47 + SolutionNameMin: Cijk_Alik_Bljk_HBH_MT128x128x8_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 2 + LVCB: 2 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 48 + SolutionNameMin: Cijk_Alik_Bljk_HBH_MT128x64x16_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 2 + LVCB: 2 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 49 + SolutionNameMin: Cijk_Alik_Bljk_HBH_MT128x128x16_SN_SU32_SUM3_TT8_16_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 128 + LSPB: 128 + LVCA: 2 + LVCB: 2 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 50 + SolutionNameMin: Cijk_Alik_Bljk_HBH_MT128x128x16_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 128 + LSPB: 128 + LVCA: 2 + LVCB: 2 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 256 + MacroTileA: 128 + MacroTileB: 256 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 51 + SolutionNameMin: Cijk_Alik_Bljk_HBH_MT128x256x16_SN_SU32_SUM3_TT8_16_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 52 + SolutionNameMin: Cijk_Alik_Bljk_HBH_MT128x128x32_SN_SU32_SUM3_TT8_16_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 53 + SolutionNameMin: Cijk_Alik_Bljk_HBH_MT128x128x32_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 28672 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 8192 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 20480 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 256 + MacroTileA: 128 + MacroTileB: 256 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 54 + SolutionNameMin: Cijk_Alik_Bljk_HBH_MT128x256x32_SN_SU32_SUM3_TT8_16_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 128 + LSPB: 128 + LVCA: 2 + LVCB: 2 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 256 + MacroTileA: 128 + MacroTileB: 256 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 55 + SolutionNameMin: Cijk_Alik_Bljk_HBH_MT128x256x16_SN_SU0_SUM0_TT8_16_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 16 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 56 + SolutionNameMin: Cijk_Alik_Bljk_HBH_MT128x128x32_SN_SU32_SUM3_TT8_16_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 16 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 57 + SolutionNameMin: Cijk_Alik_Bljk_HBH_MT128x128x32_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 16 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 2 + LVCB: 2 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 58 + SolutionNameMin: Cijk_Alik_Bljk_HBH_MT128x128x16_SN_SU0_SUM0_TT8_16_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 16 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 59 + SolutionNameMin: Cijk_Alik_Bljk_HBH_MT128x128x32_SN_SU32_SUM3_TT8_16_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 16 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 60 + SolutionNameMin: Cijk_Alik_Bljk_HBH_MT128x128x32_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 16 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 1 + LVCB: 1 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 61 + SolutionNameMin: Cijk_Alik_Bljk_HBH_MT64x64x8_SN_SU0_SUM0_TT8_8_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 128 + LSPB: 64 + LVCA: 1 + LVCB: 2 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 62 + SolutionNameMin: Cijk_Alik_Bljk_HBH_MT128x64x8_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 128 + LSPB: 128 + LVCA: 2 + LVCB: 2 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 63 + SolutionNameMin: Cijk_Alik_Bljk_HBH_MT128x128x8_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 2 + LVCB: 2 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 64 + SolutionNameMin: Cijk_Alik_Bljk_HBH_MT64x64x16_SN_SU0_SUM0_TT8_8_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 2 + LVCB: 2 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 65 + SolutionNameMin: Cijk_Alik_Bljk_HBH_MT128x64x16_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 128 + LSPB: 128 + LVCA: 2 + LVCB: 2 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 66 + SolutionNameMin: Cijk_Alik_Bljk_HBH_MT128x128x16_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 4 + LVCB: 4 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 67 + SolutionNameMin: Cijk_Alik_Bljk_HBH_MT64x64x32_SN_SU0_SUM0_TT8_8_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 68 + SolutionNameMin: Cijk_Alik_Bljk_HBH_MT128x64x32_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 1 + LVCB: 1 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 69 + SolutionNameMin: Cijk_Alik_Bljk_HBH_MT64x64x8_SN_SU32_SUM3_TT8_8_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 128 + LSPB: 64 + LVCA: 1 + LVCB: 2 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 70 + SolutionNameMin: Cijk_Alik_Bljk_HBH_MT128x64x8_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 128 + LSPB: 128 + LVCA: 2 + LVCB: 2 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 71 + SolutionNameMin: Cijk_Alik_Bljk_HBH_MT128x128x8_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 2 + LVCB: 2 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 72 + SolutionNameMin: Cijk_Alik_Bljk_HBH_MT64x64x16_SN_SU32_SUM3_TT8_8_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 2 + LVCB: 2 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 73 + SolutionNameMin: Cijk_Alik_Bljk_HBH_MT128x64x16_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 128 + LSPB: 128 + LVCA: 2 + LVCB: 2 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 74 + SolutionNameMin: Cijk_Alik_Bljk_HBH_MT128x128x16_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 4 + LVCB: 4 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 75 + SolutionNameMin: Cijk_Alik_Bljk_HBH_MT64x64x32_SN_SU32_SUM3_TT8_8_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 1 + LVCB: 1 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 76 + SolutionNameMin: Cijk_Alik_Bljk_HBH_MT64x64x8_SN_SU0_SUM0_TT8_8_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 128 + LSPB: 64 + LVCA: 1 + LVCB: 2 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 77 + SolutionNameMin: Cijk_Alik_Bljk_HBH_MT128x64x8_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 128 + LSPB: 128 + LVCA: 2 + LVCB: 2 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 78 + SolutionNameMin: Cijk_Alik_Bljk_HBH_MT128x128x8_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 2 + LVCB: 2 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 79 + SolutionNameMin: Cijk_Alik_Bljk_HBH_MT64x64x16_SN_SU0_SUM0_TT8_8_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 2 + LVCB: 2 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 80 + SolutionNameMin: Cijk_Alik_Bljk_HBH_MT128x64x16_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 128 + LSPB: 128 + LVCA: 2 + LVCB: 2 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 81 + SolutionNameMin: Cijk_Alik_Bljk_HBH_MT128x128x16_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 4 + LVCB: 4 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 82 + SolutionNameMin: Cijk_Alik_Bljk_HBH_MT64x64x32_SN_SU0_SUM0_TT8_8_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 83 + SolutionNameMin: Cijk_Alik_Bljk_HBH_MT128x64x32_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 1 + LVCB: 1 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 84 + SolutionNameMin: Cijk_Alik_Bljk_HBH_MT64x64x8_SN_SU32_SUM3_TT8_8_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 128 + LSPB: 64 + LVCA: 1 + LVCB: 2 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 85 + SolutionNameMin: Cijk_Alik_Bljk_HBH_MT128x64x8_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 128 + LSPB: 128 + LVCA: 2 + LVCB: 2 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 86 + SolutionNameMin: Cijk_Alik_Bljk_HBH_MT128x128x8_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 2 + LVCB: 2 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 87 + SolutionNameMin: Cijk_Alik_Bljk_HBH_MT64x64x16_SN_SU32_SUM3_TT8_8_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 2 + LVCB: 2 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 88 + SolutionNameMin: Cijk_Alik_Bljk_HBH_MT128x64x16_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 128 + LSPB: 128 + LVCA: 2 + LVCB: 2 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 89 + SolutionNameMin: Cijk_Alik_Bljk_HBH_MT128x128x16_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 4 + LVCB: 4 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 90 + SolutionNameMin: Cijk_Alik_Bljk_HBH_MT64x64x32_SN_SU32_SUM3_TT8_8_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 91 + SolutionNameMin: Cijk_Alik_Bljk_HBH_MT128x64x32_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 92 + SolutionNameMin: Cijk_Alik_Bljk_HBH_MT128x128x32_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 1 + LVCB: 1 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 93 + SolutionNameMin: Cijk_Alik_Bljk_HBH_MT64x64x8_SN_SU0_SUM0_TT8_8_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 128 + LSPB: 64 + LVCA: 1 + LVCB: 2 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 94 + SolutionNameMin: Cijk_Alik_Bljk_HBH_MT128x64x8_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 128 + LSPB: 128 + LVCA: 2 + LVCB: 2 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 95 + SolutionNameMin: Cijk_Alik_Bljk_HBH_MT128x128x8_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 2 + LVCB: 2 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 96 + SolutionNameMin: Cijk_Alik_Bljk_HBH_MT64x64x16_SN_SU0_SUM0_TT8_8_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 2 + LVCB: 2 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 97 + SolutionNameMin: Cijk_Alik_Bljk_HBH_MT128x64x16_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 128 + LSPB: 128 + LVCA: 2 + LVCB: 2 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 98 + SolutionNameMin: Cijk_Alik_Bljk_HBH_MT128x128x16_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 99 + SolutionNameMin: Cijk_Alik_Bljk_HBH_MT128x64x32_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 1 + LVCB: 1 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 100 + SolutionNameMin: Cijk_Alik_Bljk_HBH_MT64x64x8_SN_SU32_SUM3_TT8_8_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 128 + LSPB: 64 + LVCA: 1 + LVCB: 2 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 101 + SolutionNameMin: Cijk_Alik_Bljk_HBH_MT128x64x8_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 2 + LVCB: 2 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 102 + SolutionNameMin: Cijk_Alik_Bljk_HBH_MT64x64x16_SN_SU32_SUM3_TT8_8_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 2 + LVCB: 2 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 103 + SolutionNameMin: Cijk_Alik_Bljk_HBH_MT128x64x16_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 128 + LSPB: 128 + LVCA: 2 + LVCB: 2 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 104 + SolutionNameMin: Cijk_Alik_Bljk_HBH_MT128x128x16_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 4 + LVCB: 4 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 105 + SolutionNameMin: Cijk_Alik_Bljk_HBH_MT64x64x32_SN_SU32_SUM3_TT8_8_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 106 + SolutionNameMin: Cijk_Alik_Bljk_HBH_MT128x64x32_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 128 + LSPB: 128 + LVCA: 2 + LVCB: 2 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 107 + SolutionNameMin: Cijk_Alik_Bljk_HBH_MT128x128x16_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 16 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 128 + LSPB: 128 + LVCA: 2 + LVCB: 2 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 108 + SolutionNameMin: Cijk_Alik_Bljk_HBH_MT128x128x16_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 16 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 2 + LVCB: 2 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 109 + SolutionNameMin: Cijk_Alik_Bljk_HBH_MT128x64x16_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 16 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 128 + LSPB: 128 + LVCA: 2 + LVCB: 2 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 110 + SolutionNameMin: Cijk_Alik_Bljk_HBH_MT128x128x16_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 16 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 2 + LVCB: 2 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 111 + SolutionNameMin: Cijk_Alik_Bljk_HBH_MT64x64x16_SN_SU32_SUM3_TT8_8_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 16 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 128 + LSPB: 128 + LVCA: 2 + LVCB: 2 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 112 + SolutionNameMin: Cijk_Alik_Bljk_HBH_MT128x128x16_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 16 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 4 + LVCB: 4 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 113 + SolutionNameMin: Cijk_Alik_Bljk_HBH_MT64x64x32_SN_SU32_SUM3_TT8_8_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 16 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 114 + SolutionNameMin: Cijk_Alik_Bljk_HBH_MT128x128x32_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 16 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 2 + LVCB: 2 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 115 + SolutionNameMin: Cijk_Alik_Bljk_HBH_MT64x64x16_SN_SU0_SUM0_TT8_8_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 16 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 128 + LSPB: 128 + LVCA: 2 + LVCB: 2 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 116 + SolutionNameMin: Cijk_Alik_Bljk_HBH_MT128x128x16_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 16 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 117 + SolutionNameMin: Cijk_Alik_Bljk_HBH_MT128x128x32_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 16 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 2 + LVCB: 2 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 118 + SolutionNameMin: Cijk_Alik_Bljk_HBH_MT64x64x16_SN_SU32_SUM3_TT8_8_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 16 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 2 + LVCB: 2 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 119 + SolutionNameMin: Cijk_Alik_Bljk_HBH_MT128x64x16_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 16 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 128 + LSPB: 128 + LVCA: 2 + LVCB: 2 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 120 + SolutionNameMin: Cijk_Alik_Bljk_HBH_MT128x128x16_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 16 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 121 + SolutionNameMin: Cijk_Alik_Bljk_HBH_MT128x128x32_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 16 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 16 + LSPB: 16 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 819 + LdsNumElementsAlignedA: 128 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 256 + LdsOffsetB: 128 + LdsOffsetB_Blk: 384 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 122 + SolutionNameMin: Cijk_Alik_Bljk_HBH_MT16x16x8_SN_SU0_SUM0_TT2_2_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 16 + LSPB: 16 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 123 + SolutionNameMin: Cijk_Alik_Bljk_HBH_MT32x32x8_SN_SU0_SUM0_TT4_4_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 32 + LSPB: 16 + LVCA: 4 + LVCB: 8 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 896 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 124 + SolutionNameMin: Cijk_Alik_Bljk_HBH_MT32x16x8_SN_SU0_SUM0_TT2_2_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 125 + SolutionNameMin: Cijk_Alik_Bljk_HBH_MT64x32x8_SN_SU0_SUM0_TT4_4_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 126 + SolutionNameMin: Cijk_Alik_Bljk_HBH_MT32x32x8_SN_SU0_SUM0_TT2_2_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 127 + SolutionNameMin: Cijk_Alik_Bljk_HBH_MT64x64x8_SN_SU0_SUM0_TT4_4_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 128 + SolutionNameMin: Cijk_Alik_Bljk_HBH_MT16x16x16_SN_SU0_SUM0_TT2_2_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 129 + SolutionNameMin: Cijk_Alik_Bljk_HBH_MT32x32x16_SN_SU0_SUM0_TT4_4_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 130 + SolutionNameMin: Cijk_Alik_Bljk_HBH_MT32x16x16_SN_SU0_SUM0_TT2_2_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 131 + SolutionNameMin: Cijk_Alik_Bljk_HBH_MT32x32x16_SN_SU0_SUM0_TT2_2_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 132 + SolutionNameMin: Cijk_Alik_Bljk_HBH_MT16x16x32_SN_SU0_SUM0_TT2_2_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 133 + SolutionNameMin: Cijk_Alik_Bljk_HBH_MT32x32x32_SN_SU0_SUM0_TT4_4_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 134 + SolutionNameMin: Cijk_Alik_Bljk_HBH_MT32x16x32_SN_SU0_SUM0_TT2_2_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 135 + SolutionNameMin: Cijk_Alik_Bljk_HBH_MT32x32x32_SN_SU0_SUM0_TT2_2_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 16 + LSPB: 16 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 819 + LdsNumElementsAlignedA: 128 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 256 + LdsOffsetB: 128 + LdsOffsetB_Blk: 384 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 136 + SolutionNameMin: Cijk_Alik_Bljk_HBH_MT16x16x8_SN_SU32_SUM3_TT2_2_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 16 + LSPB: 16 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 137 + SolutionNameMin: Cijk_Alik_Bljk_HBH_MT32x32x8_SN_SU32_SUM3_TT4_4_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 32 + LSPB: 16 + LVCA: 4 + LVCB: 8 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 896 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 138 + SolutionNameMin: Cijk_Alik_Bljk_HBH_MT32x16x8_SN_SU32_SUM3_TT2_2_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 139 + SolutionNameMin: Cijk_Alik_Bljk_HBH_MT64x32x8_SN_SU32_SUM3_TT4_4_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 140 + SolutionNameMin: Cijk_Alik_Bljk_HBH_MT32x32x8_SN_SU32_SUM3_TT2_2_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 141 + SolutionNameMin: Cijk_Alik_Bljk_HBH_MT64x64x8_SN_SU32_SUM3_TT4_4_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 142 + SolutionNameMin: Cijk_Alik_Bljk_HBH_MT16x16x16_SN_SU32_SUM3_TT2_2_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 143 + SolutionNameMin: Cijk_Alik_Bljk_HBH_MT32x32x16_SN_SU32_SUM3_TT4_4_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 144 + SolutionNameMin: Cijk_Alik_Bljk_HBH_MT32x16x16_SN_SU32_SUM3_TT2_2_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 145 + SolutionNameMin: Cijk_Alik_Bljk_HBH_MT32x32x16_SN_SU32_SUM3_TT2_2_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 146 + SolutionNameMin: Cijk_Alik_Bljk_HBH_MT16x16x32_SN_SU32_SUM3_TT2_2_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 147 + SolutionNameMin: Cijk_Alik_Bljk_HBH_MT32x32x32_SN_SU32_SUM3_TT4_4_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 148 + SolutionNameMin: Cijk_Alik_Bljk_HBH_MT32x16x32_SN_SU32_SUM3_TT2_2_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 16 + LSPB: 16 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 819 + LdsNumElementsAlignedA: 128 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 256 + LdsOffsetB: 128 + LdsOffsetB_Blk: 384 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 149 + SolutionNameMin: Cijk_Alik_Bljk_HBH_MT16x16x8_SN_SU0_SUM0_TT2_2_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 16 + LSPB: 16 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 150 + SolutionNameMin: Cijk_Alik_Bljk_HBH_MT32x32x8_SN_SU0_SUM0_TT4_4_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 32 + LSPB: 16 + LVCA: 4 + LVCB: 8 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 896 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 151 + SolutionNameMin: Cijk_Alik_Bljk_HBH_MT32x16x8_SN_SU0_SUM0_TT2_2_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 152 + SolutionNameMin: Cijk_Alik_Bljk_HBH_MT64x32x8_SN_SU0_SUM0_TT4_4_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 153 + SolutionNameMin: Cijk_Alik_Bljk_HBH_MT32x32x8_SN_SU0_SUM0_TT2_2_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 154 + SolutionNameMin: Cijk_Alik_Bljk_HBH_MT64x64x8_SN_SU0_SUM0_TT4_4_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 155 + SolutionNameMin: Cijk_Alik_Bljk_HBH_MT16x16x16_SN_SU0_SUM0_TT2_2_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 156 + SolutionNameMin: Cijk_Alik_Bljk_HBH_MT32x32x16_SN_SU0_SUM0_TT4_4_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 157 + SolutionNameMin: Cijk_Alik_Bljk_HBH_MT32x16x16_SN_SU0_SUM0_TT2_2_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 158 + SolutionNameMin: Cijk_Alik_Bljk_HBH_MT32x32x16_SN_SU0_SUM0_TT2_2_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 159 + SolutionNameMin: Cijk_Alik_Bljk_HBH_MT16x16x32_SN_SU0_SUM0_TT2_2_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 160 + SolutionNameMin: Cijk_Alik_Bljk_HBH_MT32x16x32_SN_SU0_SUM0_TT2_2_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 16 + LSPB: 16 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 819 + LdsNumElementsAlignedA: 128 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 256 + LdsOffsetB: 128 + LdsOffsetB_Blk: 384 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 161 + SolutionNameMin: Cijk_Alik_Bljk_HBH_MT16x16x8_SN_SU32_SUM3_TT2_2_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 16 + LSPB: 16 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 162 + SolutionNameMin: Cijk_Alik_Bljk_HBH_MT32x32x8_SN_SU32_SUM3_TT4_4_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 32 + LSPB: 16 + LVCA: 4 + LVCB: 8 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 896 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 163 + SolutionNameMin: Cijk_Alik_Bljk_HBH_MT32x16x8_SN_SU32_SUM3_TT2_2_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 164 + SolutionNameMin: Cijk_Alik_Bljk_HBH_MT64x32x8_SN_SU32_SUM3_TT4_4_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 165 + SolutionNameMin: Cijk_Alik_Bljk_HBH_MT32x32x8_SN_SU32_SUM3_TT2_2_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 166 + SolutionNameMin: Cijk_Alik_Bljk_HBH_MT64x64x8_SN_SU32_SUM3_TT4_4_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 167 + SolutionNameMin: Cijk_Alik_Bljk_HBH_MT16x16x16_SN_SU32_SUM3_TT2_2_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 168 + SolutionNameMin: Cijk_Alik_Bljk_HBH_MT32x32x16_SN_SU32_SUM3_TT4_4_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 169 + SolutionNameMin: Cijk_Alik_Bljk_HBH_MT32x16x16_SN_SU32_SUM3_TT2_2_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 170 + SolutionNameMin: Cijk_Alik_Bljk_HBH_MT64x32x16_SN_SU32_SUM3_TT4_4_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 171 + SolutionNameMin: Cijk_Alik_Bljk_HBH_MT16x16x32_SN_SU32_SUM3_TT2_2_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 172 + SolutionNameMin: Cijk_Alik_Bljk_HBH_MT32x16x32_SN_SU32_SUM3_TT2_2_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 173 + SolutionNameMin: Cijk_Alik_Bljk_HBH_MT32x32x32_SN_SU32_SUM3_TT2_2_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 16 + LSPB: 16 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 819 + LdsNumElementsAlignedA: 128 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 256 + LdsOffsetB: 128 + LdsOffsetB_Blk: 384 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 174 + SolutionNameMin: Cijk_Alik_Bljk_HBH_MT16x16x8_SN_SU0_SUM0_TT2_2_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 16 + LSPB: 16 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 175 + SolutionNameMin: Cijk_Alik_Bljk_HBH_MT32x32x8_SN_SU0_SUM0_TT4_4_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 32 + LSPB: 16 + LVCA: 4 + LVCB: 8 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 896 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 176 + SolutionNameMin: Cijk_Alik_Bljk_HBH_MT32x16x8_SN_SU0_SUM0_TT2_2_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 177 + SolutionNameMin: Cijk_Alik_Bljk_HBH_MT64x32x8_SN_SU0_SUM0_TT4_4_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 178 + SolutionNameMin: Cijk_Alik_Bljk_HBH_MT32x32x8_SN_SU0_SUM0_TT2_2_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 179 + SolutionNameMin: Cijk_Alik_Bljk_HBH_MT64x64x8_SN_SU0_SUM0_TT4_4_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 180 + SolutionNameMin: Cijk_Alik_Bljk_HBH_MT16x16x16_SN_SU0_SUM0_TT2_2_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 181 + SolutionNameMin: Cijk_Alik_Bljk_HBH_MT32x32x16_SN_SU0_SUM0_TT4_4_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 182 + SolutionNameMin: Cijk_Alik_Bljk_HBH_MT32x16x16_SN_SU0_SUM0_TT2_2_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 183 + SolutionNameMin: Cijk_Alik_Bljk_HBH_MT64x32x16_SN_SU0_SUM0_TT4_4_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 184 + SolutionNameMin: Cijk_Alik_Bljk_HBH_MT32x32x16_SN_SU0_SUM0_TT2_2_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 185 + SolutionNameMin: Cijk_Alik_Bljk_HBH_MT64x64x16_SN_SU0_SUM0_TT4_4_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 186 + SolutionNameMin: Cijk_Alik_Bljk_HBH_MT16x16x32_SN_SU0_SUM0_TT2_2_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 187 + SolutionNameMin: Cijk_Alik_Bljk_HBH_MT32x32x32_SN_SU0_SUM0_TT2_2_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 16 + LSPB: 16 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 819 + LdsNumElementsAlignedA: 128 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 256 + LdsOffsetB: 128 + LdsOffsetB_Blk: 384 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 188 + SolutionNameMin: Cijk_Alik_Bljk_HBH_MT16x16x8_SN_SU32_SUM3_TT2_2_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 16 + LSPB: 16 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 189 + SolutionNameMin: Cijk_Alik_Bljk_HBH_MT32x32x8_SN_SU32_SUM3_TT4_4_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 32 + LSPB: 16 + LVCA: 4 + LVCB: 8 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 896 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 190 + SolutionNameMin: Cijk_Alik_Bljk_HBH_MT32x16x8_SN_SU32_SUM3_TT2_2_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 191 + SolutionNameMin: Cijk_Alik_Bljk_HBH_MT64x32x8_SN_SU32_SUM3_TT4_4_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 192 + SolutionNameMin: Cijk_Alik_Bljk_HBH_MT32x32x8_SN_SU32_SUM3_TT2_2_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 193 + SolutionNameMin: Cijk_Alik_Bljk_HBH_MT64x64x8_SN_SU32_SUM3_TT4_4_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 194 + SolutionNameMin: Cijk_Alik_Bljk_HBH_MT16x16x16_SN_SU32_SUM3_TT2_2_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 195 + SolutionNameMin: Cijk_Alik_Bljk_HBH_MT32x32x16_SN_SU32_SUM3_TT4_4_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 196 + SolutionNameMin: Cijk_Alik_Bljk_HBH_MT32x16x16_SN_SU32_SUM3_TT2_2_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 197 + SolutionNameMin: Cijk_Alik_Bljk_HBH_MT16x16x32_SN_SU32_SUM3_TT2_2_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 16 + LSPB: 16 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 198 + SolutionNameMin: Cijk_Alik_Bljk_HBH_MT32x32x8_SN_SU0_SUM0_TT4_4_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 16 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 32 + LSPB: 16 + LVCA: 4 + LVCB: 8 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 896 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 199 + SolutionNameMin: Cijk_Alik_Bljk_HBH_MT32x16x8_SN_SU0_SUM0_TT2_2_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 16 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 200 + SolutionNameMin: Cijk_Alik_Bljk_HBH_MT64x32x8_SN_SU0_SUM0_TT4_4_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 16 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 201 + SolutionNameMin: Cijk_Alik_Bljk_HBH_MT64x64x8_SN_SU0_SUM0_TT4_4_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 16 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 202 + SolutionNameMin: Cijk_Alik_Bljk_HBH_MT16x16x16_SN_SU0_SUM0_TT2_2_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 16 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 203 + SolutionNameMin: Cijk_Alik_Bljk_HBH_MT32x32x16_SN_SU0_SUM0_TT4_4_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 16 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 204 + SolutionNameMin: Cijk_Alik_Bljk_HBH_MT32x16x16_SN_SU0_SUM0_TT2_2_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 16 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 205 + SolutionNameMin: Cijk_Alik_Bljk_HBH_MT16x16x32_SN_SU0_SUM0_TT2_2_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 16 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 206 + SolutionNameMin: Cijk_Alik_Bljk_HBH_MT64x64x8_SN_SU32_SUM3_TT4_4_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 16 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 16 + LSPB: 16 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 207 + SolutionNameMin: Cijk_Alik_Bljk_HBH_MT32x32x8_SN_SU0_SUM0_TT4_4_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 16 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 208 + SolutionNameMin: Cijk_Alik_Bljk_HBH_MT64x64x8_SN_SU0_SUM0_TT4_4_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 16 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 209 + SolutionNameMin: Cijk_Alik_Bljk_HBH_MT16x16x16_SN_SU0_SUM0_TT2_2_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 16 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 210 + SolutionNameMin: Cijk_Alik_Bljk_HBH_MT16x16x32_SN_SU0_SUM0_TT2_2_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 16 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 16 + LSPB: 16 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 211 + SolutionNameMin: Cijk_Alik_Bljk_HBH_MT32x32x8_SN_SU32_SUM3_TT4_4_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 16 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 212 + SolutionNameMin: Cijk_Alik_Bljk_HBH_MT64x64x8_SN_SU32_SUM3_TT4_4_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 16 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 213 + SolutionNameMin: Cijk_Alik_Bljk_HBH_MT16x16x16_SN_SU32_SUM3_TT2_2_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 16 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 16 + LSPB: 16 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 214 + SolutionNameMin: Cijk_Alik_Bljk_HBH_MT32x32x8_SN_SU0_SUM0_TT4_4_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 16 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 215 + SolutionNameMin: Cijk_Alik_Bljk_HBH_MT64x32x8_SN_SU0_SUM0_TT4_4_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 16 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 216 + SolutionNameMin: Cijk_Alik_Bljk_HBH_MT64x64x8_SN_SU0_SUM0_TT4_4_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 16 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 217 + SolutionNameMin: Cijk_Alik_Bljk_HBH_MT16x16x16_SN_SU0_SUM0_TT2_2_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 16 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 218 + SolutionNameMin: Cijk_Alik_Bljk_HBH_MT16x16x32_SN_SU0_SUM0_TT2_2_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 16 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 219 + SolutionNameMin: Cijk_Alik_Bljk_HBH_MT64x32x8_SN_SU32_SUM3_TT4_4_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 16 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 220 + SolutionNameMin: Cijk_Alik_Bljk_HBH_MT16x16x16_SN_SU32_SUM3_TT2_2_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 16 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 221 + SolutionNameMin: Cijk_Alik_Bljk_HBH_MT32x16x32_SN_SU32_SUM3_TT2_2_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 16 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 16 + LSPB: 8 + LVCA: 4 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 832 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 222 + SolutionNameMin: Cijk_Alik_Bljk_HBH_MT32x8x8_SN_SU0_SUM0_TT2_2_WG16_4_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 16 + LSPB: 8 + LVCA: 4 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1600 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 8 + MacroTileA: 64 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 223 + SolutionNameMin: Cijk_Alik_Bljk_HBH_MT64x8x8_SN_SU0_SUM0_TT4_2_WG16_4_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1664 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 224 + SolutionNameMin: Cijk_Alik_Bljk_HBH_MT32x8x16_SN_SU0_SUM0_TT2_2_WG16_4_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 16 + LSPB: 8 + LVCA: 4 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 832 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 225 + SolutionNameMin: Cijk_Alik_Bljk_HBH_MT32x8x8_SN_SU32_SUM3_TT2_2_WG16_4_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1664 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 226 + SolutionNameMin: Cijk_Alik_Bljk_HBH_MT32x8x16_SN_SU32_SUM3_TT2_2_WG16_4_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 8 + LVCA: 8 + LVCB: 16 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3200 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 8 + MacroTileA: 64 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 227 + SolutionNameMin: Cijk_Alik_Bljk_HBH_MT64x8x16_SN_SU32_SUM3_TT2_2_WG32_4_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 32 + SubGroup1: 4 + SubGroupA: 32 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1664 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 228 + SolutionNameMin: Cijk_Alik_Bljk_HBH_MT32x8x16_SN_SU0_SUM0_TT2_2_WG16_4_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1664 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 229 + SolutionNameMin: Cijk_Alik_Bljk_HBH_MT32x8x16_SN_SU32_SUM3_TT2_2_WG16_4_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 16 + LSPB: 8 + LVCA: 4 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 832 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 230 + SolutionNameMin: Cijk_Alik_Bljk_HBH_MT32x8x8_SN_SU0_SUM0_TT2_2_WG16_4_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 32 + LSPB: 16 + LVCA: 4 + LVCB: 8 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1664 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 231 + SolutionNameMin: Cijk_Alik_Bljk_HBH_MT64x16x8_SN_SU0_SUM0_TT4_2_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1664 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 232 + SolutionNameMin: Cijk_Alik_Bljk_HBH_MT32x8x16_SN_SU32_SUM3_TT2_2_WG16_4_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3328 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 8 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 233 + SolutionNameMin: Cijk_Alik_Bljk_HBH_MT32x8x32_SN_SU32_SUM3_TT2_2_WG16_4_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 256 + LdsOffsetB_Blk: 1280 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 32 + MacroTileA: 16 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 234 + SolutionNameMin: Cijk_Alik_Bljk_HBH_MT16x32x16_SN_SU0_SUM0_TT2_4_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 16 + LSPB: 16 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 896 + LdsNumElementsAlignedA: 128 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 128 + LdsOffsetB_Blk: 640 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 32 + MacroTileA: 16 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 235 + SolutionNameMin: Cijk_Alik_Bljk_HBH_MT16x32x8_SN_SU32_SUM3_TT2_4_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 236 + SolutionNameMin: Cijk_Alik_Bljk_HBH_MT32x32x8_SN_SU32_SUM3_TT2_4_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 237 + SolutionNameMin: Cijk_Alik_Bljk_HBH_MT32x32x8_SN_SU0_SUM0_TT2_4_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 256 + LdsOffsetB_Blk: 1280 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 32 + MacroTileA: 16 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 238 + SolutionNameMin: Cijk_Alik_Bljk_HBH_MT16x32x16_SN_SU0_SUM0_TT2_4_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 16 + LSPB: 16 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 896 + LdsNumElementsAlignedA: 128 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 128 + LdsOffsetB_Blk: 640 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 32 + MacroTileA: 16 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 239 + SolutionNameMin: Cijk_Alik_Bljk_HBH_MT16x32x8_SN_SU32_SUM3_TT2_4_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 256 + LdsOffsetB_Blk: 1280 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 32 + MacroTileA: 16 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 240 + SolutionNameMin: Cijk_Alik_Bljk_HBH_MT16x32x16_SN_SU32_SUM3_TT2_4_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 241 + SolutionNameMin: Cijk_Alik_Bljk_HBH_MT32x32x8_SN_SU0_SUM0_TT2_4_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 16 + LSPB: 16 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 896 + LdsNumElementsAlignedA: 128 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 128 + LdsOffsetB_Blk: 640 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 32 + MacroTileA: 16 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 242 + SolutionNameMin: Cijk_Alik_Bljk_HBH_MT16x32x8_SN_SU32_SUM3_TT2_4_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 256 + LdsOffsetB_Blk: 1280 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 32 + MacroTileA: 16 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 243 + SolutionNameMin: Cijk_Alik_Bljk_HBH_MT16x32x16_SN_SU32_SUM3_TT2_4_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 +- [2, 3, 0, 1] +- - - [2368, 1024, 1, 1, 2368, 2368, 1, 1] + - [2, 312.0] + - - [5056, 1408, 1, 3328, 5056, 5056, 3328, 3328] + - [5, 20679.0] + - - [5056, 1856, 1, 3328, 5056, 5056, 3328, 3328] + - [5, 19930.0] + - - [448, 3584, 1, 3328, 448, 448, 3328, 3328] + - [7, 16924.0] + - - [5056, 4288, 1, 32, 5056, 5056, 32, 32] + - [18, 15056.0] + - - [3584, 1024, 1, 256, 3584, 3584, 256, 256] + - [32, 18124.0] + - - [1408, 3584, 1, 3328, 1408, 1408, 3328, 3328] + - [7, 20760.0] + - - [1024, 2368, 1, 3328, 1024, 1024, 3328, 3328] + - [7, 19486.0] + - - [448, 3584, 1, 32, 448, 448, 32, 32] + - [28, 4866.0] + - - [4288, 6784, 1, 3328, 4288, 4288, 3328, 3328] + - [7, 21216.0] + - - [5888, 4288, 1, 3328, 5888, 5888, 3328, 3328] + - [15, 21550.0] + - - [2368, 1408, 1, 32, 2368, 2368, 32, 32] + - [45, 7776.0] + - - [1024, 2944, 1, 1, 1024, 1024, 1, 1] + - [22, 302.0] + - - [2944, 3584, 1, 3328, 2944, 2944, 3328, 3328] + - [7, 21001.0] + - - [2368, 2944, 1, 1280, 2368, 2368, 1280, 1280] + - [32, 20180.0] + - - [6784, 5888, 1, 3328, 6784, 6784, 3328, 3328] + - [15, 22111.0] + - - [3584, 1408, 1, 32, 3584, 3584, 32, 32] + - [11, 8367.0] + - - [5056, 256, 1, 256, 5056, 5056, 256, 256] + - [14, 15615.0] + - - [1856, 2368, 1, 256, 1856, 1856, 256, 256] + - [41, 17586.0] + - - [2368, 1024, 1, 3328, 2368, 2368, 3328, 3328] + - [7, 19873.0] + - - [3584, 4288, 1, 32, 3584, 3584, 32, 32] + - [31, 13940.0] + - - [3584, 3584, 1, 1, 3584, 3584, 1, 1] + - [30, 475.0] + - - [1408, 2368, 1, 1, 1408, 1408, 1, 1] + - [45, 317.0] + - - [5056, 6784, 1, 1280, 5056, 5056, 1280, 1280] + - [34, 21293.0] + - - [4288, 5056, 1, 1, 4288, 4288, 1, 1] + - [45, 454.0] + - - [5056, 4288, 1, 1, 5056, 5056, 1, 1] + - [28, 531.0] + - - [1408, 4288, 1, 1280, 1408, 1408, 1280, 1280] + - [49, 20254.0] + - - [4288, 1024, 1, 3328, 4288, 4288, 3328, 3328] + - [7, 20149.0] + - - [1024, 5056, 1, 1, 1024, 1024, 1, 1] + - [22, 384.0] + - - [704, 3584, 1, 1280, 704, 704, 1280, 1280] + - [13, 17025.0] + - - [1856, 5056, 1, 256, 1856, 1856, 256, 256] + - [23, 19102.0] + - - [1408, 1024, 1, 1280, 1408, 1408, 1280, 1280] + - [13, 17038.0] + - - [5056, 5888, 1, 3328, 5056, 5056, 3328, 3328] + - [25, 21703.0] + - - [3584, 3584, 1, 1280, 3584, 3584, 1280, 1280] + - [7, 21556.0] + - - [2368, 3584, 1, 32, 2368, 2368, 32, 32] + - [37, 10855.0] + - - [2944, 2368, 1, 1, 2944, 2944, 1, 1] + - [31, 349.0] + - - [704, 4288, 1, 1, 704, 704, 1, 1] + - [4, 316.0] + - - [1024, 6784, 1, 1280, 1024, 1024, 1280, 1280] + - [7, 20729.0] + - - [1024, 3584, 1, 1, 1024, 1024, 1, 1] + - [18, 324.0] + - - [256, 5056, 1, 32, 256, 256, 32, 32] + - [0, 8284.0] + - - [2368, 5056, 1, 32, 2368, 2368, 32, 32] + - [18, 11847.0] + - - [6784, 1856, 1, 32, 6784, 6784, 32, 32] + - [22, 10515.0] + - - [5056, 704, 1, 1, 5056, 5056, 1, 1] + - [18, 350.0] + - - [2944, 4288, 1, 256, 2944, 2944, 256, 256] + - [23, 19788.0] + - - [5056, 704, 1, 32, 5056, 5056, 32, 32] + - [45, 9306.0] + - - [1856, 4288, 1, 3328, 1856, 1856, 3328, 3328] + - [25, 20409.0] + - - [6784, 4288, 1, 32, 6784, 6784, 32, 32] + - [18, 14880.0] + - - [5888, 5056, 1, 256, 5888, 5888, 256, 256] + - [25, 20629.0] + - - [3584, 2368, 1, 3328, 3584, 3584, 3328, 3328] + - [5, 20426.0] + - - [4288, 1856, 1, 1, 4288, 4288, 1, 1] + - [50, 401.0] + - - [1856, 2944, 1, 1, 1856, 1856, 1, 1] + - [28, 380.0] + - - [1856, 2368, 1, 32, 1856, 1856, 32, 32] + - [28, 8992.0] + - - [4288, 1856, 1, 32, 4288, 4288, 32, 32] + - [0, 11901.0] + - - [5056, 2368, 1, 256, 5056, 5056, 256, 256] + - [13, 19286.0] + - - [1408, 5888, 1, 256, 1408, 1408, 256, 256] + - [7, 19988.0] + - - [5056, 6784, 1, 1, 5056, 5056, 1, 1] + - [40, 833.0] + - - [1024, 1408, 1, 3328, 1024, 1024, 3328, 3328] + - [13, 17672.0] + - - [256, 5056, 1, 1280, 256, 256, 1280, 1280] + - [51, 18445.0] + - - [704, 2368, 1, 1, 704, 704, 1, 1] + - [48, 189.0] + - - [3584, 4288, 1, 1280, 3584, 3584, 1280, 1280] + - [25, 21338.0] + - - [3584, 2368, 1, 1, 3584, 3584, 1, 1] + - [37, 355.0] + - - [4288, 448, 1, 3328, 4288, 4288, 3328, 3328] + - [32, 16914.0] + - - [704, 6784, 1, 1280, 704, 704, 1280, 1280] + - [13, 18647.0] + - - [2368, 4288, 1, 32, 2368, 2368, 32, 32] + - [18, 12402.0] + - - [704, 5056, 1, 1280, 704, 704, 1280, 1280] + - [7, 18567.0] + - - [3584, 6784, 1, 32, 3584, 3584, 32, 32] + - [29, 15202.0] + - - [3584, 6784, 1, 1280, 3584, 3584, 1280, 1280] + - [7, 21506.0] + - - [4288, 4288, 1, 3328, 4288, 4288, 3328, 3328] + - [34, 19814.0] + - - [1408, 3584, 1, 1, 1408, 1408, 1, 1] + - [20, 312.0] + - - [4288, 1856, 1, 3328, 4288, 4288, 3328, 3328] + - [49, 19622.0] + - - [1856, 2944, 1, 1280, 1856, 1856, 1280, 1280] + - [32, 19422.0] + - - [5056, 1024, 1, 3328, 5056, 5056, 3328, 3328] + - [7, 21004.0] + - - [3584, 704, 1, 1, 3584, 3584, 1, 1] + - [48, 277.0] + - - [448, 5056, 1, 1, 448, 448, 1, 1] + - [31, 217.0] + - - [5888, 5888, 1, 256, 5888, 5888, 256, 256] + - [43, 21240.0] + - - [3584, 704, 1, 32, 3584, 3584, 32, 32] + - [37, 10708.0] + - - [448, 6784, 1, 3328, 448, 448, 3328, 3328] + - [13, 17695.0] + - - [6784, 4288, 1, 1, 6784, 6784, 1, 1] + - [9, 848.0] + - - [3584, 6784, 1, 1, 3584, 3584, 1, 1] + - [40, 806.0] + - - [1408, 2368, 1, 32, 1408, 1408, 32, 32] + - [37, 8688.0] + - - [448, 5056, 1, 32, 448, 448, 32, 32] + - [45, 5931.0] + - - [4288, 4288, 1, 1280, 4288, 4288, 1280, 1280] + - [7, 21251.0] + - - [6784, 1408, 1, 1, 6784, 6784, 1, 1] + - [11, 412.0] + - - [1856, 5888, 1, 3328, 1856, 1856, 3328, 3328] + - [15, 19948.0] + - - [3584, 1856, 1, 3328, 3584, 3584, 3328, 3328] + - [32, 20180.0] + - - [5056, 5888, 1, 1, 5056, 5056, 1, 1] + - [18, 771.0] + - - [2944, 1024, 1, 256, 2944, 2944, 256, 256] + - [41, 16799.0] + - - [2368, 4288, 1, 3328, 2368, 2368, 3328, 3328] + - [7, 20296.0] + - - [2944, 6784, 1, 256, 2944, 2944, 256, 256] + - [41, 20749.0] + - - [2368, 2368, 1, 1280, 2368, 2368, 1280, 1280] + - [32, 18957.0] + - - [3584, 3584, 1, 32, 3584, 3584, 32, 32] + - [28, 13812.0] + - - [2944, 2944, 1, 1280, 2944, 2944, 1280, 1280] + - [13, 20807.0] + - - [1408, 5056, 1, 1, 1408, 1408, 1, 1] + - [12, 383.0] + - - [2368, 6784, 1, 1, 2368, 2368, 1, 1] + - [0, 420.0] + - - [6784, 4288, 1, 1280, 6784, 6784, 1280, 1280] + - [25, 21574.0] + - - [2944, 704, 1, 256, 2944, 2944, 256, 256] + - [23, 15972.0] + - - [2368, 6784, 1, 1280, 2368, 2368, 1280, 1280] + - [7, 20847.0] + - - [704, 2944, 1, 3328, 704, 704, 3328, 3328] + - [32, 18181.0] + - - [5888, 256, 1, 1, 5888, 5888, 1, 1] + - [24, 222.0] + - - [5056, 6784, 1, 32, 5056, 5056, 32, 32] + - [28, 14709.0] + - - [448, 5056, 1, 1280, 448, 448, 1280, 1280] + - [15, 17561.0] + - - [256, 5888, 1, 3328, 256, 256, 3328, 3328] + - [32, 18102.0] + - - [5888, 1024, 1, 1, 5888, 5888, 1, 1] + - [20, 378.0] + - - [5888, 448, 1, 32, 5888, 5888, 32, 32] + - [37, 7918.0] + - - [6784, 2944, 1, 256, 6784, 6784, 256, 256] + - [23, 20737.0] + - - [4288, 2944, 1, 256, 4288, 4288, 256, 256] + - [32, 19790.0] + - - [448, 5888, 1, 3328, 448, 448, 3328, 3328] + - [7, 17510.0] + - - [1408, 4288, 1, 1, 1408, 1408, 1, 1] + - [28, 333.0] + - - [1408, 1856, 1, 3328, 1408, 1408, 3328, 3328] + - [13, 18309.0] + - - [3584, 1024, 1, 3328, 3584, 3584, 3328, 3328] + - [25, 19687.0] + - - [2944, 5888, 1, 3328, 2944, 2944, 3328, 3328] + - [7, 21751.0] + - - [448, 4288, 1, 3328, 448, 448, 3328, 3328] + - [32, 16452.0] + - - [704, 2368, 1, 256, 704, 704, 256, 256] + - [13, 13617.0] + - - [4288, 3584, 1, 3328, 4288, 4288, 3328, 3328] + - [34, 20421.0] + - - [1408, 1024, 1, 1, 1408, 1408, 1, 1] + - [2, 174.0] + - - [1408, 1024, 1, 256, 1408, 1408, 256, 256] + - [23, 14024.0] + - - [5056, 3584, 1, 1, 5056, 5056, 1, 1] + - [22, 741.0] + - - [6784, 6784, 1, 3328, 6784, 6784, 3328, 3328] + - [15, 21615.0] + - - [2368, 2944, 1, 3328, 2368, 2368, 3328, 3328] + - [5, 20097.0] + - - [5056, 3584, 1, 32, 5056, 5056, 32, 32] + - [18, 14157.0] + - - [5056, 3584, 1, 1280, 5056, 5056, 1280, 1280] + - [25, 21581.0] + - - [1856, 1856, 1, 256, 1856, 1856, 256, 256] + - [5, 16765.0] + - - [5888, 4288, 1, 1, 5888, 5888, 1, 1] + - [48, 827.0] + - - [5056, 704, 1, 256, 5056, 5056, 256, 256] + - [25, 15560.0] + - - [2368, 5056, 1, 256, 2368, 2368, 256, 256] + - [32, 19394.0] + - - [1024, 5056, 1, 256, 1024, 1024, 256, 256] + - [32, 18956.0] + - - [5888, 448, 1, 256, 5888, 5888, 256, 256] + - [32, 15107.0] + - - [6784, 5056, 1, 1280, 6784, 6784, 1280, 1280] + - [43, 21563.0] + - - [4288, 6784, 1, 1280, 4288, 4288, 1280, 1280] + - [7, 21227.0] + - - [704, 6784, 1, 3328, 704, 704, 3328, 3328] + - [5, 18936.0] + - - [2944, 1856, 1, 1, 2944, 2944, 1, 1] + - [39, 368.0] + - - [5888, 4288, 1, 1280, 5888, 5888, 1280, 1280] + - [25, 21429.0] + - - [5888, 3584, 1, 1280, 5888, 5888, 1280, 1280] + - [25, 21745.0] + - - [3584, 1408, 1, 1280, 3584, 3584, 1280, 1280] + - [32, 20130.0] + - - [1024, 2944, 1, 256, 1024, 1024, 256, 256] + - [13, 17865.0] + - - [2944, 1856, 1, 1280, 2944, 2944, 1280, 1280] + - [49, 19439.0] + - - [1024, 2368, 1, 1, 1024, 1024, 1, 1] + - [31, 313.0] + - - [2944, 3584, 1, 1280, 2944, 2944, 1280, 1280] + - [7, 20990.0] + - - [1856, 4288, 1, 256, 1856, 1856, 256, 256] + - [41, 18785.0] + - - [448, 3584, 1, 1, 448, 448, 1, 1] + - [18, 247.0] + - - [2368, 2944, 1, 32, 2368, 2368, 32, 32] + - [33, 10328.0] + - - [4288, 704, 1, 256, 4288, 4288, 256, 256] + - [32, 16067.0] + - - [1856, 1024, 1, 256, 1856, 1856, 256, 256] + - [5, 16674.0] + - - [704, 6784, 1, 32, 704, 704, 32, 32] + - [9, 10497.0] + - - [1024, 4288, 1, 1, 1024, 1024, 1, 1] + - [1, 367.0] + - - [1408, 5888, 1, 1280, 1408, 1408, 1280, 1280] + - [7, 21250.0] + - - [5056, 1856, 1, 256, 5056, 5056, 256, 256] + - [41, 19099.0] + - - [6784, 704, 1, 1280, 6784, 6784, 1280, 1280] + - [43, 19197.0] + - - [5888, 1024, 1, 256, 5888, 5888, 256, 256] + - [23, 19558.0] + - - [6784, 1856, 1, 3328, 6784, 6784, 3328, 3328] + - [13, 19692.0] + - - [2368, 5888, 1, 1280, 2368, 2368, 1280, 1280] + - [25, 21143.0] + - - [5888, 6784, 1, 32, 5888, 5888, 32, 32] + - [38, 16921.0] + - - [6784, 6784, 1, 32, 6784, 6784, 32, 32] + - [10, 17408.0] + - - [6784, 256, 1, 256, 6784, 6784, 256, 256] + - [23, 16565.0] + - - [2368, 3584, 1, 3328, 2368, 2368, 3328, 3328] + - [7, 20587.0] + - - [5888, 1024, 1, 1280, 5888, 5888, 1280, 1280] + - [32, 20430.0] + - - [5888, 6784, 1, 3328, 5888, 5888, 3328, 3328] + - [7, 21691.0] + - - [6784, 448, 1, 1, 6784, 6784, 1, 1] + - [4, 293.0] + - - [6784, 1856, 1, 1, 6784, 6784, 1, 1] + - [37, 401.0] + - - [2944, 2368, 1, 1280, 2944, 2944, 1280, 1280] + - [32, 20219.0] + - - [6784, 448, 1, 32, 6784, 6784, 32, 32] + - [45, 9175.0] + - - [6784, 448, 1, 3328, 6784, 6784, 3328, 3328] + - [32, 17487.0] + - - [448, 3584, 1, 1280, 448, 448, 1280, 1280] + - [15, 16039.0] + - - [1408, 6784, 1, 1280, 1408, 1408, 1280, 1280] + - [43, 20953.0] + - - [5056, 5888, 1, 1280, 5056, 5056, 1280, 1280] + - [25, 21657.0] + - - [5888, 704, 1, 1, 5888, 5888, 1, 1] + - [9, 352.0] + - - [3584, 1856, 1, 1, 3584, 3584, 1, 1] + - [28, 341.0] + - - [5056, 2944, 1, 32, 5056, 5056, 32, 32] + - [18, 12151.0] + - - [4288, 6784, 1, 1, 4288, 4288, 1, 1] + - [50, 706.0] + - - [1024, 6784, 1, 1, 1024, 1024, 1, 1] + - [0, 467.0] + - - [2368, 5888, 1, 32, 2368, 2368, 32, 32] + - [22, 11493.0] + - - [3584, 4288, 1, 1, 3584, 3584, 1, 1] + - [12, 370.0] + - - [5888, 1024, 1, 3328, 5888, 5888, 3328, 3328] + - [32, 20575.0] + - - [6784, 5888, 1, 256, 6784, 6784, 256, 256] + - [25, 21243.0] + - - [5056, 1024, 1, 1, 5056, 5056, 1, 1] + - [18, 598.0] + - - [4288, 2368, 1, 32, 4288, 4288, 32, 32] + - [18, 15071.0] + - - [704, 3584, 1, 1, 704, 704, 1, 1] + - [18, 395.0] + - - [6784, 704, 1, 32, 6784, 6784, 32, 32] + - [37, 13085.0] + - - [704, 5888, 1, 256, 704, 704, 256, 256] + - [25, 17390.0] + - - [2368, 3584, 1, 1280, 2368, 2368, 1280, 1280] + - [7, 20391.0] + - - [3584, 5056, 1, 32, 3584, 3584, 32, 32] + - [28, 12761.0] + - - [6784, 1856, 1, 1280, 6784, 6784, 1280, 1280] + - [13, 20490.0] + - - [5056, 5056, 1, 3328, 5056, 5056, 3328, 3328] + - [15, 21485.0] + - - [2368, 5056, 1, 1, 2368, 2368, 1, 1] + - [39, 371.0] + - - [5888, 1408, 1, 256, 5888, 5888, 256, 256] + - [32, 19590.0] + - - [2368, 1024, 1, 32, 2368, 2368, 32, 32] + - [9, 8380.0] + - - [4288, 1024, 1, 256, 4288, 4288, 256, 256] + - [15, 18610.0] + - - [4288, 5888, 1, 1280, 4288, 4288, 1280, 1280] + - [7, 21684.0] + - - [1856, 2944, 1, 3328, 1856, 1856, 3328, 3328] + - [32, 19491.0] + - - [5056, 5888, 1, 256, 5056, 5056, 256, 256] + - [25, 20751.0] + - - [5056, 256, 1, 3328, 5056, 5056, 3328, 3328] + - [25, 20579.0] + - - [1024, 5888, 1, 1280, 1024, 1024, 1280, 1280] + - [32, 20299.0] + - - [5888, 5056, 1, 1280, 5888, 5888, 1280, 1280] + - [25, 21744.0] + - - [5888, 2944, 1, 1, 5888, 5888, 1, 1] + - [18, 402.0] + - - [1408, 4288, 1, 3328, 1408, 1408, 3328, 3328] + - [32, 20364.0] + - - [704, 2944, 1, 32, 704, 704, 32, 32] + - [24, 5901.0] + - - [2944, 4288, 1, 3328, 2944, 2944, 3328, 3328] + - [25, 21087.0] + - - [5056, 2944, 1, 256, 5056, 5056, 256, 256] + - [41, 20164.0] + - - [2368, 1856, 1, 256, 2368, 2368, 256, 256] + - [23, 17503.0] + - - [2368, 4288, 1, 1280, 2368, 2368, 1280, 1280] + - [7, 20292.0] + - - [3584, 448, 1, 256, 3584, 3584, 256, 256] + - [31, 14484.0] + - - [256, 6784, 1, 256, 256, 256, 256, 256] + - [5, 16358.0] + - - [1024, 1408, 1, 1, 1024, 1024, 1, 1] + - [20, 264.0] + - - [256, 5888, 1, 1, 256, 256, 1, 1] + - [9, 315.0] + - - [2944, 2944, 1, 1, 2944, 2944, 1, 1] + - [33, 523.0] + - - [6784, 3584, 1, 256, 6784, 6784, 256, 256] + - [15, 20998.0] + - - [1408, 1856, 1, 256, 1408, 1408, 256, 256] + - [23, 16526.0] + - - [2944, 2944, 1, 32, 2944, 2944, 32, 32] + - [37, 10792.0] + - - [2944, 2944, 1, 3328, 2944, 2944, 3328, 3328] + - [5, 20925.0] + - - [6784, 1408, 1, 32, 6784, 6784, 32, 32] + - [0, 12736.0] + - - [2368, 6784, 1, 3328, 2368, 2368, 3328, 3328] + - [23, 19290.0] + - - [4288, 3584, 1, 32, 4288, 4288, 32, 32] + - [37, 15067.0] + - - [3584, 704, 1, 1280, 3584, 3584, 1280, 1280] + - [32, 17155.0] + - - [448, 5056, 1, 3328, 448, 448, 3328, 3328] + - [25, 18282.0] + - - [4288, 448, 1, 256, 4288, 4288, 256, 256] + - [32, 13415.0] + - - [5056, 256, 1, 1280, 5056, 5056, 1280, 1280] + - [34, 18896.0] + - - [2944, 5888, 1, 32, 2944, 2944, 32, 32] + - [9, 15617.0] + - - [3584, 5056, 1, 256, 3584, 3584, 256, 256] + - [13, 20187.0] + - - [3584, 2368, 1, 256, 3584, 3584, 256, 256] + - [23, 19207.0] + - - [4288, 4288, 1, 256, 4288, 4288, 256, 256] + - [15, 20044.0] + - - [448, 5056, 1, 256, 448, 448, 256, 256] + - [41, 15689.0] + - - [4288, 704, 1, 1280, 4288, 4288, 1280, 1280] + - [13, 17290.0] + - - [2368, 704, 1, 1, 2368, 2368, 1, 1] + - [0, 218.0] + - - [1408, 1856, 1, 1280, 1408, 1408, 1280, 1280] + - [49, 17695.0] + - - [3584, 4288, 1, 3328, 3584, 3584, 3328, 3328] + - [15, 19803.0] + - - [448, 4288, 1, 32, 448, 448, 32, 32] + - [45, 5166.0] + - - [448, 4288, 1, 1280, 448, 448, 1280, 1280] + - [32, 16306.0] + - - [5056, 1024, 1, 256, 5056, 5056, 256, 256] + - [32, 18286.0] + - - [4288, 3584, 1, 1280, 4288, 4288, 1280, 1280] + - [7, 21666.0] + - - [1856, 3584, 1, 32, 1856, 1856, 32, 32] + - [2, 10580.0] + - - [5056, 3584, 1, 3328, 5056, 5056, 3328, 3328] + - [15, 20199.0] + - - [4288, 5056, 1, 256, 4288, 4288, 256, 256] + - [34, 20187.0] + - - [1856, 5888, 1, 256, 1856, 1856, 256, 256] + - [23, 19482.0] + - - [2368, 3584, 1, 1, 2368, 2368, 1, 1] + - [39, 465.0] + - - [4288, 2368, 1, 256, 4288, 4288, 256, 256] + - [49, 19147.0] + - - [1408, 2944, 1, 3328, 1408, 1408, 3328, 3328] + - [32, 20239.0] + - - [5888, 3584, 1, 1, 5888, 5888, 1, 1] + - [29, 736.0] + - - [6784, 5056, 1, 3328, 6784, 6784, 3328, 3328] + - [7, 21846.0] + - - [6784, 5056, 1, 1, 6784, 6784, 1, 1] + - [46, 424.0] + - - [5888, 3584, 1, 32, 5888, 5888, 32, 32] + - [28, 15175.0] + - - [5888, 3584, 1, 3328, 5888, 5888, 3328, 3328] + - [43, 21997.0] + - - [1024, 6784, 1, 256, 1024, 1024, 256, 256] + - [32, 17070.0] + - - [6784, 5888, 1, 32, 6784, 6784, 32, 32] + - [18, 11449.0] + - - [2368, 6784, 1, 32, 2368, 2368, 32, 32] + - [9, 12245.0] + - - [5056, 1408, 1, 1280, 5056, 5056, 1280, 1280] + - [49, 20794.0] + - - [3584, 1408, 1, 3328, 3584, 3584, 3328, 3328] + - [32, 20043.0] + - - [2944, 3584, 1, 1, 2944, 2944, 1, 1] + - [24, 455.0] + - - [2944, 1408, 1, 1280, 2944, 2944, 1280, 1280] + - [32, 19737.0] + - - [3584, 1024, 1, 1, 3584, 3584, 1, 1] + - [48, 339.0] + - - [2944, 1856, 1, 3328, 2944, 2944, 3328, 3328] + - [13, 19487.0] + - - [2944, 3584, 1, 32, 2944, 2944, 32, 32] + - [53, 10941.0] + - - [5888, 256, 1, 32, 5888, 5888, 32, 32] + - [37, 8770.0] + - - [6784, 5056, 1, 256, 6784, 6784, 256, 256] + - [25, 20535.0] + - - [1856, 3584, 1, 1280, 1856, 1856, 1280, 1280] + - [13, 20094.0] + - - [256, 5888, 1, 256, 256, 256, 256, 256] + - [32, 16241.0] + - - [1024, 4288, 1, 3328, 1024, 1024, 3328, 3328] + - [34, 20669.0] + - - [2368, 1408, 1, 1, 2368, 2368, 1, 1] + - [50, 258.0] + - - [1024, 1856, 1, 32, 1024, 1024, 32, 32] + - [31, 6193.0] + - - [5888, 2368, 1, 1, 5888, 5888, 1, 1] + - [18, 499.0] + - - [2368, 2368, 1, 1, 2368, 2368, 1, 1] + - [18, 387.0] + - - [704, 4288, 1, 256, 704, 704, 256, 256] + - [49, 15797.0] + - - [5888, 2368, 1, 32, 5888, 5888, 32, 32] + - [9, 14264.0] + - - [5888, 2368, 1, 1280, 5888, 5888, 1280, 1280] + - [13, 20496.0] + - - [2944, 5056, 1, 3328, 2944, 2944, 3328, 3328] + - [15, 20932.0] + - - [6784, 704, 1, 3328, 6784, 6784, 3328, 3328] + - [34, 19239.0] + - - [1856, 1856, 1, 32, 1856, 1856, 32, 32] + - [20, 7796.0] + - - [4288, 2944, 1, 32, 4288, 4288, 32, 32] + - [4, 13315.0] + - - [256, 5056, 1, 1, 256, 256, 1, 1] + - [31, 214.0] + - - [5056, 5056, 1, 256, 5056, 5056, 256, 256] + - [34, 20248.0] + - - [5888, 256, 1, 256, 5888, 5888, 256, 256] + - [32, 16309.0] + - - [6784, 6784, 1, 256, 6784, 6784, 256, 256] + - [23, 20859.0] + - - [3584, 704, 1, 3328, 3584, 3584, 3328, 3328] + - [13, 17585.0] + - - [4288, 704, 1, 3328, 4288, 4288, 3328, 3328] + - [32, 17472.0] + - - [4288, 2944, 1, 1280, 4288, 4288, 1280, 1280] + - [32, 20634.0] + - - [448, 3584, 1, 256, 448, 448, 256, 256] + - [32, 14893.0] + - - [6784, 256, 1, 32, 6784, 6784, 32, 32] + - [18, 8603.0] + - - [6784, 1408, 1, 1280, 6784, 6784, 1280, 1280] + - [13, 20771.0] + - - [2368, 5056, 1, 1280, 2368, 2368, 1280, 1280] + - [25, 20635.0] + - - [1408, 1408, 1, 1280, 1408, 1408, 1280, 1280] + - [13, 17546.0] + - - [5888, 1856, 1, 32, 5888, 5888, 32, 32] + - [18, 14391.0] + - - [5888, 704, 1, 3328, 5888, 5888, 3328, 3328] + - [25, 19399.0] + - - [448, 6784, 1, 256, 448, 448, 256, 256] + - [32, 14997.0] + - - [2944, 5888, 1, 256, 2944, 2944, 256, 256] + - [15, 20667.0] + - - [1856, 1408, 1, 32, 1856, 1856, 32, 32] + - [0, 8533.0] + - - [5888, 2944, 1, 1280, 5888, 5888, 1280, 1280] + - [13, 21100.0] + - - [448, 5888, 1, 1, 448, 448, 1, 1] + - [37, 261.0] + - - [3584, 1408, 1, 1, 3584, 3584, 1, 1] + - [18, 367.0] + - - [448, 5888, 1, 32, 448, 448, 32, 32] + - [37, 7816.0] + - - [5056, 704, 1, 1280, 5056, 5056, 1280, 1280] + - [7, 18539.0] + - - [1856, 6784, 1, 1, 1856, 1856, 1, 1] + - [12, 514.0] + - - [2368, 1024, 1, 256, 2368, 2368, 256, 256] + - [5, 15684.0] + - - [1856, 6784, 1, 32, 1856, 1856, 32, 32] + - [37, 12889.0] + - - [1856, 6784, 1, 1280, 1856, 1856, 1280, 1280] + - [7, 20362.0] + - - [5888, 5056, 1, 3328, 5888, 5888, 3328, 3328] + - [25, 21720.0] + - - [1408, 6784, 1, 32, 1408, 1408, 32, 32] + - [0, 9797.0] + - - [3584, 5888, 1, 3328, 3584, 3584, 3328, 3328] + - [23, 18587.0] + - - [4288, 1408, 1, 256, 4288, 4288, 256, 256] + - [5, 16266.0] + - - [6784, 2368, 1, 256, 6784, 6784, 256, 256] + - [23, 19757.0] + - - [1856, 1408, 1, 1280, 1856, 1856, 1280, 1280] + - [13, 17905.0] + - - [1856, 2368, 1, 1, 1856, 1856, 1, 1] + - [4, 317.0] + - - [1408, 5056, 1, 3328, 1408, 1408, 3328, 3328] + - [43, 20648.0] + - - [5056, 4288, 1, 256, 5056, 5056, 256, 256] + - [25, 20105.0] + - - [5056, 5056, 1, 32, 5056, 5056, 32, 32] + - [22, 14868.0] + - - [448, 5888, 1, 1280, 448, 448, 1280, 1280] + - [7, 17068.0] + - - [5056, 448, 1, 256, 5056, 5056, 256, 256] + - [23, 14061.0] + - - [4288, 5888, 1, 1, 4288, 4288, 1, 1] + - [37, 786.0] + - - [1856, 5056, 1, 1280, 1856, 1856, 1280, 1280] + - [7, 20372.0] + - - [2368, 4288, 1, 1, 2368, 2368, 1, 1] + - [12, 393.0] + - - [3584, 1856, 1, 256, 3584, 3584, 256, 256] + - [13, 18558.0] + - - [4288, 5888, 1, 32, 4288, 4288, 32, 32] + - [18, 15786.0] + - - [4288, 5888, 1, 3328, 4288, 4288, 3328, 3328] + - [7, 21660.0] + - - [1024, 2944, 1, 3328, 1024, 1024, 3328, 3328] + - [25, 19275.0] + - - [2944, 2368, 1, 256, 2944, 2944, 256, 256] + - [15, 17811.0] + - - [1024, 1856, 1, 256, 1024, 1024, 256, 256] + - [49, 15446.0] + - - [1024, 5888, 1, 32, 1024, 1024, 32, 32] + - [9, 11192.0] + - - [1024, 5888, 1, 3328, 1024, 1024, 3328, 3328] + - [15, 20122.0] + - - [5056, 2368, 1, 32, 5056, 5056, 32, 32] + - [18, 11575.0] + - - [1408, 2368, 1, 1280, 1408, 1408, 1280, 1280] + - [32, 18578.0] + - - [5056, 6784, 1, 3328, 5056, 5056, 3328, 3328] + - [15, 21376.0] + - - [1408, 2944, 1, 256, 1408, 1408, 256, 256] + - [41, 18385.0] + - - [704, 5056, 1, 32, 704, 704, 32, 32] + - [37, 8590.0] + - - [5056, 4288, 1, 1280, 5056, 5056, 1280, 1280] + - [43, 21277.0] + - - [4288, 448, 1, 1, 4288, 4288, 1, 1] + - [9, 296.0] + - - [5888, 5888, 1, 1, 5888, 5888, 1, 1] + - [44, 638.0] + - - [2944, 704, 1, 1280, 2944, 2944, 1280, 1280] + - [49, 17807.0] + - - [1024, 3584, 1, 1280, 1024, 1024, 1280, 1280] + - [7, 19794.0] + - - [2368, 2944, 1, 1, 2368, 2368, 1, 1] + - [2, 413.0] + - - [5056, 256, 1, 32, 5056, 5056, 32, 32] + - [9, 4279.0] + - - [5056, 1024, 1, 1280, 5056, 5056, 1280, 1280] + - [43, 20786.0] + - - [3584, 6784, 1, 256, 3584, 3584, 256, 256] + - [41, 20601.0] + - - [1856, 1408, 1, 256, 1856, 1856, 256, 256] + - [5, 16381.0] + - - [4288, 4288, 1, 32, 4288, 4288, 32, 32] + - [37, 14851.0] + - - [5888, 448, 1, 1, 5888, 5888, 1, 1] + - [0, 372.0] + - - [5056, 5056, 1, 1280, 5056, 5056, 1280, 1280] + - [25, 21326.0] + - - [6784, 1408, 1, 3328, 6784, 6784, 3328, 3328] + - [13, 20007.0] + - - [5888, 5888, 1, 3328, 5888, 5888, 3328, 3328] + - [15, 22075.0] + - - [5888, 1408, 1, 32, 5888, 5888, 32, 32] + - [9, 8993.0] + - - [256, 6784, 1, 3328, 256, 256, 3328, 3328] + - [7, 18530.0] + - - [6784, 2368, 1, 1280, 6784, 6784, 1280, 1280] + - [32, 20633.0] + - - [2944, 1408, 1, 1, 2944, 2944, 1, 1] + - [30, 313.0] + - - [6784, 1024, 1, 256, 6784, 6784, 256, 256] + - [15, 19906.0] + - - [5056, 1408, 1, 32, 5056, 5056, 32, 32] + - [9, 10786.0] + - - [1408, 6784, 1, 3328, 1408, 1408, 3328, 3328] + - [23, 19403.0] + - - [2944, 1408, 1, 3328, 2944, 2944, 3328, 3328] + - [13, 20129.0] + - - [704, 2368, 1, 32, 704, 704, 32, 32] + - [37, 4885.0] + - - [704, 6784, 1, 1, 704, 704, 1, 1] + - [37, 343.0] + - - [2368, 6784, 1, 256, 2368, 2368, 256, 256] + - [32, 19919.0] + - - [1856, 3584, 1, 3328, 1856, 1856, 3328, 3328] + - [7, 20329.0] + - - [704, 6784, 1, 256, 704, 704, 256, 256] + - [32, 17735.0] + - - [6784, 2944, 1, 32, 6784, 6784, 32, 32] + - [6, 13816.0] + - - [5888, 2368, 1, 3328, 5888, 5888, 3328, 3328] + - [41, 20196.0] + - - [2368, 704, 1, 1280, 2368, 2368, 1280, 1280] + - [32, 16349.0] + - - [1024, 1408, 1, 1280, 1024, 1024, 1280, 1280] + - [32, 16786.0] + - - [2944, 5056, 1, 32, 2944, 2944, 32, 32] + - [18, 15576.0] + - - [704, 2368, 1, 3328, 704, 704, 3328, 3328] + - [7, 17628.0] + - - [3584, 2944, 1, 256, 3584, 3584, 256, 256] + - [32, 20077.0] + - - [3584, 1024, 1, 1280, 3584, 3584, 1280, 1280] + - [13, 19337.0] + - - [5056, 3584, 1, 256, 5056, 5056, 256, 256] + - [51, 20425.0] + - - [2368, 704, 1, 256, 2368, 2368, 256, 256] + - [49, 14788.0] + - - [1856, 1856, 1, 1280, 1856, 1856, 1280, 1280] + - [32, 17874.0] + - - [4288, 704, 1, 1, 4288, 4288, 1, 1] + - [0, 309.0] + - - [1856, 1024, 1, 1, 1856, 1856, 1, 1] + - [37, 236.0] + - - [4288, 2944, 1, 3328, 4288, 4288, 3328, 3328] + - [32, 20526.0] + - - [4288, 704, 1, 32, 4288, 4288, 32, 32] + - [37, 7918.0] + - - [1856, 1024, 1, 32, 1856, 1856, 32, 32] + - [30, 7583.0] + - - [2944, 6784, 1, 1, 2944, 2944, 1, 1] + - [20, 784.0] + - - [6784, 2368, 1, 32, 6784, 6784, 32, 32] + - [12, 14295.0] + - - [5888, 5056, 1, 1, 5888, 5888, 1, 1] + - [20, 752.0] + - - [704, 5888, 1, 1, 704, 704, 1, 1] + - [33, 311.0] + - - [6784, 6784, 1, 1, 6784, 6784, 1, 1] + - [46, 446.0] + - - [5888, 448, 1, 3328, 5888, 5888, 3328, 3328] + - [7, 17314.0] + - - [704, 5888, 1, 32, 704, 704, 32, 32] + - [33, 8290.0] + - - [704, 5888, 1, 1280, 704, 704, 1280, 1280] + - [15, 19189.0] + - - [1024, 6784, 1, 3328, 1024, 1024, 3328, 3328] + - [7, 20038.0] + - - [704, 2944, 1, 1280, 704, 704, 1280, 1280] + - [13, 17332.0] + - - [4288, 6784, 1, 256, 4288, 4288, 256, 256] + - [32, 20390.0] + - - [1408, 1408, 1, 32, 1408, 1408, 32, 32] + - [37, 9221.0] + - - [1408, 1408, 1, 3328, 1408, 1408, 3328, 3328] + - [13, 17975.0] + - - [2944, 1856, 1, 256, 2944, 2944, 256, 256] + - [23, 17546.0] + - - [4288, 2944, 1, 1, 4288, 4288, 1, 1] + - [4, 480.0] + - - [6784, 5056, 1, 32, 6784, 6784, 32, 32] + - [37, 15649.0] + - - [2944, 4288, 1, 1280, 2944, 2944, 1280, 1280] + - [25, 21294.0] + - - [1024, 4288, 1, 256, 1024, 1024, 256, 256] + - [32, 18337.0] + - - [2368, 5888, 1, 1, 2368, 2368, 1, 1] + - [42, 414.0] + - - [1408, 1856, 1, 32, 1408, 1408, 32, 32] + - [0, 8877.0] + - - [1856, 6784, 1, 3328, 1856, 1856, 3328, 3328] + - [41, 19402.0] + - - [1024, 2368, 1, 32, 1024, 1024, 32, 32] + - [9, 7198.0] + - - [2368, 2368, 1, 3328, 2368, 2368, 3328, 3328] + - [32, 19064.0] + - - [3584, 5888, 1, 32, 3584, 3584, 32, 32] + - [45, 15822.0] + - - [3584, 5888, 1, 1280, 3584, 3584, 1280, 1280] + - [15, 21771.0] + - - [6784, 704, 1, 256, 6784, 6784, 256, 256] + - [13, 17521.0] + - - [3584, 1024, 1, 32, 3584, 3584, 32, 32] + - [18, 9306.0] + - - [2368, 5888, 1, 256, 2368, 2368, 256, 256] + - [7, 19934.0] + - - [5888, 5888, 1, 32, 5888, 5888, 32, 32] + - [28, 16387.0] + - - [1856, 1408, 1, 3328, 1856, 1856, 3328, 3328] + - [49, 18054.0] + - - [4288, 1024, 1, 1, 4288, 4288, 1, 1] + - [33, 352.0] + - - [704, 4288, 1, 3328, 704, 704, 3328, 3328] + - [13, 17436.0] + - - [2944, 5056, 1, 1280, 2944, 2944, 1280, 1280] + - [15, 21229.0] + - - [6784, 2944, 1, 1280, 6784, 6784, 1280, 1280] + - [13, 21334.0] + - - [6784, 256, 1, 3328, 6784, 6784, 3328, 3328] + - [5, 17965.0] + - - [1408, 5056, 1, 32, 1408, 1408, 32, 32] + - [37, 13528.0] + - - [5888, 1856, 1, 1280, 5888, 5888, 1280, 1280] + - [32, 20298.0] + - - [5888, 256, 1, 1280, 5888, 5888, 1280, 1280] + - [13, 17901.0] + - - [1856, 5056, 1, 1, 1856, 1856, 1, 1] + - [48, 416.0] + - - [3584, 1856, 1, 1280, 3584, 3584, 1280, 1280] + - [32, 20209.0] + - - [6784, 448, 1, 256, 6784, 6784, 256, 256] + - [23, 15499.0] + - - [704, 3584, 1, 256, 704, 704, 256, 256] + - [23, 15996.0] + - - [1856, 5056, 1, 32, 1856, 1856, 32, 32] + - [0, 11113.0] + - - [1856, 5056, 1, 3328, 1856, 1856, 3328, 3328] + - [34, 20252.0] + - - [1024, 2944, 1, 32, 1024, 1024, 32, 32] + - [18, 8039.0] + - - [1408, 6784, 1, 256, 1408, 1408, 256, 256] + - [41, 19438.0] + - - [1024, 2368, 1, 1280, 1024, 1024, 1280, 1280] + - [7, 19154.0] + - - [1856, 3584, 1, 1, 1856, 1856, 1, 1] + - [41, 416.0] + - - [2944, 5888, 1, 1280, 2944, 2944, 1280, 1280] + - [25, 21818.0] + - - [3584, 3584, 1, 256, 3584, 3584, 256, 256] + - [32, 20306.0] + - - [1856, 2368, 1, 3328, 1856, 1856, 3328, 3328] + - [5, 18830.0] + - - [5888, 704, 1, 256, 5888, 5888, 256, 256] + - [49, 16576.0] + - - [6784, 4288, 1, 256, 6784, 6784, 256, 256] + - [15, 20487.0] + - - [1408, 2368, 1, 3328, 1408, 1408, 3328, 3328] + - [32, 19122.0] + - - [1024, 3584, 1, 256, 1024, 1024, 256, 256] + - [13, 16990.0] + - - [4288, 1024, 1, 32, 4288, 4288, 32, 32] + - [22, 10182.0] + - - [5888, 1856, 1, 3328, 5888, 5888, 3328, 3328] + - [41, 19856.0] + - - [2368, 3584, 1, 256, 2368, 2368, 256, 256] + - [32, 18782.0] + - - [4288, 1408, 1, 3328, 4288, 4288, 3328, 3328] + - [13, 20382.0] + - - [256, 5056, 1, 256, 256, 256, 256, 256] + - [49, 12943.0] + - - [5888, 2944, 1, 3328, 5888, 5888, 3328, 3328] + - [41, 21267.0] + - - [2368, 1408, 1, 3328, 2368, 2368, 3328, 3328] + - [13, 18966.0] + - - [5888, 704, 1, 32, 5888, 5888, 32, 32] + - [18, 9584.0] + - - [2944, 704, 1, 1, 2944, 2944, 1, 1] + - [9, 316.0] + - - [6784, 1856, 1, 256, 6784, 6784, 256, 256] + - [23, 19415.0] + - - [1856, 1856, 1, 1, 1856, 1856, 1, 1] + - [4, 318.0] + - - [2944, 704, 1, 3328, 2944, 2944, 3328, 3328] + - [32, 18014.0] + - - [2368, 1856, 1, 32, 2368, 2368, 32, 32] + - [18, 11779.0] + - - [5056, 4288, 1, 3328, 5056, 5056, 3328, 3328] + - [25, 21436.0] + - - [3584, 448, 1, 3328, 3584, 3584, 3328, 3328] + - [41, 16006.0] + - - [256, 6784, 1, 1, 256, 256, 1, 1] + - [50, 195.0] + - - [1024, 3584, 1, 32, 1024, 1024, 32, 32] + - [28, 8661.0] + - - [256, 6784, 1, 32, 256, 256, 32, 32] + - [9, 6080.0] + - - [2944, 1408, 1, 32, 2944, 2944, 32, 32] + - [18, 9943.0] + - - [4288, 3584, 1, 1, 4288, 4288, 1, 1] + - [10, 566.0] + - - [5056, 448, 1, 3328, 5056, 5056, 3328, 3328] + - [25, 17727.0] + - - [6784, 3584, 1, 32, 6784, 6784, 32, 32] + - [11, 15699.0] + - - [4288, 1856, 1, 256, 4288, 4288, 256, 256] + - [32, 18792.0] + - - [1856, 2944, 1, 256, 1856, 1856, 256, 256] + - [5, 18271.0] + - - [2944, 5888, 1, 1, 2944, 2944, 1, 1] + - [8, 448.0] + - - [1024, 1856, 1, 3328, 1024, 1024, 3328, 3328] + - [13, 18955.0] + - - [5888, 1024, 1, 32, 5888, 5888, 32, 32] + - [18, 11512.0] + - - [1408, 5056, 1, 1280, 1408, 1408, 1280, 1280] + - [32, 20761.0] + - - [5056, 6784, 1, 256, 5056, 5056, 256, 256] + - [41, 20496.0] + - - [2944, 5056, 1, 1, 2944, 2944, 1, 1] + - [45, 536.0] + - - [5888, 5888, 1, 1280, 5888, 5888, 1280, 1280] + - [7, 22004.0] + - - [5056, 2944, 1, 1280, 5056, 5056, 1280, 1280] + - [32, 20947.0] + - - [2368, 1856, 1, 1280, 2368, 2368, 1280, 1280] + - [13, 18654.0] + - - [6784, 2944, 1, 1, 6784, 6784, 1, 1] + - [37, 785.0] + - - [2944, 1024, 1, 32, 2944, 2944, 32, 32] + - [37, 8403.0] + - - [2944, 1024, 1, 1280, 2944, 2944, 1280, 1280] + - [25, 19313.0] + - - [5056, 5056, 1, 1, 5056, 5056, 1, 1] + - [47, 654.0] + - - [2368, 4288, 1, 256, 2368, 2368, 256, 256] + - [41, 19128.0] + - - [2944, 6784, 1, 1280, 2944, 2944, 1280, 1280] + - [25, 21358.0] + - - [256, 6784, 1, 1280, 256, 256, 1280, 1280] + - [7, 17913.0] + - - [3584, 2368, 1, 32, 3584, 3584, 32, 32] + - [18, 12378.0] + - - [6784, 3584, 1, 3328, 6784, 6784, 3328, 3328] + - [25, 21919.0] + - - [2944, 2944, 1, 256, 2944, 2944, 256, 256] + - [41, 19632.0] + - - [1408, 1024, 1, 3328, 1408, 1408, 3328, 3328] + - [32, 17264.0] + - - [5056, 2368, 1, 1280, 5056, 5056, 1280, 1280] + - [32, 20408.0] + - - [2944, 1024, 1, 1, 2944, 2944, 1, 1] + - [2, 321.0] + - - [3584, 704, 1, 256, 3584, 3584, 256, 256] + - [41, 15739.0] + - - [2368, 5888, 1, 3328, 2368, 2368, 3328, 3328] + - [15, 20482.0] + - - [4288, 2368, 1, 1, 4288, 4288, 1, 1] + - [31, 456.0] + - - [1408, 3584, 1, 32, 1408, 1408, 32, 32] + - [37, 10499.0] + - - [2944, 4288, 1, 32, 2944, 2944, 32, 32] + - [18, 14075.0] + - - [5888, 1408, 1, 1280, 5888, 5888, 1280, 1280] + - [32, 20685.0] + - - [3584, 5056, 1, 1280, 3584, 3584, 1280, 1280] + - [7, 21519.0] + - - [5888, 6784, 1, 1280, 5888, 5888, 1280, 1280] + - [25, 21598.0] + - - [3584, 2944, 1, 1, 3584, 3584, 1, 1] + - [12, 340.0] + - - [1024, 1856, 1, 1, 1024, 1024, 1, 1] + - [18, 318.0] + - - [704, 5056, 1, 3328, 704, 704, 3328, 3328] + - [7, 19533.0] + - - [1024, 3584, 1, 3328, 1024, 1024, 3328, 3328] + - [7, 20175.0] + - - [5888, 256, 1, 3328, 5888, 5888, 3328, 3328] + - [49, 18145.0] + - - [1856, 1408, 1, 1, 1856, 1856, 1, 1] + - [37, 259.0] + - - [4288, 5056, 1, 1280, 4288, 4288, 1280, 1280] + - [7, 21331.0] + - - [1856, 1856, 1, 3328, 1856, 1856, 3328, 3328] + - [5, 18247.0] + - - [1024, 2368, 1, 256, 1024, 1024, 256, 256] + - [32, 15275.0] + - - [4288, 2368, 1, 3328, 4288, 4288, 3328, 3328] + - [32, 19946.0] + - - [5888, 3584, 1, 256, 5888, 5888, 256, 256] + - [34, 20794.0] + - - [1024, 5056, 1, 32, 1024, 1024, 32, 32] + - [9, 9921.0] + - - [5888, 448, 1, 1280, 5888, 5888, 1280, 1280] + - [25, 16748.0] + - - [704, 5888, 1, 3328, 704, 704, 3328, 3328] + - [7, 19393.0] + - - [1024, 1408, 1, 256, 1024, 1024, 256, 256] + - [13, 13835.0] + - - [3584, 2944, 1, 1280, 3584, 3584, 1280, 1280] + - [13, 20863.0] + - - [4288, 1856, 1, 1280, 4288, 4288, 1280, 1280] + - [13, 19780.0] + - - [3584, 5888, 1, 1, 3584, 3584, 1, 1] + - [29, 529.0] + - - [5888, 4288, 1, 256, 5888, 5888, 256, 256] + - [49, 20265.0] + - - [1024, 2944, 1, 1280, 1024, 1024, 1280, 1280] + - [25, 19141.0] + - - [2944, 3584, 1, 256, 2944, 2944, 256, 256] + - [41, 20158.0] + - - [5888, 1856, 1, 1, 5888, 5888, 1, 1] + - [45, 372.0] + - - [6784, 2368, 1, 3328, 6784, 6784, 3328, 3328] + - [13, 19725.0] + - - [1408, 4288, 1, 32, 1408, 1408, 32, 32] + - [28, 10277.0] + - - [1856, 1024, 1, 1280, 1856, 1856, 1280, 1280] + - [15, 19071.0] + - - [5888, 1856, 1, 256, 5888, 5888, 256, 256] + - [49, 18893.0] + - - [5056, 1856, 1, 1, 5056, 5056, 1, 1] + - [0, 430.0] + - - [5888, 2368, 1, 256, 5888, 5888, 256, 256] + - [41, 19852.0] + - - [1408, 1024, 1, 32, 1408, 1408, 32, 32] + - [18, 8358.0] + - - [5056, 1856, 1, 32, 5056, 5056, 32, 32] + - [45, 11188.0] + - - [5056, 1856, 1, 1280, 5056, 5056, 1280, 1280] + - [32, 20179.0] + - - [1408, 5888, 1, 3328, 1408, 1408, 3328, 3328] + - [41, 20107.0] + - - [5056, 704, 1, 3328, 5056, 5056, 3328, 3328] + - [25, 18881.0] + - - [5888, 6784, 1, 1, 5888, 5888, 1, 1] + - [46, 607.0] + - - [5888, 4288, 1, 32, 5888, 5888, 32, 32] + - [37, 15244.0] + - - [1408, 3584, 1, 256, 1408, 1408, 256, 256] + - [43, 19591.0] + - - [6784, 256, 1, 1, 6784, 6784, 1, 1] + - [18, 347.0] + - - [6784, 256, 1, 1280, 6784, 6784, 1280, 1280] + - [32, 17758.0] + - - [2368, 704, 1, 3328, 2368, 2368, 3328, 3328] + - [49, 16885.0] + - - [2944, 1856, 1, 32, 2944, 2944, 32, 32] + - [18, 13307.0] + - - [2368, 1408, 1, 256, 2368, 2368, 256, 256] + - [32, 16465.0] + - - [2368, 1856, 1, 1, 2368, 2368, 1, 1] + - [18, 426.0] + - - [4288, 1408, 1, 1, 4288, 4288, 1, 1] + - [18, 420.0] + - - [3584, 2368, 1, 1280, 3584, 3584, 1280, 1280] + - [13, 20275.0] + - - [1408, 2944, 1, 1, 1408, 1408, 1, 1] + - [9, 378.0] + - - [4288, 1408, 1, 32, 4288, 4288, 32, 32] + - [0, 10321.0] + - - [5888, 2944, 1, 256, 5888, 5888, 256, 256] + - [32, 20688.0] + - - [1408, 2944, 1, 32, 1408, 1408, 32, 32] + - [20, 9914.0] + - - [5888, 6784, 1, 256, 5888, 5888, 256, 256] + - [49, 20990.0] + - - [6784, 5888, 1, 1, 6784, 6784, 1, 1] + - [48, 639.0] + - - [6784, 5888, 1, 1280, 6784, 6784, 1280, 1280] + - [25, 22010.0] + - - [1024, 4288, 1, 32, 1024, 1024, 32, 32] + - [9, 8227.0] + - - [3584, 5888, 1, 256, 3584, 3584, 256, 256] + - [23, 20667.0] + - - [5056, 2368, 1, 1, 5056, 5056, 1, 1] + - [42, 513.0] + - - [5056, 448, 1, 1, 5056, 5056, 1, 1] + - [18, 333.0] + - - [2368, 1024, 1, 1280, 2368, 2368, 1280, 1280] + - [7, 19124.0] + - - [1856, 6784, 1, 256, 1856, 1856, 256, 256] + - [23, 19652.0] + - - [5056, 448, 1, 32, 5056, 5056, 32, 32] + - [28, 8108.0] + - - [3584, 2944, 1, 32, 3584, 3584, 32, 32] + - [12, 10829.0] + - - [3584, 1856, 1, 32, 3584, 3584, 32, 32] + - [31, 9520.0] + - - [4288, 1408, 1, 1280, 4288, 4288, 1280, 1280] + - [13, 20203.0] + - - [6784, 2368, 1, 1, 6784, 6784, 1, 1] + - [48, 375.0] + - - [704, 5056, 1, 1, 704, 704, 1, 1] + - [28, 335.0] + - - [2368, 1408, 1, 1280, 2368, 2368, 1280, 1280] + - [13, 18753.0] + - - [5888, 1408, 1, 1, 5888, 5888, 1, 1] + - [45, 448.0] + - - [1024, 4288, 1, 1280, 1024, 1024, 1280, 1280] + - [43, 19426.0] + - - [1856, 4288, 1, 1, 1856, 1856, 1, 1] + - [42, 414.0] + - - [3584, 4288, 1, 256, 3584, 3584, 256, 256] + - [49, 19890.0] + - - [2368, 2944, 1, 256, 2368, 2368, 256, 256] + - [23, 19120.0] + - - [704, 5056, 1, 256, 704, 704, 256, 256] + - [34, 16943.0] + - - [1856, 4288, 1, 32, 1856, 1856, 32, 32] + - [48, 11566.0] + - - [4288, 1024, 1, 1280, 4288, 4288, 1280, 1280] + - [7, 19990.0] + - - [4288, 6784, 1, 32, 4288, 4288, 32, 32] + - [47, 14823.0] + - - [3584, 1408, 1, 256, 3584, 3584, 256, 256] + - [13, 19150.0] + - - [704, 3584, 1, 3328, 704, 704, 3328, 3328] + - [5, 17422.0] + - - [5056, 448, 1, 1280, 5056, 5056, 1280, 1280] + - [32, 16908.0] + - - [1408, 2944, 1, 1280, 1408, 1408, 1280, 1280] + - [32, 19737.0] + - - [5888, 704, 1, 1280, 5888, 5888, 1280, 1280] + - [25, 18646.0] + - - [4288, 5888, 1, 256, 4288, 4288, 256, 256] + - [25, 20575.0] + - - [3584, 3584, 1, 3328, 3584, 3584, 3328, 3328] + - [15, 20732.0] + - - [2944, 6784, 1, 32, 2944, 2944, 32, 32] + - [28, 16404.0] + - - [5056, 256, 1, 1, 5056, 5056, 1, 1] + - [33, 143.0] + - - [2944, 2368, 1, 3328, 2944, 2944, 3328, 3328] + - [5, 20294.0] + - - [1024, 1856, 1, 1280, 1024, 1024, 1280, 1280] + - [49, 18296.0] + - - [448, 5888, 1, 256, 448, 448, 256, 256] + - [5, 14259.0] + - - [1024, 5888, 1, 256, 1024, 1024, 256, 256] + - [13, 18561.0] + - - [6784, 2944, 1, 3328, 6784, 6784, 3328, 3328] + - [13, 19678.0] + - - [1408, 2368, 1, 256, 1408, 1408, 256, 256] + - [42, 13919.0] + - - [1408, 5056, 1, 256, 1408, 1408, 256, 256] + - [23, 18661.0] + - - [1024, 1408, 1, 32, 1024, 1024, 32, 32] + - [31, 5479.0] + - - [6784, 704, 1, 1, 6784, 6784, 1, 1] + - [48, 346.0] + - - [704, 3584, 1, 32, 704, 704, 32, 32] + - [20, 6522.0] + - - [4288, 4288, 1, 1, 4288, 4288, 1, 1] + - [4, 740.0] + - - [5056, 2944, 1, 1, 5056, 5056, 1, 1] + - [12, 580.0] + - - [6784, 4288, 1, 3328, 6784, 6784, 3328, 3328] + - [15, 21594.0] + - - [5056, 2944, 1, 3328, 5056, 5056, 3328, 3328] + - [41, 20263.0] + - - [2368, 1856, 1, 3328, 2368, 2368, 3328, 3328] + - [32, 18705.0] + - - [1856, 4288, 1, 1280, 1856, 1856, 1280, 1280] + - [25, 20283.0] + - - [3584, 448, 1, 1, 3584, 3584, 1, 1] + - [40, 296.0] + - - [2944, 1024, 1, 3328, 2944, 2944, 3328, 3328] + - [43, 19377.0] + - - [5888, 5056, 1, 32, 5888, 5888, 32, 32] + - [28, 14843.0] + - - [704, 2944, 1, 1, 704, 704, 1, 1] + - [33, 212.0] + - - [3584, 448, 1, 32, 3584, 3584, 32, 32] + - [18, 5865.0] + - - [3584, 448, 1, 1280, 3584, 3584, 1280, 1280] + - [41, 15653.0] + - - [2944, 6784, 1, 3328, 2944, 2944, 3328, 3328] + - [43, 21425.0] + - - [1856, 2368, 1, 1280, 1856, 1856, 1280, 1280] + - [32, 18590.0] + - - [6784, 1024, 1, 1280, 6784, 6784, 1280, 1280] + - [43, 20980.0] + - - [6784, 3584, 1, 1280, 6784, 6784, 1280, 1280] + - [25, 21785.0] + - - [1408, 1408, 1, 1, 1408, 1408, 1, 1] + - [2, 322.0] + - - [1408, 4288, 1, 256, 1408, 1408, 256, 256] + - [23, 19025.0] + - - [256, 5056, 1, 3328, 256, 256, 3328, 3328] + - [7, 20093.0] + - - [448, 6784, 1, 1, 448, 448, 1, 1] + - [42, 248.0] + - - [704, 2944, 1, 256, 704, 704, 256, 256] + - [6, 14022.0] + - - [1408, 1408, 1, 256, 1408, 1408, 256, 256] + - [23, 15195.0] + - - [448, 6784, 1, 32, 448, 448, 32, 32] + - [37, 6849.0] + - - [1408, 1856, 1, 1, 1408, 1408, 1, 1] + - [37, 282.0] + - - [4288, 448, 1, 32, 4288, 4288, 32, 32] + - [0, 6015.0] + - - [4288, 448, 1, 1280, 4288, 4288, 1280, 1280] + - [23, 16220.0] + - - [2944, 704, 1, 32, 2944, 2944, 32, 32] + - [9, 6809.0] + - - [448, 4288, 1, 1, 448, 448, 1, 1] + - [33, 197.0] + - - [3584, 5056, 1, 1, 3584, 3584, 1, 1] + - [37, 719.0] + - - [1408, 3584, 1, 1280, 1408, 1408, 1280, 1280] + - [25, 20629.0] + - - [6784, 448, 1, 1280, 6784, 6784, 1280, 1280] + - [13, 17224.0] + - - [3584, 5056, 1, 3328, 3584, 3584, 3328, 3328] + - [43, 21638.0] + - - [2368, 2368, 1, 32, 2368, 2368, 32, 32] + - [45, 9969.0] + - - [5888, 2944, 1, 32, 5888, 5888, 32, 32] + - [9, 13937.0] + - - [1856, 2944, 1, 32, 1856, 1856, 32, 32] + - [45, 10095.0] + - - [5056, 1408, 1, 1, 5056, 5056, 1, 1] + - [2, 404.0] + - - [5888, 1408, 1, 3328, 5888, 5888, 3328, 3328] + - [13, 20201.0] + - - [448, 4288, 1, 256, 448, 448, 256, 256] + - [41, 13510.0] + - - [6784, 1024, 1, 1, 6784, 6784, 1, 1] + - [26, 454.0] + - - [6784, 1024, 1, 32, 6784, 6784, 32, 32] + - [28, 12068.0] + - - [6784, 3584, 1, 1, 6784, 6784, 1, 1] + - [9, 574.0] + - - [2944, 2368, 1, 32, 2944, 2944, 32, 32] + - [45, 11535.0] + - - [3584, 6784, 1, 3328, 3584, 3584, 3328, 3328] + - [25, 21497.0] + - - [6784, 1408, 1, 256, 6784, 6784, 256, 256] + - [41, 19622.0] + - - [5056, 1024, 1, 32, 5056, 5056, 32, 32] + - [18, 10053.0] + - - [1024, 5056, 1, 1280, 1024, 1024, 1280, 1280] + - [7, 20764.0] + - - [4288, 3584, 1, 256, 4288, 4288, 256, 256] + - [15, 20296.0] + - - [448, 6784, 1, 1280, 448, 448, 1280, 1280] + - [13, 17195.0] + - - [1856, 5888, 1, 1, 1856, 1856, 1, 1] + - [31, 468.0] + - - [256, 5888, 1, 32, 256, 256, 32, 32] + - [12, 7178.0] + - - [4288, 5056, 1, 32, 4288, 4288, 32, 32] + - [18, 14991.0] + - - [4288, 5056, 1, 3328, 4288, 4288, 3328, 3328] + - [25, 21353.0] + - - [1856, 5888, 1, 32, 1856, 1856, 32, 32] + - [12, 10303.0] + - - [1856, 5888, 1, 1280, 1856, 1856, 1280, 1280] + - [7, 20413.0] + - - [704, 2368, 1, 1280, 704, 704, 1280, 1280] + - [25, 17178.0] + - - [4288, 2368, 1, 1280, 4288, 4288, 1280, 1280] + - [13, 19970.0] + - - [2944, 5056, 1, 256, 2944, 2944, 256, 256] + - [23, 19940.0] + - - [2944, 4288, 1, 1, 2944, 2944, 1, 1] + - [2, 387.0] + - - [5056, 5888, 1, 32, 5056, 5056, 32, 32] + - [46, 15088.0] + - - [2368, 5056, 1, 3328, 2368, 2368, 3328, 3328] + - [25, 20917.0] + - - [1024, 5056, 1, 3328, 1024, 1024, 3328, 3328] + - [15, 21230.0] + - - [1024, 6784, 1, 32, 1024, 1024, 32, 32] + - [48, 10687.0] + - - [3584, 2944, 1, 3328, 3584, 3584, 3328, 3328] + - [23, 20367.0] + - - [1408, 5888, 1, 1, 1408, 1408, 1, 1] + - [24, 437.0] + - - [704, 4288, 1, 32, 704, 704, 32, 32] + - [20, 6900.0] + - - [1408, 5888, 1, 32, 1408, 1408, 32, 32] + - [37, 12443.0] + - - [6784, 1024, 1, 3328, 6784, 6784, 3328, 3328] + - [25, 20833.0] + - - [5056, 1408, 1, 256, 5056, 5056, 256, 256] + - [41, 19228.0] + - - [2944, 1408, 1, 256, 2944, 2944, 256, 256] + - [41, 19168.0] + - - [2368, 2368, 1, 256, 2368, 2368, 256, 256] + - [32, 18007.0] + - - [1408, 6784, 1, 1, 1408, 1408, 1, 1] + - [24, 396.0] + - - [6784, 6784, 1, 1280, 6784, 6784, 1280, 1280] + - [25, 21611.0] + - - [1024, 5888, 1, 1, 1024, 1024, 1, 1] + - [22, 300.0] + - - [1856, 3584, 1, 256, 1856, 1856, 256, 256] + - [23, 18623.0] + - - [2368, 704, 1, 32, 2368, 2368, 32, 32] + - [9, 7915.0] + - - [256, 5888, 1, 1280, 256, 256, 1280, 1280] + - [32, 17398.0] + - - [1856, 1024, 1, 3328, 1856, 1856, 3328, 3328] + - [7, 20057.0] + - - [5056, 2368, 1, 3328, 5056, 5056, 3328, 3328] + - [23, 20158.0] + - - [704, 4288, 1, 1280, 704, 704, 1280, 1280] + - [32, 17003.0] + - - [2560, 7000, 1, 2560, 2560, 2560, 2560, 2560] + - [15, 19768.0] + - - [7680, 12000, 1, 2560, 7680, 7680, 2560, 2560] + - [15, 21011.0] + - - [5124, 9124, 1, 1760, 5124, 5124, 1760, 1760] + - [21, 21706.0] + - - [512, 24000, 1, 1536, 512, 512, 1536, 1536] + - [15, 20652.0] + - - [3072, 24000, 1, 1024, 3072, 3072, 1024, 1024] + - [15, 21294.0] + - - [512, 48000, 1, 2816, 512, 512, 2816, 2816] + - [15, 21408.0] + - - [512, 48000, 1, 2048, 512, 512, 2048, 2048] + - [13, 20332.0] + - - [2048, 1600, 1, 2048, 2048, 2048, 2048, 2048] + - [13, 17285.0] + - - [512, 48000, 1, 1536, 512, 512, 1536, 1536] + - [34, 20994.0] + - - [8448, 5984, 1, 2816, 8448, 8448, 2816, 2816] + - [7, 21589.0] + - - [4096, 3200, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 20339.0] + - - [1024, 24000, 1, 2560, 1024, 1024, 2560, 2560] + - [15, 20195.0] + - - [1760, 6400, 1, 1760, 1760, 1760, 1760, 1760] + - [21, 21538.0] + - - [5124, 9124, 1, 2048, 5124, 5124, 2048, 2048] + - [51, 20670.0] + - - [16384, 3200, 1, 4096, 16384, 16384, 4096, 4096] + - [15, 19071.0] + - - [1024, 48000, 1, 2560, 1024, 1024, 2560, 2560] + - [15, 20483.0] + - - [8448, 48000, 1, 2816, 8448, 8448, 2816, 2816] + - [43, 21912.0] + - - [2560, 3200, 1, 2560, 2560, 2560, 2560, 2560] + - [15, 20429.0] + - - [16384, 800, 1, 4096, 16384, 16384, 4096, 4096] + - [32, 16358.0] + - - [4608, 24000, 1, 1536, 4608, 4608, 1536, 1536] + - [15, 21722.0] + - - [7680, 48000, 1, 2560, 7680, 7680, 2560, 2560] + - [15, 21815.0] + - - [3072, 48000, 1, 1024, 3072, 3072, 1024, 1024] + - [15, 21359.0] + - - [8192, 3200, 1, 2048, 8192, 8192, 2048, 2048] + - [13, 17654.0] + - - [512, 24000, 1, 2816, 512, 512, 2816, 2816] + - [25, 21060.0] + - - [4096, 400, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 14063.0] + - - [6144, 48000, 1, 2560, 6144, 6144, 2560, 2560] + - [15, 21282.0] + - - [4608, 48000, 1, 1536, 4608, 4608, 1536, 1536] + - [15, 21239.0] + - - [2048, 800, 1, 512, 2048, 2048, 512, 512] + - [33, 12672.0] + - - [4608, 5984, 1, 1536, 4608, 4608, 1536, 1536] + - [15, 21167.0] + - - [4096, 1600, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 19175.0] + - - [6144, 5984, 1, 2048, 6144, 6144, 2048, 2048] + - [13, 21022.0] + - - [7680, 24000, 1, 2560, 7680, 7680, 2560, 2560] + - [15, 21478.0] + - - [6144, 48000, 1, 2048, 6144, 6144, 2048, 2048] + - [13, 20058.0] + - - [2048, 3200, 1, 2048, 2048, 2048, 2048, 2048] + - [15, 18702.0] + - - [5124, 9124, 1, 2560, 5124, 5124, 2560, 2560] + - [15, 21045.0] + - - [1024, 24000, 1, 1536, 1024, 1024, 1536, 1536] + - [13, 20459.0] + - - [2560, 6400, 1, 2560, 2560, 2560, 2560, 2560] + - [15, 21072.0] + - - [512, 24000, 1, 2560, 512, 512, 2560, 2560] + - [15, 20521.0] + - - [1024, 24000, 1, 2816, 1024, 1024, 2816, 2816] + - [15, 21246.0] + - - [7680, 5984, 1, 2560, 7680, 7680, 2560, 2560] + - [15, 21286.0] + - - [2048, 1600, 1, 512, 2048, 2048, 512, 512] + - [14, 14727.0] + - - [2048, 7000, 1, 2048, 2048, 2048, 2048, 2048] + - [13, 19982.0] + - - [1760, 800, 1, 1760, 1760, 1760, 1760, 1760] + - [19, 16611.0] + - - [2560, 1600, 1, 2560, 2560, 2560, 2560, 2560] + - [15, 18944.0] + - - [2048, 3200, 1, 512, 2048, 2048, 512, 512] + - [13, 19997.0] + - - [2560, 800, 1, 2560, 2560, 2560, 2560, 2560] + - [49, 17054.0] + - - [4608, 12000, 1, 1536, 4608, 4608, 1536, 1536] + - [15, 21577.0] + - - [6144, 24000, 1, 2048, 6144, 6144, 2048, 2048] + - [13, 19053.0] + - - [8192, 800, 1, 2048, 8192, 8192, 2048, 2048] + - [13, 17801.0] + - - [5124, 9124, 1, 4096, 5124, 5124, 4096, 4096] + - [49, 18948.0] + - - [8448, 24000, 1, 2816, 8448, 8448, 2816, 2816] + - [15, 21921.0] + - - [1024, 48000, 1, 1536, 1024, 1024, 1536, 1536] + - [15, 20902.0] + - - [8192, 1600, 1, 2048, 8192, 8192, 2048, 2048] + - [13, 19665.0] + - - [4096, 800, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 16351.0] + - - [2048, 800, 1, 2048, 2048, 2048, 2048, 2048] + - [14, 14608.0] + - - [1760, 3200, 1, 1760, 1760, 1760, 1760, 1760] + - [41, 20312.0] + - - [512, 48000, 1, 2560, 512, 512, 2560, 2560] + - [15, 20502.0] + - - [512, 24000, 1, 2048, 512, 512, 2048, 2048] + - [13, 20215.0] + - - [16384, 1600, 1, 4096, 16384, 16384, 4096, 4096] + - [15, 18136.0] + - - [1024, 24000, 1, 2048, 1024, 1024, 2048, 2048] + - [13, 19731.0] + - - [8192, 400, 1, 2048, 8192, 8192, 2048, 2048] + - [13, 15268.0] + - - [2048, 6400, 1, 2048, 2048, 2048, 2048, 2048] + - [13, 20194.0] + - - [6144, 12000, 1, 2048, 6144, 6144, 2048, 2048] + - [15, 20249.0] + - - [1760, 7000, 1, 1760, 1760, 1760, 1760, 1760] + - [21, 21257.0] + - - [1024, 48000, 1, 2816, 1024, 1024, 2816, 2816] + - [15, 21573.0] + - - [4096, 7000, 1, 4096, 4096, 4096, 4096, 4096] + - [54, 17211.0] + - - [6144, 24000, 1, 2560, 6144, 6144, 2560, 2560] + - [15, 20580.0] + - - [8448, 12000, 1, 2816, 8448, 8448, 2816, 2816] + - [15, 21699.0] + - - [16384, 400, 1, 4096, 16384, 16384, 4096, 4096] + - [15, 15901.0] + - - [1760, 1600, 1, 1760, 1760, 1760, 1760, 1760] + - [3, 18966.0] + - - [1024, 48000, 1, 2048, 1024, 1024, 2048, 2048] + - [13, 20038.0] + - - [4096, 4096, 1, 4096, 4096, 4096, 4096, 4096] + - [17, 16199.0] + - - [2048, 2048, 1, 2049, 2048, 2048, 2049, 2049] + - [32, 19760.0] + - - [8192, 8191, 1, 8192, 8192, 8192, 8192, 8192] + - [15, 17982.0] + - - [8192, 8192, 1, 8192, 8192, 8192, 8192, 8192] + - [15, 18090.0] + - - [2047, 2048, 1, 2048, 2047, 2047, 2048, 2048] + - [13, 17851.0] + - - [2048, 2049, 1, 2048, 2048, 2048, 2048, 2048] + - [13, 17826.0] + - - [8192, 8192, 1, 8191, 8192, 8192, 8191, 8191] + - [15, 21945.0] + - - [3072, 513, 1, 3072, 3072, 3072, 3072, 3072] + - [32, 13083.0] + - - [8191, 8192, 1, 8192, 8191, 8191, 8192, 8192] + - [15, 18025.0] + - - [8192, 8193, 1, 8192, 8192, 8192, 8192, 8192] + - [15, 18048.0] + - - [4096, 4097, 1, 4096, 4096, 4096, 4096, 4096] + - [51, 18174.0] + - - [8192, 8192, 1, 8193, 8192, 8192, 8193, 8193] + - [15, 21922.0] + - - [4096, 4095, 1, 4096, 4096, 4096, 4096, 4096] + - [17, 16035.0] + - - [4096, 4096, 1, 4097, 4096, 4096, 4097, 4097] + - [34, 20030.0] + - - [2048, 2048, 1, 2048, 2048, 2048, 2048, 2048] + - [13, 18838.0] + - - [4095, 4096, 1, 4096, 4095, 4095, 4096, 4096] + - [15, 16361.0] + - - [8193, 8192, 1, 8192, 8193, 8193, 8192, 8192] + - [15, 18015.0] + - - [4096, 4096, 1, 4095, 4096, 4096, 4095, 4095] + - [34, 20242.0] + - - [3072, 511, 1, 3072, 3072, 3072, 3072, 3072] + - [14, 16568.0] + - - [2049, 2048, 1, 2048, 2049, 2049, 2048, 2048] + - [15, 18975.0] + - - [2048, 2047, 1, 2048, 2048, 2048, 2048, 2048] + - [15, 18456.0] + - - [2048, 2048, 1, 2047, 2048, 2048, 2047, 2047] + - [23, 20204.0] + - - [4097, 4096, 1, 4096, 4097, 4097, 4096, 4096] + - [36, 17083.0] + - - [128, 128, 512, 64, 128, 128, 64, 64] + - [24, 11307.0] + - - [512, 512, 64, 64, 512, 512, 64, 64] + - [49, 16963.0] + - - [1024, 2048, 1, 1024, 1024, 1024, 1024, 1024] + - [13, 17493.0] + - - [1024, 2048, 1, 4096, 1024, 1024, 4096, 4096] + - [32, 16604.0] + - - [1024, 4096, 1, 1024, 1024, 1024, 1024, 1024] + - [13, 18966.0] + - - [1024, 4096, 1, 4096, 1024, 1024, 4096, 4096] + - [15, 19239.0] + - - [4096, 2048, 1, 1024, 4096, 4096, 1024, 1024] + - [15, 20795.0] + - - [4096, 4096, 1, 1024, 4096, 4096, 1024, 1024] + - [15, 21381.0] + - - [30528, 2048, 1, 1024, 30528, 30528, 1024, 1024] + - [15, 21395.0] + - - [30528, 4096, 1, 1024, 30528, 30528, 1024, 1024] + - [15, 21505.0] + - - [128, 32768, 1, 256, 128, 128, 256, 256] + - [13, 18519.0] + - - [256, 4608, 1, 1024, 256, 256, 1024, 1024] + - [15, 17949.0] + - - [256, 4864, 1, 1024, 256, 256, 1024, 1024] + - [15, 18857.0] + - - [256, 5376, 1, 1024, 256, 256, 1024, 1024] + - [13, 15750.0] + - - [256, 5888, 1, 1024, 256, 256, 1024, 1024] + - [13, 16999.0] + - - [256, 6144, 1, 1024, 256, 256, 1024, 1024] + - [13, 17454.0] + - - [256, 6400, 1, 1024, 256, 256, 1024, 1024] + - [13, 18145.0] + - - [256, 6656, 1, 1024, 256, 256, 1024, 1024] + - [34, 16675.0] + - - [256, 7168, 1, 1024, 256, 256, 1024, 1024] + - [15, 17988.0] + - - [256, 7424, 1, 1024, 256, 256, 1024, 1024] + - [15, 18570.0] + - - [256, 7936, 1, 1024, 256, 256, 1024, 1024] + - [13, 17313.0] + - - [256, 8192, 1, 1024, 256, 256, 1024, 1024] + - [13, 18119.0] + - - [256, 8448, 1, 1024, 256, 256, 1024, 1024] + - [13, 18138.0] + - - [256, 8960, 1, 1024, 256, 256, 1024, 1024] + - [13, 18957.0] + - - [256, 9984, 1, 1024, 256, 256, 1024, 1024] + - [15, 19321.0] + - - [256, 10496, 1, 1024, 256, 256, 1024, 1024] + - [32, 18033.0] + - - [256, 11264, 1, 1024, 256, 256, 1024, 1024] + - [13, 18911.0] + - - [256, 11520, 1, 1024, 256, 256, 1024, 1024] + - [13, 19324.0] + - - [256, 11776, 1, 1024, 256, 256, 1024, 1024] + - [34, 18390.0] + - - [256, 12544, 1, 1024, 256, 256, 1024, 1024] + - [51, 19585.0] + - - [256, 13312, 1, 1024, 256, 256, 1024, 1024] + - [13, 19053.0] + - - [256, 14336, 1, 1024, 256, 256, 1024, 1024] + - [34, 18902.0] + - - [256, 14592, 1, 1024, 256, 256, 1024, 1024] + - [13, 19263.0] + - - [256, 14848, 1, 1024, 256, 256, 1024, 1024] + - [34, 19501.0] + - - [256, 15104, 1, 1024, 256, 256, 1024, 1024] + - [34, 19827.0] + - - [256, 16128, 1, 1024, 256, 256, 1024, 1024] + - [13, 19188.0] + - - [256, 18176, 1, 1024, 256, 256, 1024, 1024] + - [32, 19301.0] + - - [256, 18944, 1, 1024, 256, 256, 1024, 1024] + - [13, 19669.0] + - - [256, 19200, 1, 1024, 256, 256, 1024, 1024] + - [32, 20249.0] + - - [256, 20480, 1, 1024, 256, 256, 1024, 1024] + - [34, 20353.0] + - - [256, 20992, 1, 1024, 256, 256, 1024, 1024] + - [13, 19621.0] + - - [256, 21248, 1, 1024, 256, 256, 1024, 1024] + - [32, 20240.0] + - - [256, 21504, 1, 1024, 256, 256, 1024, 1024] + - [32, 20380.0] + - - [256, 22016, 1, 1024, 256, 256, 1024, 1024] + - [34, 20219.0] + - - [256, 22344, 1, 1024, 256, 256, 1024, 1024] + - [51, 18799.0] + - - [256, 23296, 1, 1024, 256, 256, 1024, 1024] + - [49, 19669.0] + - - [256, 23552, 1, 1024, 256, 256, 1024, 1024] + - [32, 19865.0] + - - [256, 31488, 1, 1024, 256, 256, 1024, 1024] + - [13, 20429.0] + - - [256, 32768, 1, 512, 256, 256, 512, 512] + - [13, 20443.0] + - - [256, 33536, 1, 1024, 256, 256, 1024, 1024] + - [13, 20406.0] + - - [256, 44505, 1, 1024, 256, 256, 1024, 1024] + - [32, 20474.0] + - - [512, 32768, 1, 13, 512, 512, 13, 13] + - [0, 6175.0] + - - [512, 32768, 1, 1024, 512, 512, 1024, 1024] + - [49, 20799.0] + - - [684, 8976, 1, 256, 684, 684, 256, 256] + - [13, 16728.0] + - - [1024, 1600, 1, 560, 1024, 1024, 560, 560] + - [21, 16233.0] + - - [1024, 1600, 1, 1024, 1024, 1024, 1024, 1024] + - [15, 16555.0] + - - [1024, 32768, 1, 480, 1024, 1024, 480, 480] + - [7, 21587.0] + - - [1024, 32768, 1, 1024, 1024, 1024, 1024, 1024] + - [15, 20998.0] + - - [1280, 8976, 1, 256, 1280, 1280, 256, 256] + - [41, 19669.0] + - - [1792, 8976, 1, 256, 1792, 1792, 256, 256] + - [32, 20225.0] + - - [2048, 684, 1, 512, 2048, 2048, 512, 512] + - [32, 15202.0] + - - [2048, 684, 1, 768, 2048, 2048, 768, 768] + - [23, 15934.0] + - - [2048, 960, 1, 74, 2048, 2048, 74, 74] + - [0, 8882.0] + - - [2048, 960, 1, 2048, 2048, 2048, 2048, 2048] + - [13, 16389.0] + - - [2048, 1536, 1, 512, 2048, 2048, 512, 512] + - [13, 18166.0] + - - [2048, 1536, 1, 768, 2048, 2048, 768, 768] + - [7, 18969.0] + - - [2048, 8976, 1, 256, 2048, 2048, 256, 256] + - [13, 20168.0] + - - [2304, 8976, 1, 256, 2304, 2304, 256, 256] + - [23, 20324.0] + - - [2560, 8976, 1, 256, 2560, 2560, 256, 256] + - [23, 20389.0] + - - [2816, 8976, 1, 256, 2816, 2816, 256, 256] + - [32, 20447.0] + - - [3072, 8976, 1, 256, 3072, 3072, 256, 256] + - [5, 20441.0] + - - [3328, 8976, 1, 256, 3328, 3328, 256, 256] + - [23, 20523.0] + - - [3840, 8976, 1, 256, 3840, 3840, 256, 256] + - [41, 20654.0] + - - [4096, 8976, 1, 256, 4096, 4096, 256, 256] + - [5, 20653.0] + - - [4352, 8976, 1, 256, 4352, 4352, 256, 256] + - [25, 20649.0] + - - [4608, 8976, 1, 256, 4608, 4608, 256, 256] + - [23, 20662.0] + - - [4864, 8976, 1, 256, 4864, 4864, 256, 256] + - [49, 20666.0] + - - [5120, 8976, 1, 256, 5120, 5120, 256, 256] + - [32, 20690.0] + - - [5376, 8976, 1, 256, 5376, 5376, 256, 256] + - [25, 20702.0] + - - [5632, 8976, 1, 256, 5632, 5632, 256, 256] + - [41, 20699.0] + - - [5888, 8976, 1, 256, 5888, 5888, 256, 256] + - [43, 20726.0] + - - [6144, 8976, 1, 256, 6144, 6144, 256, 256] + - [32, 20695.0] + - - [6400, 8976, 1, 256, 6400, 6400, 256, 256] + - [41, 20741.0] + - - [7168, 8976, 1, 256, 7168, 7168, 256, 256] + - [32, 20734.0] + - - [7936, 8976, 1, 256, 7936, 7936, 256, 256] + - [13, 20808.0] + - - [8192, 8976, 1, 256, 8192, 8192, 256, 256] + - [23, 20785.0] + - - [8448, 8976, 1, 256, 8448, 8448, 256, 256] + - [15, 20833.0] + - - [8960, 8976, 1, 256, 8960, 8960, 256, 256] + - [5, 20824.0] + - - [9472, 8976, 1, 256, 9472, 9472, 256, 256] + - [25, 20969.0] + - - [9728, 8976, 1, 256, 9728, 9728, 256, 256] + - [15, 20961.0] + - - [9984, 8976, 1, 256, 9984, 9984, 256, 256] + - [25, 21046.0] + - - [10240, 8976, 1, 256, 10240, 10240, 256, 256] + - [5, 20832.0] + - - [10496, 8976, 1, 256, 10496, 10496, 256, 256] + - [25, 21039.0] + - - [11264, 8976, 1, 256, 11264, 11264, 256, 256] + - [15, 20945.0] + - - [11776, 8976, 1, 256, 11776, 11776, 256, 256] + - [23, 20975.0] + - - [12544, 8976, 1, 256, 12544, 12544, 256, 256] + - [15, 21051.0] + - - [13312, 8976, 1, 256, 13312, 13312, 256, 256] + - [15, 21000.0] + - - [13568, 8976, 1, 256, 13568, 13568, 256, 256] + - [25, 21088.0] + - - [13824, 8976, 1, 256, 13824, 13824, 256, 256] + - [15, 21020.0] + - - [15104, 8976, 1, 256, 15104, 15104, 256, 256] + - [15, 21106.0] + - - [15360, 8976, 1, 256, 15360, 15360, 256, 256] + - [32, 21001.0] + - - [15872, 8976, 1, 256, 15872, 15872, 256, 256] + - [15, 21018.0] + - - [16128, 8976, 1, 256, 16128, 16128, 256, 256] + - [25, 21117.0] + - - [17152, 8976, 1, 256, 17152, 17152, 256, 256] + - [25, 21105.0] + - - [18176, 8976, 1, 256, 18176, 18176, 256, 256] + - [25, 21131.0] + - - [18688, 8976, 1, 256, 18688, 18688, 256, 256] + - [25, 21119.0] + - - [18944, 8976, 1, 256, 18944, 18944, 256, 256] + - [15, 21064.0] + - - [19712, 8976, 1, 256, 19712, 19712, 256, 256] + - [25, 21143.0] + - - [19968, 8976, 1, 256, 19968, 19968, 256, 256] + - [15, 21063.0] + - - [20480, 8976, 1, 256, 20480, 20480, 256, 256] + - [23, 21012.0] + - - [20992, 8976, 1, 256, 20992, 20992, 256, 256] + - [15, 21039.0] + - - [21248, 8976, 1, 256, 21248, 21248, 256, 256] + - [25, 21151.0] + - - [23552, 8976, 1, 256, 23552, 23552, 256, 256] + - [15, 21086.0] + - - [28672, 8976, 1, 256, 28672, 28672, 256, 256] + - [15, 21103.0] + - - [31488, 8976, 1, 256, 31488, 31488, 256, 256] + - [7, 21160.0] + - - [33536, 8976, 1, 256, 33536, 33536, 256, 256] + - [7, 21086.0] + - - [44505, 8976, 1, 256, 44505, 44505, 256, 256] + - [7, 21090.0] + - - [1024, 3840, 1, 1024, 1024, 1024, 1024, 1024] + - [15, 20510.0] + - - [1024, 3840, 1, 4096, 1024, 1024, 4096, 4096] + - [15, 21018.0] + - - [1024, 3968, 1, 1024, 1024, 1024, 1024, 1024] + - [13, 18892.0] + - - [1024, 3968, 1, 4096, 1024, 1024, 4096, 4096] + - [15, 19130.0] + - - [1024, 6528, 1, 1024, 1024, 1024, 1024, 1024] + - [13, 20096.0] + - - [1024, 6528, 1, 4096, 1024, 1024, 4096, 4096] + - [34, 18296.0] + - - [1024, 7104, 1, 1024, 1024, 1024, 1024, 1024] + - [34, 18669.0] + - - [1024, 7104, 1, 4096, 1024, 1024, 4096, 4096] + - [36, 16245.0] + - - [1024, 7200, 1, 1024, 1024, 1024, 1024, 1024] + - [15, 19288.0] + - - [1024, 7200, 1, 4096, 1024, 1024, 4096, 4096] + - [36, 16146.0] + - - [1024, 8064, 1, 1024, 1024, 1024, 1024, 1024] + - [15, 19556.0] + - - [1024, 8064, 1, 4096, 1024, 1024, 4096, 4096] + - [54, 15679.0] + - - [1024, 8160, 1, 1024, 1024, 1024, 1024, 1024] + - [15, 19777.0] + - - [1024, 8160, 1, 4096, 1024, 1024, 4096, 4096] + - [36, 15727.0] + - - [1024, 9216, 1, 1024, 1024, 1024, 1024, 1024] + - [15, 20129.0] + - - [1024, 9216, 1, 4096, 1024, 1024, 4096, 4096] + - [52, 13679.0] + - - [1024, 9520, 1, 1024, 1024, 1024, 1024, 1024] + - [13, 20187.0] + - - [1024, 9520, 1, 4096, 1024, 1024, 4096, 4096] + - [36, 16510.0] + - - [1024, 10064, 1, 1024, 1024, 1024, 1024, 1024] + - [15, 20690.0] + - - [1024, 10064, 1, 4096, 1024, 1024, 4096, 4096] + - [54, 17418.0] + - - [1024, 10080, 1, 1024, 1024, 1024, 1024, 1024] + - [15, 20791.0] + - - [1024, 10080, 1, 4096, 1024, 1024, 4096, 4096] + - [15, 19000.0] + - - [1024, 10200, 1, 1024, 1024, 1024, 1024, 1024] + - [15, 20910.0] + - - [1024, 10200, 1, 4096, 1024, 1024, 4096, 4096] + - [15, 19473.0] + - - [4096, 3840, 1, 1024, 4096, 4096, 1024, 1024] + - [15, 21400.0] + - - [4096, 3968, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 20888.0] + - - [4096, 6528, 1, 1024, 4096, 4096, 1024, 1024] + - [15, 21179.0] + - - [4096, 7104, 1, 1024, 4096, 4096, 1024, 1024] + - [15, 21437.0] + - - [4096, 7200, 1, 1024, 4096, 4096, 1024, 1024] + - [15, 20977.0] + - - [4096, 8064, 1, 1024, 4096, 4096, 1024, 1024] + - [15, 21193.0] + - - [4096, 8160, 1, 1024, 4096, 4096, 1024, 1024] + - [15, 21475.0] + - - [4096, 9216, 1, 1024, 4096, 4096, 1024, 1024] + - [15, 21554.0] + - - [4096, 9520, 1, 1024, 4096, 4096, 1024, 1024] + - [15, 21122.0] + - - [4096, 10064, 1, 1024, 4096, 4096, 1024, 1024] + - [15, 21143.0] + - - [4096, 10080, 1, 1024, 4096, 4096, 1024, 1024] + - [15, 21238.0] + - - [4096, 10200, 1, 1024, 4096, 4096, 1024, 1024] + - [15, 21400.0] + - - [42720, 3968, 1, 1024, 42720, 42720, 1024, 1024] + - [34, 21155.0] + - - [42720, 6528, 1, 1024, 42720, 42720, 1024, 1024] + - [51, 21279.0] + - - [42720, 7104, 1, 1024, 42720, 42720, 1024, 1024] + - [15, 21270.0] + - - [42720, 7200, 1, 1024, 42720, 42720, 1024, 1024] + - [51, 20893.0] + - - [42720, 9520, 1, 1024, 42720, 42720, 1024, 1024] + - [15, 20995.0] + - - [42720, 10080, 1, 1024, 42720, 42720, 1024, 1024] + - [15, 21104.0] + - - [1024, 3240, 1, 1024, 1024, 1024, 1024, 1024] + - [13, 18185.0] + - - [1024, 3240, 1, 4096, 1024, 1024, 4096, 4096] + - [15, 17465.0] + - - [1024, 3960, 1, 1024, 1024, 1024, 1024, 1024] + - [32, 17632.0] + - - [1024, 3960, 1, 4096, 1024, 1024, 4096, 4096] + - [15, 18428.0] + - - [4096, 3240, 1, 1024, 4096, 4096, 1024, 1024] + - [15, 20595.0] + - - [4096, 3960, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 20602.0] + - - [42720, 3960, 1, 1024, 42720, 42720, 1024, 1024] + - [34, 21216.0] + - - [7680, 8192, 1, 8192, 7680, 7680, 8192, 8192] + - [25, 18582.0] + - - [3840, 4096, 1, 4096, 3840, 3840, 4096, 4096] + - [54, 17226.0] + - - [1920, 2048, 1, 2048, 1920, 1920, 2048, 2048] + - [15, 18869.0] + - - [8192, 7680, 1, 8192, 8192, 8192, 8192, 8192] + - [15, 18190.0] + - - [4096, 3840, 1, 4096, 4096, 4096, 4096, 4096] + - [49, 17969.0] + - - [2048, 1920, 1, 2048, 2048, 2048, 2048, 2048] + - [13, 18585.0] + - - [512, 512, 16, 64, 512, 512, 64, 64] + - [33, 12509.0] + - - [512, 512, 128, 64, 512, 512, 64, 64] + - [42, 16976.0] + - - [4096, 512, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 17905.0] + - - [30522, 616, 1, 1024, 30522, 30522, 1024, 1024] + - [13, 19930.0] + - - [128, 128, 128, 64, 128, 128, 64, 64] + - [42, 9883.0] + - - [128, 128, 160, 64, 128, 128, 64, 64] + - [4, 11716.0] + - - [1024, 1280, 1, 1024, 1024, 1024, 1024, 1024] + - [15, 19554.0] + - - [1024, 1280, 1, 4096, 1024, 1024, 4096, 4096] + - [15, 19689.0] + - - [4096, 1280, 1, 1024, 4096, 4096, 1024, 1024] + - [15, 20171.0] + - - [30522, 160, 1, 1024, 30522, 30522, 1024, 1024] + - [32, 13011.0] + - - [30522, 200, 1, 1024, 30522, 30522, 1024, 1024] + - [13, 16000.0] + - - [128, 128, 624, 64, 128, 128, 64, 64] + - [6, 15259.0] + - - [1024, 4992, 1, 1024, 1024, 1024, 1024, 1024] + - [15, 20226.0] + - - [1024, 4992, 1, 4096, 1024, 1024, 4096, 4096] + - [15, 20219.0] + - - [4096, 4992, 1, 1024, 4096, 4096, 1024, 1024] + - [15, 21046.0] + - - [30522, 780, 1, 1024, 30522, 30522, 1024, 1024] + - [13, 18200.0] + - - [30522, 308, 1, 1024, 30522, 30522, 1024, 1024] + - [32, 16575.0] + - - [128, 128, 640, 64, 128, 128, 64, 64] + - [40, 12662.0] + - - [1024, 5120, 1, 1024, 1024, 1024, 1024, 1024] + - [15, 20735.0] + - - [1024, 5120, 1, 4096, 1024, 1024, 4096, 4096] + - [34, 18292.0] + - - [4096, 5120, 1, 1024, 4096, 4096, 1024, 1024] + - [15, 21676.0] + - - [30522, 800, 1, 1024, 30522, 30522, 1024, 1024] + - [13, 18675.0] + - - [128, 128, 656, 64, 128, 128, 64, 64] + - [50, 14806.0] + - - [1024, 5248, 1, 1024, 1024, 1024, 1024, 1024] + - [13, 19381.0] + - - [1024, 5248, 1, 4096, 1024, 1024, 4096, 4096] + - [34, 18198.0] + - - [4096, 5248, 1, 1024, 4096, 4096, 1024, 1024] + - [15, 21021.0] + - - [30522, 820, 1, 1024, 30522, 30522, 1024, 1024] + - [13, 19085.0] + - - [512, 512, 80, 64, 512, 512, 64, 64] + - [6, 17059.0] + - - [1024, 2560, 1, 1024, 1024, 1024, 1024, 1024] + - [15, 20229.0] + - - [1024, 2560, 1, 4096, 1024, 1024, 4096, 4096] + - [15, 20553.0] + - - [4096, 2560, 1, 1024, 4096, 4096, 1024, 1024] + - [15, 21324.0] + - - [30522, 385, 1, 1024, 30522, 30522, 1024, 1024] + - [51, 15828.0] + - - [512, 512, 96, 64, 512, 512, 64, 64] + - [33, 17015.0] + - - [1024, 3072, 1, 1024, 1024, 1024, 1024, 1024] + - [15, 19119.0] + - - [1024, 3072, 1, 4096, 1024, 1024, 4096, 4096] + - [15, 19639.0] + - - [4096, 3072, 1, 1024, 4096, 4096, 1024, 1024] + - [15, 20809.0] + - - [30522, 462, 1, 1024, 30522, 30522, 1024, 1024] + - [15, 18815.0] + - - [4096, 1024, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 19281.0] + - - [128, 128, 144, 64, 128, 128, 64, 64] + - [22, 8956.0] + - - [1024, 1152, 1, 1024, 1024, 1024, 1024, 1024] + - [15, 16511.0] + - - [1024, 1152, 1, 4096, 1024, 1024, 4096, 4096] + - [15, 17793.0] + - - [4096, 1152, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 19168.0] + - - [30522, 180, 1, 1024, 30522, 30522, 1024, 1024] + - [32, 14600.0] + - - [1024, 32768, 1, 479, 1024, 1024, 479, 479] + - [7, 21265.0] + - - [1024, 8192, 1, 1024, 1024, 1024, 1024, 1024] + - [15, 20841.0] + - - [1024, 8192, 1, 4096, 1024, 1024, 4096, 4096] + - [54, 15604.0] + - - [1024, 9600, 1, 1024, 1024, 1024, 1024, 1024] + - [13, 20534.0] + - - [1024, 9600, 1, 4096, 1024, 1024, 4096, 4096] + - [54, 16667.0] + - - [4096, 8192, 1, 1024, 4096, 4096, 1024, 1024] + - [15, 21674.0] + - - [4096, 9600, 1, 1024, 4096, 4096, 1024, 1024] + - [15, 21342.0] + - - [33712, 8192, 1, 1024, 33712, 33712, 1024, 1024] + - [15, 20991.0] + - - [33712, 9600, 1, 1024, 33712, 33712, 1024, 1024] + - [15, 20419.0] + - - [1024, 1024, 128, 96, 1024, 1024, 96, 96] + - [5, 19924.0] + - - [30592, 4096, 1, 1024, 30592, 30592, 1024, 1024] + - [15, 21622.0] + - - [1536, 8192, 1, 1536, 1536, 1536, 1536, 1536] + - [15, 21211.0] + - - [3072, 8192, 1, 1024, 3072, 3072, 1024, 1024] + - [15, 21619.0] + - - [3072, 2048, 1, 1024, 3072, 3072, 1024, 1024] + - [15, 20091.0] + - - [50304, 8192, 1, 1024, 50304, 50304, 1024, 1024] + - [51, 21222.0] + - - [2048, 1024, 1, 8192, 2048, 2048, 8192, 8192] + - [15, 16317.0] + - - [50304, 2048, 1, 1024, 50304, 50304, 1024, 1024] + - [34, 22040.0] + - - [1536, 8192, 1, 6144, 1536, 1536, 6144, 6144] + - [51, 19228.0] + - - [50304, 4096, 1, 1536, 50304, 50304, 1536, 1536] + - [15, 21051.0] + - - [8192, 1024, 1, 2048, 8192, 8192, 2048, 2048] + - [13, 19865.0] + - - [2560, 2048, 1, 640, 2560, 2560, 640, 640] + - [7, 20383.0] + - - [1024, 1024, 128, 64, 1024, 1024, 64, 64] + - [13, 17705.0] + - - [2048, 1024, 1, 2048, 2048, 2048, 2048, 2048] + - [13, 17793.0] + - - [1536, 4096, 1, 1536, 1536, 1536, 1536, 1536] + - [15, 20675.0] + - - [1024, 1024, 64, 64, 1024, 1024, 64, 64] + - [5, 17509.0] + - - [30592, 8192, 1, 1024, 30592, 30592, 1024, 1024] + - [15, 20817.0] + - - [50304, 16384, 1, 1024, 50304, 50304, 1024, 1024] + - [51, 21436.0] + - - [4608, 4096, 1, 1536, 4608, 4608, 1536, 1536] + - [15, 21679.0] + - - [2560, 2048, 1, 2560, 2560, 2560, 2560, 2560] + - [15, 21221.0] + - - [7680, 2048, 1, 2560, 7680, 7680, 2560, 2560] + - [43, 20429.0] + - - [50304, 4096, 1, 1024, 50304, 50304, 1024, 1024] + - [34, 21062.0] + - - [1920, 2048, 1, 2560, 1920, 1920, 2560, 2560] + - [51, 19781.0] + - - [1024, 1024, 64, 96, 1024, 1024, 96, 96] + - [13, 19447.0] + - - [6144, 4096, 1, 1536, 6144, 6144, 1536, 1536] + - [15, 21587.0] + - - [1536, 4096, 1, 6144, 1536, 1536, 6144, 6144] + - [34, 18681.0] + - - [512, 512, 256, 64, 512, 512, 64, 64] + - [13, 15991.0] + - - [50304, 8192, 1, 1536, 50304, 50304, 1536, 1536] + - [51, 21150.0] + - - [6144, 8192, 1, 1536, 6144, 6144, 1536, 1536] + - [15, 21554.0] + - - [4096, 16384, 1, 1024, 4096, 4096, 1024, 1024] + - [15, 21285.0] + - - [30592, 1024, 1, 2048, 30592, 30592, 2048, 2048] + - [34, 20446.0] + - - [1024, 16384, 1, 4096, 1024, 1024, 4096, 4096] + - [51, 18632.0] + - - [512, 512, 40, 64, 512, 512, 64, 64] + - [6, 14814.0] + - - [6144, 1024, 1, 2048, 6144, 6144, 2048, 2048] + - [15, 20313.0] + - - [4608, 8192, 1, 1536, 4608, 4608, 1536, 1536] + - [15, 21615.0] + - - [30592, 2048, 1, 1024, 30592, 30592, 1024, 1024] + - [15, 21414.0] + - - [3072, 16384, 1, 1024, 3072, 3072, 1024, 1024] + - [15, 21237.0] + - - [1024, 1024, 256, 64, 1024, 1024, 64, 64] + - [5, 17750.0] + - - [1024, 16384, 1, 1024, 1024, 1024, 1024, 1024] + - [15, 21344.0] + - - [1024, 1024, 32, 64, 1024, 1024, 64, 64] + - [14, 17279.0] + - - [3072, 4096, 1, 1024, 3072, 3072, 1024, 1024] + - [15, 20844.0] + - - [30528, 8192, 1, 1024, 30528, 30528, 1024, 1024] + - [15, 20660.0] + - - [128, 128, 1024, 64, 128, 128, 64, 64] + - [48, 12996.0] + - - [1024, 3456, 1, 1024, 1024, 1024, 1024, 1024] + - [13, 19073.0] + - - [1024, 3456, 1, 480, 1024, 1024, 480, 480] + - [32, 19396.0] + - - [1024, 4096, 1, 480, 1024, 1024, 480, 480] + - [5, 19475.0] + - - [1024, 6912, 1, 1024, 1024, 1024, 1024, 1024] + - [15, 20180.0] + - - [1024, 6912, 1, 480, 1024, 1024, 480, 480] + - [7, 20863.0] + - - [128, 55296, 1, 256, 128, 128, 256, 256] + - [49, 19687.0] + - - [256, 55296, 1, 512, 256, 256, 512, 512] + - [49, 20466.0] + - - [256, 6912, 1, 512, 256, 256, 512, 512] + - [13, 17490.0] + - - [512, 3456, 1, 1024, 512, 512, 1024, 1024] + - [49, 17293.0] + - - [512, 3456, 1, 13, 512, 512, 13, 13] + - [22, 2014.0] + - - [512, 4096, 1, 1024, 512, 512, 1024, 1024] + - [49, 17730.0] + - - [512, 4096, 1, 13, 512, 512, 13, 13] + - [19, 3341.0] + - - [512, 55296, 1, 13, 512, 512, 13, 13] + - [42, 7713.0] + - - [512, 6912, 1, 1024, 512, 512, 1024, 1024] + - [32, 19264.0] + - - [512, 6912, 1, 13, 512, 512, 13, 13] + - [40, 4657.0] + - - [30528, 640, 1, 1024, 30528, 30528, 1024, 1024] + - [13, 20817.0] + - - [30528, 1280, 1, 1024, 30528, 30528, 1024, 1024] + - [51, 21353.0] + - - [30528, 1600, 1, 1024, 30528, 30528, 1024, 1024] + - [13, 20281.0] + - - [1024, 10240, 1, 1024, 1024, 1024, 1024, 1024] + - [15, 21159.0] + - - [4096, 10240, 1, 1024, 4096, 4096, 1024, 1024] + - [15, 21525.0] + - - [1024, 10240, 1, 4096, 1024, 1024, 4096, 4096] + - [15, 19518.0] + - - [128, 128, 1280, 64, 128, 128, 64, 64] + - [41, 17050.0] + - - [1024, 10496, 1, 4096, 1024, 1024, 4096, 4096] + - [34, 19573.0] + - - [30528, 1640, 1, 1024, 30528, 30528, 1024, 1024] + - [13, 20741.0] + - - [4096, 10496, 1, 1024, 4096, 4096, 1024, 1024] + - [15, 21453.0] + - - [1024, 10496, 1, 1024, 1024, 1024, 1024, 1024] + - [15, 20645.0] + - - [128, 128, 1312, 64, 128, 128, 64, 64] + - [24, 13995.0] + - - [30528, 160, 1, 1024, 30528, 30528, 1024, 1024] + - [32, 12859.0] + - - [30528, 240, 1, 1024, 30528, 30528, 1024, 1024] + - [49, 19172.0] + - - [1024, 6144, 1, 1024, 1024, 1024, 1024, 1024] + - [15, 20247.0] + - - [4096, 6144, 1, 1024, 4096, 4096, 1024, 1024] + - [15, 21508.0] + - - [1024, 6144, 1, 4096, 1024, 1024, 4096, 4096] + - [51, 19358.0] + - - [512, 512, 192, 64, 512, 512, 64, 64] + - [32, 17606.0] + - - [1024, 10224, 1, 1024, 1024, 1024, 1024, 1024] + - [15, 20978.0] + - - [1024, 10192, 1, 1024, 1024, 1024, 1024, 1024] + - [15, 21003.0] + - - [1024, 10208, 1, 1024, 1024, 1024, 1024, 1024] + - [15, 21127.0] + - - [1024, 10224, 1, 4096, 1024, 1024, 4096, 4096] + - [34, 19417.0] + - - [4096, 10224, 1, 1024, 4096, 4096, 1024, 1024] + - [15, 21437.0] + - - [3072, 10224, 1, 1024, 3072, 3072, 1024, 1024] + - [15, 21529.0] + - - [3072, 10240, 1, 1024, 3072, 3072, 1024, 1024] + - [15, 21724.0] + - - [1024, 10192, 1, 4096, 1024, 1024, 4096, 4096] + - [34, 18896.0] + - - [4096, 10192, 1, 1024, 4096, 4096, 1024, 1024] + - [15, 21302.0] + - - [3072, 10192, 1, 1024, 3072, 3072, 1024, 1024] + - [15, 21454.0] + - - [3072, 10200, 1, 1024, 3072, 3072, 1024, 1024] + - [15, 21482.0] + - - [1024, 10184, 1, 1024, 1024, 1024, 1024, 1024] + - [15, 21016.0] + - - [3072, 10208, 1, 1024, 3072, 3072, 1024, 1024] + - [15, 21596.0] + - - [1024, 10208, 1, 4096, 1024, 1024, 4096, 4096] + - [15, 19524.0] + - - [4096, 10208, 1, 1024, 4096, 4096, 1024, 1024] + - [15, 21399.0] + - - [2048, 10224, 1, 1024, 2048, 2048, 1024, 1024] + - [15, 21417.0] + - - [2048, 10240, 1, 1024, 2048, 2048, 1024, 1024] + - [15, 21631.0] + - - [1024, 10120, 1, 1024, 1024, 1024, 1024, 1024] + - [15, 20929.0] + - - [2048, 10192, 1, 1024, 2048, 2048, 1024, 1024] + - [15, 21382.0] + - - [1024, 10152, 1, 1024, 1024, 1024, 1024, 1024] + - [15, 20934.0] + - - [3072, 10080, 1, 1024, 3072, 3072, 1024, 1024] + - [15, 21391.0] + - - [1024, 2048, 1, 49, 1024, 1024, 49, 49] + - [28, 10055.0] + - - [4608, 512, 1, 49, 4608, 4608, 49, 49] + - [18, 11469.0] + - - [256, 256, 25, 12544, 256, 256, 12544, 12544] + - [35, 15436.0] + - - [256, 256, 49, 3200, 256, 256, 3200, 3200] + - [43, 21000.0] + - - [256, 256, 25, 6272, 256, 256, 6272, 6272] + - [5, 19940.0] + - - [256, 256, 49, 6400, 256, 256, 6400, 6400] + - [34, 18659.0] + - - [512, 512, 49, 1152, 512, 512, 1152, 1152] + - [43, 21570.0] + - - [512, 512, 25, 2048, 512, 512, 2048, 2048] + - [32, 18355.0] + - - [512, 512, 49, 2304, 512, 512, 2304, 2304] + - [15, 20824.0] + - - [512, 512, 25, 4096, 512, 512, 4096, 4096] + - [36, 17887.0] + - - [128, 128, 2048, 64, 128, 128, 64, 64] + - [22, 12090.0] + - - [30528, 2560, 1, 1024, 30528, 30528, 1024, 1024] + - [15, 21520.0] + - - [128, 128, 1536, 64, 128, 128, 64, 64] + - [50, 14738.0] + - - [1024, 12288, 1, 1024, 1024, 1024, 1024, 1024] + - [15, 20947.0] + - - [1024, 12288, 1, 4096, 1024, 1024, 4096, 4096] + - [34, 18801.0] + - - [30528, 1920, 1, 1024, 30528, 30528, 1024, 1024] + - [13, 21144.0] + - - [4096, 12288, 1, 1024, 4096, 4096, 1024, 1024] + - [15, 21465.0] + - - [128, 128, 81, 12544, 128, 128, 12544, 12544] + - [35, 12593.0] + - - [128, 128, 121, 9216, 128, 128, 9216, 9216] + - [52, 12491.0] + - - [128, 128, 169, 6400, 128, 128, 6400, 6400] + - [16, 14827.0] + - - [256, 256, 36, 4096, 256, 256, 4096, 4096] + - [36, 15856.0] + - - [256, 256, 49, 2304, 256, 256, 2304, 2304] + - [51, 18995.0] + - - [256, 256, 64, 2304, 256, 256, 2304, 2304] + - [51, 17753.0] + - - [256, 256, 81, 4096, 256, 256, 4096, 4096] + - [54, 16097.0] + - - [256, 256, 121, 2304, 256, 256, 2304, 2304] + - [51, 18568.0] + - - [256, 256, 169, 2304, 256, 256, 2304, 2304] + - [51, 19661.0] + - - [512, 512, 81, 1024, 512, 512, 1024, 1024] + - [15, 20348.0] + - - [512, 512, 121, 1024, 512, 512, 1024, 1024] + - [15, 20402.0] + - - [512, 512, 169, 1024, 512, 512, 1024, 1024] + - [15, 20631.0] + - - [512, 512, 36, 1024, 512, 512, 1024, 1024] + - [15, 19942.0] + - - [512, 512, 49, 1024, 512, 512, 1024, 1024] + - [15, 20151.0] + - - [512, 512, 64, 1024, 512, 512, 1024, 1024] + - [15, 20027.0] + - - [128, 128, 192, 64, 128, 128, 64, 64] + - [4, 12172.0] + - - [768, 2048, 1, 768, 768, 768, 768, 768] + - [13, 18578.0] + - - [3072, 2048, 1, 768, 3072, 3072, 768, 768] + - [32, 20195.0] + - - [768, 2048, 1, 3072, 768, 768, 3072, 3072] + - [32, 17091.0] + - - [384, 384, 144, 64, 384, 384, 64, 64] + - [41, 17690.0] + - - [768, 4608, 1, 768, 768, 768, 768, 768] + - [49, 19386.0] + - - [3072, 4608, 1, 768, 3072, 3072, 768, 768] + - [7, 21234.0] + - - [768, 4608, 1, 3072, 768, 768, 3072, 3072] + - [15, 19413.0] + - - [512, 512, 48, 64, 512, 512, 64, 64] + - [24, 16190.0] + - - [128, 128, 256, 64, 128, 128, 64, 64] + - [22, 11074.0] + - - [384, 384, 192, 64, 384, 384, 64, 64] + - [49, 17969.0] + - - [1024, 4608, 1, 1024, 1024, 1024, 1024, 1024] + - [13, 19605.0] + - - [4096, 4608, 1, 1024, 4096, 4096, 1024, 1024] + - [15, 21464.0] + - - [1024, 4608, 1, 4096, 1024, 1024, 4096, 4096] + - [15, 19301.0] + - - [2880, 3072, 1, 3072, 2880, 2880, 3072, 3072] + - [15, 20865.0] + - - [3072, 3072, 1, 3072, 3072, 3072, 3072, 3072] + - [15, 21124.0] + - - [3072, 512, 1, 3072, 3072, 3072, 3072, 3072] + - [14, 16792.0] + - - [4096, 512, 1, 4096, 4096, 4096, 4096, 4096] + - [13, 16421.0] + - - [512, 3072, 1, 3072, 512, 512, 3072, 3072] + - [32, 17709.0] + - - [512, 4096, 1, 4096, 512, 512, 4096, 4096] + - [32, 17928.0] + - - [512, 8192, 1, 8192, 512, 512, 8192, 8192] + - [54, 16468.0] + - - [8192, 512, 1, 8192, 8192, 8192, 8192, 8192] + - [15, 17951.0] + - - [256, 256, 36, 432, 256, 256, 432, 432] + - [15, 17664.0] + - - [256, 256, 36, 456, 256, 256, 456, 456] + - [34, 17683.0] + - - [256, 256, 36, 504, 256, 256, 504, 504] + - [1, 17144.0] + - - [256, 256, 49, 1120, 256, 256, 1120, 1120] + - [7, 20696.0] + - - [256, 256, 36, 442, 256, 256, 442, 442] + - [43, 16178.0] + - - [256, 256, 49, 950, 256, 256, 950, 950] + - [25, 19375.0] + - - [256, 256, 64, 616, 256, 256, 616, 616] + - [41, 18717.0] + - - [256, 256, 64, 660, 256, 256, 660, 660] + - [23, 19070.0] + - - [256, 256, 36, 408, 256, 256, 408, 408] + - [43, 16097.0] + - - [256, 256, 49, 1008, 256, 256, 1008, 1008] + - [51, 17375.0] + - - [256, 256, 36, 462, 256, 256, 462, 462] + - [7, 16411.0] + - - [256, 256, 36, 468, 256, 256, 468, 468] + - [43, 16411.0] + - - [256, 256, 36, 494, 256, 256, 494, 494] + - [41, 16574.0] + - - [512, 512, 64, 48, 512, 512, 48, 48] + - [50, 16080.0] + - - [256, 256, 64, 140, 256, 256, 140, 140] + - [42, 14636.0] + - - [512, 512, 64, 56, 512, 512, 56, 56] + - [19, 15372.0] + - - [512, 512, 49, 90, 512, 512, 90, 90] + - [42, 16142.0] + - - [512, 512, 49, 60, 512, 512, 60, 60] + - [24, 14547.0] + - - [256, 256, 49, 864, 256, 256, 864, 864] + - [43, 19487.0] + - - [256, 256, 64, 224, 256, 256, 224, 224] + - [41, 17680.0] + - - [256, 256, 64, 176, 256, 256, 176, 176] + - [23, 16800.0] + - - [256, 256, 64, 154, 256, 256, 154, 154] + - [41, 14869.0] + - - [512, 512, 49, 80, 512, 512, 80, 80] + - [49, 17219.0] + - - [256, 256, 49, 1200, 256, 256, 1200, 1200] + - [7, 20627.0] + - - [256, 256, 64, 704, 256, 256, 704, 704] + - [23, 19396.0] + - - [256, 256, 64, 768, 256, 256, 768, 768] + - [5, 18652.0] + - - [256, 256, 49, 1160, 256, 256, 1160, 1160] + - [5, 15326.0] + - - [256, 256, 49, 320, 256, 256, 320, 320] + - [51, 17971.0] + - - [512, 512, 49, 70, 512, 512, 70, 70] + - [33, 15081.0] + - - [256, 256, 49, 1240, 256, 256, 1240, 1240] + - [7, 20536.0] + - - [256, 256, 36, 384, 256, 256, 384, 384] + - [23, 17389.0] + - - [1024, 2048, 1, 888, 1024, 1024, 888, 888] + - [1, 18301.0] + - - [1024, 2048, 1, 713, 1024, 1024, 713, 713] + - [5, 17521.0] + - - [1024, 2048, 1, 660, 1024, 1024, 660, 660] + - [1, 17677.0] + - - [1024, 2048, 1, 726, 1024, 1024, 726, 726] + - [1, 17659.0] + - - [1024, 2048, 1, 672, 1024, 1024, 672, 672] + - [1, 18485.0] + - - [1024, 2048, 1, 850, 1024, 1024, 850, 850] + - [5, 17719.0] + - - [1024, 2048, 1, 805, 1024, 1024, 805, 805] + - [19, 17756.0] + - - [1024, 2048, 1, 864, 1024, 1024, 864, 864] + - [5, 17969.0] + - - [1024, 2048, 1, 768, 1024, 1024, 768, 768] + - [13, 17491.0] + - - [1024, 2048, 1, 950, 1024, 1024, 950, 950] + - [1, 17952.0] + - - [256, 128, 49, 1152, 256, 256, 1152, 1152] + - [5, 18270.0] + - - [256, 128, 121, 120, 256, 256, 120, 120] + - [24, 13086.0] + - - [256, 128, 169, 120, 256, 256, 120, 120] + - [24, 15454.0] + - - [256, 128, 36, 120, 256, 256, 120, 120] + - [1, 9983.0] + - - [256, 128, 49, 120, 256, 256, 120, 120] + - [14, 10824.0] + - - [256, 128, 64, 120, 256, 256, 120, 120] + - [50, 11470.0] + - - [256, 128, 36, 12000, 256, 256, 12000, 12000] + - [49, 18332.0] + - - [256, 128, 49, 1216, 256, 256, 1216, 1216] + - [5, 19086.0] + - - [256, 128, 121, 18, 256, 256, 18, 18] + - [4, 4991.0] + - - [256, 128, 169, 18, 256, 256, 18, 18] + - [4, 5651.0] + - - [256, 128, 36, 18, 256, 256, 18, 18] + - [0, 4719.0] + - - [256, 128, 49, 18, 256, 256, 18, 18] + - [9, 4833.0] + - - [256, 128, 64, 18, 256, 256, 18, 18] + - [9, 4815.0] + - - [256, 128, 36, 1800, 256, 256, 1800, 1800] + - [5, 17778.0] + - - [256, 128, 121, 19, 256, 256, 19, 19] + - [12, 5483.0] + - - [256, 128, 169, 19, 256, 256, 19, 19] + - [4, 6377.0] + - - [256, 128, 36, 19, 256, 256, 19, 19] + - [9, 3013.0] + - - [256, 128, 49, 19, 256, 256, 19, 19] + - [4, 3459.0] + - - [256, 128, 64, 19, 256, 256, 19, 19] + - [0, 3977.0] + - - [256, 128, 36, 1900, 256, 256, 1900, 1900] + - [41, 17201.0] + - - [256, 128, 49, 480, 256, 256, 480, 480] + - [41, 16567.0] + - - [256, 128, 81, 480, 256, 256, 480, 480] + - [5, 16512.0] + - - [256, 128, 64, 5880, 256, 256, 5880, 5880] + - [5, 17576.0] + - - [256, 128, 49, 72, 256, 256, 72, 72] + - [4, 10194.0] + - - [256, 128, 81, 72, 256, 256, 72, 72] + - [4, 12458.0] + - - [256, 128, 49, 76, 256, 256, 76, 76] + - [4, 10876.0] + - - [256, 128, 81, 76, 256, 256, 76, 76] + - [4, 11838.0] + - - [256, 128, 49, 7680, 256, 256, 7680, 7680] + - [53, 13712.0] + - - [256, 128, 64, 882, 256, 256, 882, 882] + - [5, 18011.0] + - - [256, 128, 64, 931, 256, 256, 931, 931] + - [23, 18149.0] + - - [256, 256, 49, 1152, 256, 256, 1152, 1152] + - [7, 20639.0] + - - [256, 256, 36, 12000, 256, 256, 12000, 12000] + - [25, 19767.0] + - - [256, 256, 49, 1216, 256, 256, 1216, 1216] + - [21, 20813.0] + - - [256, 256, 36, 1800, 256, 256, 1800, 1800] + - [7, 19185.0] + - - [256, 256, 36, 1900, 256, 256, 1900, 1900] + - [15, 15230.0] + - - [256, 256, 64, 5880, 256, 256, 5880, 5880] + - [5, 20661.0] + - - [256, 256, 49, 7680, 256, 256, 7680, 7680] + - [36, 17451.0] + - - [256, 256, 64, 882, 256, 256, 882, 882] + - [5, 20129.0] + - - [256, 256, 64, 931, 256, 256, 931, 931] + - [41, 20013.0] + - - [512, 256, 81, 1080, 512, 512, 1080, 1080] + - [43, 20944.0] + - - [512, 256, 25, 12000, 512, 512, 12000, 12000] + - [25, 21915.0] + - - [512, 256, 81, 162, 512, 512, 162, 162] + - [5, 18458.0] + - - [512, 256, 81, 171, 512, 512, 171, 171] + - [41, 19094.0] + - - [512, 256, 25, 1800, 512, 512, 1800, 1800] + - [7, 21316.0] + - - [512, 256, 25, 1900, 512, 512, 1900, 1900] + - [27, 16179.0] + - - [512, 256, 121, 1920, 512, 512, 1920, 1920] + - [43, 21679.0] + - - [512, 256, 169, 1920, 512, 512, 1920, 1920] + - [25, 22068.0] + - - [512, 256, 49, 1920, 512, 512, 1920, 1920] + - [43, 21324.0] + - - [512, 256, 121, 288, 512, 512, 288, 288] + - [41, 20575.0] + - - [512, 256, 169, 288, 512, 512, 288, 288] + - [43, 20795.0] + - - [512, 256, 49, 288, 512, 512, 288, 288] + - [41, 19758.0] + - - [512, 256, 25, 3000, 512, 512, 3000, 3000] + - [25, 21569.0] + - - [512, 256, 81, 3000, 512, 512, 3000, 3000] + - [43, 21402.0] + - - [512, 256, 121, 304, 512, 512, 304, 304] + - [23, 20622.0] + - - [512, 256, 169, 304, 512, 512, 304, 304] + - [7, 20865.0] + - - [512, 256, 49, 304, 512, 512, 304, 304] + - [7, 19790.0] + - - [512, 256, 25, 450, 512, 512, 450, 450] + - [5, 19484.0] + - - [512, 256, 81, 450, 512, 512, 450, 450] + - [5, 20121.0] + - - [512, 256, 25, 475, 512, 512, 475, 475] + - [23, 19593.0] + - - [512, 256, 81, 475, 512, 512, 475, 475] + - [5, 19969.0] + - - [512, 256, 121, 480, 512, 512, 480, 480] + - [25, 21039.0] + - - [512, 256, 169, 480, 512, 512, 480, 480] + - [7, 21387.0] + - - [512, 256, 49, 5880, 512, 512, 5880, 5880] + - [25, 21678.0] + - - [512, 256, 121, 72, 512, 512, 72, 72] + - [24, 16355.0] + - - [512, 256, 169, 72, 512, 512, 72, 72] + - [5, 17043.0] + - - [512, 256, 121, 76, 512, 512, 76, 76] + - [24, 16718.0] + - - [512, 256, 169, 76, 512, 512, 76, 76] + - [5, 17053.0] + - - [512, 256, 49, 882, 512, 512, 882, 882] + - [7, 20881.0] + - - [512, 256, 49, 931, 512, 512, 931, 931] + - [25, 20750.0] + - - [2304, 512, 1, 100, 2304, 2304, 100, 100] + - [19, 13497.0] + - - [2304, 512, 1, 361, 2304, 2304, 361, 361] + - [1, 17034.0] + - - [4608, 510, 1, 100, 4608, 4608, 100, 100] + - [0, 10218.0] + - - [4608, 510, 1, 361, 4608, 4608, 361, 361] + - [41, 15875.0] + - - [340, 256, 49, 1152, 340, 340, 1152, 1152] + - [5, 18095.0] + - - [340, 256, 36, 120, 340, 340, 120, 120] + - [42, 13381.0] + - - [340, 256, 49, 120, 340, 340, 120, 120] + - [42, 13847.0] + - - [340, 256, 64, 120, 340, 340, 120, 120] + - [5, 14259.0] + - - [340, 256, 36, 12000, 340, 340, 12000, 12000] + - [23, 18373.0] + - - [340, 256, 49, 1216, 340, 340, 1216, 1216] + - [5, 18034.0] + - - [340, 256, 36, 18, 340, 340, 18, 18] + - [22, 3922.0] + - - [340, 256, 49, 18, 340, 340, 18, 18] + - [31, 4342.0] + - - [340, 256, 64, 18, 340, 340, 18, 18] + - [4, 4545.0] + - - [340, 256, 36, 1800, 340, 340, 1800, 1800] + - [23, 17932.0] + - - [340, 256, 36, 19, 340, 340, 19, 19] + - [40, 4193.0] + - - [340, 256, 49, 19, 340, 340, 19, 19] + - [40, 4568.0] + - - [340, 256, 64, 19, 340, 340, 19, 19] + - [4, 4768.0] + - - [340, 256, 36, 1900, 340, 340, 1900, 1900] + - [5, 17949.0] + - - [340, 256, 64, 5880, 340, 340, 5880, 5880] + - [25, 18963.0] + - - [340, 256, 49, 7680, 340, 340, 7680, 7680] + - [27, 14460.0] + - - [340, 256, 64, 882, 340, 340, 882, 882] + - [7, 18042.0] + - - [340, 256, 64, 931, 340, 340, 931, 931] + - [25, 18014.0] + - - [510, 256, 49, 120, 510, 510, 120, 120] + - [42, 14941.0] + - - [510, 256, 64, 120, 510, 510, 120, 120] + - [24, 15479.0] + - - [510, 256, 49, 18, 510, 510, 18, 18] + - [20, 3823.0] + - - [510, 256, 64, 18, 510, 510, 18, 18] + - [18, 3915.0] + - - [510, 256, 49, 19, 510, 510, 19, 19] + - [20, 4033.0] + - - [510, 256, 64, 19, 510, 510, 19, 19] + - [33, 4098.0] + - - [510, 256, 36, 480, 510, 510, 480, 480] + - [23, 19312.0] + - - [510, 256, 36, 72, 510, 510, 72, 72] + - [42, 10723.0] + - - [510, 256, 36, 76, 510, 510, 76, 76] + - [33, 11094.0] + - - [510, 512, 36, 1080, 510, 510, 1080, 1080] + - [43, 20713.0] + - - [510, 512, 36, 162, 510, 510, 162, 162] + - [43, 16989.0] + - - [510, 512, 36, 171, 510, 510, 171, 171] + - [32, 17672.0] + - - [510, 512, 49, 1920, 510, 510, 1920, 1920] + - [43, 21523.0] + - - [510, 512, 64, 1920, 510, 510, 1920, 1920] + - [25, 21671.0] + - - [510, 512, 49, 288, 510, 510, 288, 288] + - [41, 19923.0] + - - [510, 512, 64, 288, 510, 510, 288, 288] + - [25, 20425.0] + - - [510, 512, 36, 3000, 510, 510, 3000, 3000] + - [25, 21320.0] + - - [510, 512, 49, 304, 510, 510, 304, 304] + - [43, 20120.0] + - - [510, 512, 64, 304, 510, 510, 304, 304] + - [25, 20500.0] + - - [510, 512, 36, 450, 510, 510, 450, 450] + - [41, 20123.0] + - - [510, 512, 36, 475, 510, 510, 475, 475] + - [23, 20166.0] + - - [510, 512, 49, 480, 510, 510, 480, 480] + - [25, 20792.0] + - - [510, 512, 64, 480, 510, 510, 480, 480] + - [43, 20913.0] + - - [510, 512, 49, 72, 510, 510, 72, 72] + - [41, 12968.0] + - - [510, 512, 64, 72, 510, 510, 72, 72] + - [49, 13633.0] + - - [510, 512, 49, 76, 510, 510, 76, 76] + - [23, 13801.0] + - - [510, 512, 64, 76, 510, 510, 76, 76] + - [23, 14204.0] + - - [1024, 1024, 160, 96, 1024, 1024, 96, 96] + - [5, 19987.0] + - - [2880, 16384, 1, 1920, 2880, 2880, 1920, 1920] + - [43, 21790.0] + - - [1920, 16384, 1, 960, 1920, 1920, 960, 960] + - [21, 22071.0] + - - [3840, 16384, 1, 1920, 3840, 3840, 1920, 1920] + - [7, 22351.0] + - - [1920, 16384, 1, 3840, 1920, 1920, 3840, 3840] + - [15, 21995.0] + - - [25216, 16384, 1, 1920, 25216, 25216, 1920, 1920] + - [43, 22164.0] + - - [1024, 1024, 40, 96, 1024, 1024, 96, 96] + - [41, 19299.0] + - - [2880, 4096, 1, 1920, 2880, 2880, 1920, 1920] + - [3, 21370.0] + - - [1920, 4096, 1, 960, 1920, 1920, 960, 960] + - [21, 21581.0] + - - [3840, 4096, 1, 1920, 3840, 3840, 1920, 1920] + - [3, 22256.0] + - - [1920, 4096, 1, 3840, 1920, 1920, 3840, 3840] + - [43, 20977.0] + - - [25216, 4096, 1, 1920, 25216, 25216, 1920, 1920] + - [51, 22343.0] + - - [1024, 1024, 80, 96, 1024, 1024, 96, 96] + - [5, 19710.0] + - - [2880, 8192, 1, 1920, 2880, 2880, 1920, 1920] + - [43, 21685.0] + - - [1920, 8192, 1, 960, 1920, 1920, 960, 960] + - [3, 22131.0] + - - [3840, 8192, 1, 1920, 3840, 3840, 1920, 1920] + - [7, 22252.0] + - - [1920, 8192, 1, 3840, 1920, 1920, 3840, 3840] + - [15, 21707.0] + - - [25216, 8192, 1, 1920, 25216, 25216, 1920, 1920] + - [7, 22144.0] + - - [1024, 1024, 96, 96, 1024, 1024, 96, 96] + - [5, 19746.0] + - - [1728, 16384, 1, 2304, 1728, 1728, 2304, 2304] + - [7, 21331.0] + - - [2304, 16384, 1, 576, 2304, 2304, 576, 576] + - [21, 22380.0] + - - [2304, 16384, 1, 2304, 2304, 2304, 2304, 2304] + - [7, 22190.0] + - - [12672, 16384, 1, 2304, 12672, 12672, 2304, 2304] + - [15, 22233.0] + - - [1024, 1024, 24, 96, 1024, 1024, 96, 96] + - [5, 18889.0] + - - [1728, 4096, 1, 2304, 1728, 1728, 2304, 2304] + - [15, 19764.0] + - - [2304, 4096, 1, 576, 2304, 2304, 576, 576] + - [19, 22006.0] + - - [2304, 4096, 1, 2304, 2304, 2304, 2304, 2304] + - [7, 21387.0] + - - [12672, 4096, 1, 2304, 12672, 12672, 2304, 2304] + - [15, 22037.0] + - - [1024, 1024, 48, 96, 1024, 1024, 96, 96] + - [5, 19688.0] + - - [1728, 8192, 1, 2304, 1728, 1728, 2304, 2304] + - [7, 21013.0] + - - [2304, 8192, 1, 576, 2304, 2304, 576, 576] + - [19, 22177.0] + - - [2304, 8192, 1, 2304, 2304, 2304, 2304, 2304] + - [7, 22050.0] + - - [12672, 8192, 1, 2304, 12672, 12672, 2304, 2304] + - [15, 21563.0] + - - [1024, 1024, 16, 96, 1024, 1024, 96, 96] + - [13, 18278.0] + - - [1152, 4096, 1, 3072, 1152, 1152, 3072, 3072] + - [51, 19579.0] + - - [3072, 4096, 1, 384, 3072, 3072, 384, 384] + - [5, 20723.0] + - - [1536, 4096, 1, 3072, 1536, 1536, 3072, 3072] + - [15, 20508.0] + - - [3072, 4096, 1, 1536, 3072, 3072, 1536, 1536] + - [15, 21281.0] + - - [6400, 4096, 1, 3072, 6400, 6400, 3072, 3072] + - [15, 21836.0] + - - [1024, 1024, 32, 96, 1024, 1024, 96, 96] + - [5, 19056.0] + - - [1152, 8192, 1, 3072, 1152, 1152, 3072, 3072] + - [51, 18289.0] + - - [3072, 8192, 1, 384, 3072, 3072, 384, 384] + - [7, 21372.0] + - - [1536, 8192, 1, 3072, 1536, 1536, 3072, 3072] + - [34, 17915.0] + - - [3072, 8192, 1, 1536, 3072, 3072, 1536, 1536] + - [15, 21674.0] + - - [6400, 8192, 1, 3072, 6400, 6400, 3072, 3072] + - [51, 21405.0] + - - [2048, 4096, 1, 2048, 2048, 2048, 2048, 2048] + - [15, 20392.0] + - - [2048, 4096, 1, 4096, 2048, 2048, 4096, 4096] + - [15, 20545.0] + - - [29000, 199, 1, 2048, 29000, 29000, 2048, 2048] + - [13, 15280.0] + - - [29000, 221, 1, 2048, 29000, 29000, 2048, 2048] + - [51, 16482.0] + - - [29000, 224, 1, 2048, 29000, 29000, 2048, 2048] + - [13, 16893.0] + - - [29000, 229, 1, 2048, 29000, 29000, 2048, 2048] + - [51, 17049.0] + - - [29000, 234, 1, 2048, 29000, 29000, 2048, 2048] + - [34, 17368.0] + - - [29000, 242, 1, 2048, 29000, 29000, 2048, 2048] + - [15, 17878.0] + - - [29000, 246, 1, 2048, 29000, 29000, 2048, 2048] + - [15, 18203.0] + - - [29000, 247, 1, 2048, 29000, 29000, 2048, 2048] + - [13, 18329.0] + - - [29000, 256, 1, 2048, 29000, 29000, 2048, 2048] + - [51, 19544.0] + - - [29000, 262, 1, 2048, 29000, 29000, 2048, 2048] + - [13, 13194.0] + - - [29000, 264, 1, 2048, 29000, 29000, 2048, 2048] + - [13, 13366.0] + - - [29000, 265, 1, 2048, 29000, 29000, 2048, 2048] + - [13, 13501.0] + - - [29000, 274, 1, 2048, 29000, 29000, 2048, 2048] + - [13, 13822.0] + - - [29000, 277, 1, 2048, 29000, 29000, 2048, 2048] + - [13, 13959.0] + - - [29000, 279, 1, 2048, 29000, 29000, 2048, 2048] + - [13, 14148.0] + - - [29000, 288, 1, 2048, 29000, 29000, 2048, 2048] + - [13, 14544.0] + - - [29000, 296, 1, 2048, 29000, 29000, 2048, 2048] + - [13, 14804.0] + - - [29000, 315, 1, 2048, 29000, 29000, 2048, 2048] + - [13, 15604.0] + - - [29000, 335, 1, 2048, 29000, 29000, 2048, 2048] + - [13, 16641.0] + - - [4096, 4096, 1, 2048, 4096, 4096, 2048, 2048] + - [15, 20865.0] + - - [29000, 2283, 1, 1024, 29000, 29000, 1024, 1024] + - [51, 21147.0] + - - [29000, 2296, 1, 1024, 29000, 29000, 1024, 1024] + - [51, 21286.0] + - - [29000, 2306, 1, 1024, 29000, 29000, 1024, 1024] + - [13, 20009.0] + - - [29000, 2309, 1, 1024, 29000, 29000, 1024, 1024] + - [13, 20023.0] + - - [29000, 2318, 1, 1024, 29000, 29000, 1024, 1024] + - [13, 20122.0] + - - [29000, 2320, 1, 1024, 29000, 29000, 1024, 1024] + - [13, 20162.0] + - - [29000, 2324, 1, 1024, 29000, 29000, 1024, 1024] + - [13, 20211.0] + - - [29000, 2325, 1, 1024, 29000, 29000, 1024, 1024] + - [13, 20121.0] + - - [29000, 2329, 1, 1024, 29000, 29000, 1024, 1024] + - [13, 20232.0] + - - [29000, 2338, 1, 1024, 29000, 29000, 1024, 1024] + - [13, 20290.0] + - - [29000, 2345, 1, 1024, 29000, 29000, 1024, 1024] + - [13, 20491.0] + - - [29000, 2350, 1, 1024, 29000, 29000, 1024, 1024] + - [13, 20416.0] + - - [29000, 2362, 1, 1024, 29000, 29000, 1024, 1024] + - [13, 20454.0] + - - [29000, 2366, 1, 1024, 29000, 29000, 1024, 1024] + - [13, 20557.0] + - - [29000, 2368, 1, 1024, 29000, 29000, 1024, 1024] + - [13, 20556.0] + - - [29000, 2374, 1, 1024, 29000, 29000, 1024, 1024] + - [13, 20572.0] + - - [29000, 2390, 1, 1024, 29000, 29000, 1024, 1024] + - [13, 20731.0] + - - [512, 512, 320, 64, 512, 512, 64, 64] + - [13, 15968.0] + - - [29000, 561, 1, 1024, 29000, 29000, 1024, 1024] + - [49, 18315.0] + - - [29000, 574, 1, 1024, 29000, 29000, 1024, 1024] + - [49, 18689.0] + - - [29000, 600, 1, 1024, 29000, 29000, 1024, 1024] + - [49, 19404.0] + - - [29000, 608, 1, 1024, 29000, 29000, 1024, 1024] + - [13, 19671.0] + - - [29000, 615, 1, 1024, 29000, 29000, 1024, 1024] + - [49, 19878.0] + - - [29000, 622, 1, 1024, 29000, 29000, 1024, 1024] + - [49, 20031.0] + - - [29000, 625, 1, 1024, 29000, 29000, 1024, 1024] + - [13, 20179.0] + - - [29000, 626, 1, 1024, 29000, 29000, 1024, 1024] + - [49, 20112.0] + - - [29000, 628, 1, 1024, 29000, 29000, 1024, 1024] + - [49, 20159.0] + - - [29000, 636, 1, 1024, 29000, 29000, 1024, 1024] + - [13, 20436.0] + - - [29000, 651, 1, 1024, 29000, 29000, 1024, 1024] + - [34, 18072.0] + - - [29000, 658, 1, 1024, 29000, 29000, 1024, 1024] + - [34, 18233.0] + - - [29000, 669, 1, 1024, 29000, 29000, 1024, 1024] + - [34, 18494.0] + - - [29000, 670, 1, 1024, 29000, 29000, 1024, 1024] + - [34, 18553.0] + - - [29000, 672, 1, 1024, 29000, 29000, 1024, 1024] + - [15, 18540.0] + - - [29000, 684, 1, 1024, 29000, 29000, 1024, 1024] + - [34, 18885.0] + - - [29000, 716, 1, 1024, 29000, 29000, 1024, 1024] + - [34, 19717.0] + - - [29000, 730, 1, 1024, 29000, 29000, 1024, 1024] + - [34, 20187.0] + - - [2560, 1024, 1, 2560, 2560, 2560, 2560, 2560] + - [15, 19965.0] + - - [2560, 1024, 1, 4096, 2560, 2560, 4096, 4096] + - [51, 20356.0] + - - [4096, 1024, 1, 2560, 4096, 4096, 2560, 2560] + - [15, 19074.0] + - - [1024, 1024, 512, 64, 1024, 1024, 64, 64] + - [13, 17641.0] + - - [1024, 32768, 1, 4096, 1024, 1024, 4096, 4096] + - [34, 18883.0] + - - [3072, 32768, 1, 1024, 3072, 3072, 1024, 1024] + - [15, 21395.0] + - - [4096, 32768, 1, 1024, 4096, 4096, 1024, 1024] + - [15, 21326.0] + - - [50304, 32768, 1, 1024, 50304, 50304, 1024, 1024] + - [51, 21599.0] + - - [1024, 1024, 24, 128, 1024, 1024, 128, 128] + - [5, 19594.0] + - - [128, 1024, 24, 1024, 128, 128, 1024, 1024] + - [15, 18831.0] + - - [128, 128, 49, 12800, 128, 128, 12800, 12800] + - [59, 13782.0] + - - [128, 128, 25, 25088, 128, 128, 25088, 25088] + - [60, 13426.0] + - - [128, 128, 49, 25600, 128, 128, 25600, 25600] + - [56, 13014.0] + - - [128, 128, 25, 50176, 128, 128, 50176, 50176] + - [57, 12367.0] + - - [128, 128, 36, 12544, 128, 128, 12544, 12544] + - [56, 13468.0] + - - [128, 128, 49, 9216, 128, 128, 9216, 9216] + - [59, 12812.0] + - - [1024, 1024, 1, 12544, 1024, 1024, 12544, 12544] + - [55, 19082.0] + - - [1024, 1000, 1, 12544, 1024, 1024, 12544, 12544] + - [55, 17804.0] + - - [128, 128, 36, 12000, 128, 128, 12000, 12000] + - [58, 16977.0] + - - [5888, 128, 1, 1, 5888, 5888, 1, 1] + - [64, 227.0] + - - [1856, 256, 1, 1, 1856, 1856, 1, 1] + - [76, 212.0] + - - [256, 1856, 1, 32, 256, 256, 32, 32] + - [94, 3456.0] + - - [128, 3584, 1, 1280, 128, 128, 1280, 1280] + - [80, 13321.0] + - - [2944, 128, 1, 32, 2944, 2944, 32, 32] + - [61, 3792.0] + - - [64, 6784, 1, 1, 64, 64, 1, 1] + - [61, 157.0] + - - [64, 5056, 1, 3328, 64, 64, 3328, 3328] + - [72, 11918.0] + - - [704, 1024, 1, 1, 704, 704, 1, 1] + - [87, 145.0] + - - [256, 1856, 1, 1280, 256, 256, 1280, 1280] + - [88, 13181.0] + - - [256, 1408, 1, 1, 256, 256, 1, 1] + - [61, 80.0] + - - [1024, 1024, 1, 1280, 1024, 1024, 1280, 1280] + - [66, 13814.0] + - - [704, 1408, 1, 3328, 704, 704, 3328, 3328] + - [89, 13447.0] + - - [1408, 704, 1, 256, 1408, 1408, 256, 256] + - [80, 11179.0] + - - [6784, 128, 1, 3328, 6784, 6784, 3328, 3328] + - [98, 15609.0] + - - [2944, 256, 1, 1, 2944, 2944, 1, 1] + - [62, 159.0] + - - [2944, 256, 1, 32, 2944, 2944, 32, 32] + - [78, 4047.0] + - - [128, 4288, 1, 3328, 128, 128, 3328, 3328] + - [104, 13895.0] + - - [5056, 128, 1, 256, 5056, 5056, 256, 256] + - [104, 10240.0] + - - [1856, 704, 1, 1280, 1856, 1856, 1280, 1280] + - [98, 13756.0] + - - [2368, 256, 1, 32, 2368, 2368, 32, 32] + - [100, 3277.0] + - - [5056, 64, 1, 32, 5056, 5056, 32, 32] + - [61, 1842.0] + - - [64, 6784, 1, 3328, 64, 64, 3328, 3328] + - [79, 11198.0] + - - [2944, 256, 1, 1280, 2944, 2944, 1280, 1280] + - [97, 13432.0] + - - [1024, 1024, 1, 3328, 1024, 1024, 3328, 3328] + - [81, 14197.0] + - - [5888, 64, 1, 256, 5888, 5888, 256, 256] + - [88, 7632.0] + - - [2944, 448, 1, 256, 2944, 2944, 256, 256] + - [80, 12111.0] + - - [5056, 64, 1, 3328, 5056, 5056, 3328, 3328] + - [88, 12839.0] + - - [1024, 448, 1, 32, 1024, 1024, 32, 32] + - [100, 4343.0] + - - [128, 2944, 1, 3328, 128, 128, 3328, 3328] + - [97, 11337.0] + - - [256, 1856, 1, 1, 256, 256, 1, 1] + - [62, 91.0] + - - [256, 3584, 1, 3328, 256, 256, 3328, 3328] + - [66, 16206.0] + - - [256, 4288, 1, 1280, 256, 256, 1280, 1280] + - [97, 14352.0] + - - [4288, 256, 1, 256, 4288, 4288, 256, 256] + - [73, 11584.0] + - - [128, 5888, 1, 32, 128, 128, 32, 32] + - [97, 4962.0] + - - [128, 5888, 1, 1280, 128, 128, 1280, 1280] + - [73, 13455.0] + - - [3584, 256, 1, 256, 3584, 3584, 256, 256] + - [81, 12520.0] + - - [1856, 256, 1, 256, 1856, 1856, 256, 256] + - [73, 8590.0] + - - [1024, 704, 1, 1, 1024, 1024, 1, 1] + - [85, 141.0] + - - [448, 1408, 1, 3328, 448, 448, 3328, 3328] + - [96, 11934.0] + - - [1024, 704, 1, 32, 1024, 1024, 32, 32] + - [94, 3845.0] + - - [448, 2944, 1, 256, 448, 448, 256, 256] + - [98, 11716.0] + - - [5888, 128, 1, 3328, 5888, 5888, 3328, 3328] + - [80, 13919.0] + - - [2944, 448, 1, 1, 2944, 2944, 1, 1] + - [101, 191.0] + - - [5056, 64, 1, 1280, 5056, 5056, 1280, 1280] + - [88, 10992.0] + - - [704, 704, 1, 32, 704, 704, 32, 32] + - [61, 2661.0] + - - [256, 4288, 1, 256, 256, 256, 256, 256] + - [80, 12020.0] + - - [5056, 128, 1, 1, 5056, 5056, 1, 1] + - [87, 129.0] + - - [704, 448, 1, 1280, 704, 704, 1280, 1280] + - [96, 10754.0] + - - [1024, 704, 1, 1280, 1024, 1024, 1280, 1280] + - [97, 13338.0] + - - [2368, 448, 1, 256, 2368, 2368, 256, 256] + - [98, 11204.0] + - - [4288, 256, 1, 3328, 4288, 4288, 3328, 3328] + - [81, 14851.0] + - - [128, 6784, 1, 32, 128, 128, 32, 32] + - [70, 4439.0] + - - [128, 6784, 1, 3328, 128, 128, 3328, 3328] + - [104, 15258.0] + - - [4288, 128, 1, 1, 4288, 4288, 1, 1] + - [88, 120.0] + - - [256, 2368, 1, 32, 256, 256, 32, 32] + - [93, 3288.0] + - - [448, 1024, 1, 256, 448, 448, 256, 256] + - [72, 8329.0] + - - [256, 1408, 1, 32, 256, 256, 32, 32] + - [92, 2958.0] + - - [256, 3584, 1, 32, 256, 256, 32, 32] + - [61, 4616.0] + - - [128, 4288, 1, 32, 128, 128, 32, 32] + - [69, 2806.0] + - - [448, 1856, 1, 1, 448, 448, 1, 1] + - [69, 158.0] + - - [448, 1856, 1, 32, 448, 448, 32, 32] + - [61, 4264.0] + - - [448, 1856, 1, 3328, 448, 448, 3328, 3328] + - [66, 14617.0] + - - [1024, 448, 1, 256, 1024, 1024, 256, 256] + - [80, 8979.0] + - - [704, 1856, 1, 32, 704, 704, 32, 32] + - [61, 5162.0] + - - [704, 1408, 1, 32, 704, 704, 32, 32] + - [61, 4850.0] + - - [5888, 128, 1, 32, 5888, 5888, 32, 32] + - [61, 3816.0] + - - [128, 4288, 1, 1280, 128, 128, 1280, 1280] + - [74, 12991.0] + - - [1856, 704, 1, 3328, 1856, 1856, 3328, 3328] + - [81, 14186.0] + - - [4288, 128, 1, 256, 4288, 4288, 256, 256] + - [71, 8652.0] + - - [704, 1856, 1, 3328, 704, 704, 3328, 3328] + - [81, 14195.0] + - - [2944, 128, 1, 1280, 2944, 2944, 1280, 1280] + - [96, 11212.0] + - - [1408, 448, 1, 1280, 1408, 1408, 1280, 1280] + - [103, 13524.0] + - - [128, 2368, 1, 1, 128, 128, 1, 1] + - [83, 64.0] + - - [128, 2944, 1, 1280, 128, 128, 1280, 1280] + - [64, 11145.0] + - - [1024, 448, 1, 1, 1024, 1024, 1, 1] + - [85, 98.0] + - - [256, 2944, 1, 256, 256, 256, 256, 256] + - [104, 11567.0] + - - [704, 448, 1, 32, 704, 704, 32, 32] + - [93, 1835.0] + - - [704, 1024, 1, 256, 704, 704, 256, 256] + - [73, 10019.0] + - - [1408, 448, 1, 3328, 1408, 1408, 3328, 3328] + - [103, 14240.0] + - - [256, 2368, 1, 1, 256, 256, 1, 1] + - [61, 126.0] + - - [5888, 64, 1, 3328, 5888, 5888, 3328, 3328] + - [96, 11459.0] + - - [704, 448, 1, 3328, 704, 704, 3328, 3328] + - [64, 11715.0] + - - [4288, 256, 1, 1, 4288, 4288, 1, 1] + - [61, 193.0] + - - [1856, 448, 1, 3328, 1856, 1856, 3328, 3328] + - [74, 14685.0] + - - [4288, 256, 1, 1280, 4288, 4288, 1280, 1280] + - [81, 14193.0] + - - [448, 2368, 1, 1280, 448, 448, 1280, 1280] + - [66, 13741.0] + - - [3584, 256, 1, 1, 3584, 3584, 1, 1] + - [81, 174.0] + - - [2368, 448, 1, 32, 2368, 2368, 32, 32] + - [84, 5067.0] + - - [1408, 704, 1, 1, 1408, 1408, 1, 1] + - [101, 185.0] + - - [2368, 256, 1, 256, 2368, 2368, 256, 256] + - [71, 10183.0] + - - [1856, 256, 1, 1280, 1856, 1856, 1280, 1280] + - [73, 12923.0] + - - [256, 2944, 1, 1, 256, 256, 1, 1] + - [86, 253.0] + - - [6784, 64, 1, 1, 6784, 6784, 1, 1] + - [88, 158.0] + - - [6784, 64, 1, 256, 6784, 6784, 256, 256] + - [73, 8446.0] + - - [448, 2368, 1, 256, 448, 448, 256, 256] + - [66, 11040.0] + - - [128, 2368, 1, 3328, 128, 128, 3328, 3328] + - [74, 12893.0] + - - [64, 5056, 1, 256, 64, 64, 256, 256] + - [72, 7032.0] + - - [2368, 448, 1, 3328, 2368, 2368, 3328, 3328] + - [81, 14281.0] + - - [256, 2368, 1, 3328, 256, 256, 3328, 3328] + - [66, 15498.0] + - - [5888, 64, 1, 1, 5888, 5888, 1, 1] + - [61, 81.0] + - - [256, 3584, 1, 1, 256, 256, 1, 1] + - [65, 172.0] + - - [704, 1856, 1, 1280, 704, 704, 1280, 1280] + - [66, 13768.0] + - - [448, 1024, 1, 3328, 448, 448, 3328, 3328] + - [66, 11896.0] + - - [128, 5056, 1, 32, 128, 128, 32, 32] + - [100, 3287.0] + - - [128, 5056, 1, 1280, 128, 128, 1280, 1280] + - [104, 15250.0] + - - [5888, 64, 1, 32, 5888, 5888, 32, 32] + - [94, 2250.0] + - - [2368, 256, 1, 1, 2368, 2368, 1, 1] + - [79, 128.0] + - - [5888, 64, 1, 1280, 5888, 5888, 1280, 1280] + - [103, 10545.0] + - - [256, 1408, 1, 256, 256, 256, 256, 256] + - [87, 7833.0] + - - [5056, 64, 1, 1, 5056, 5056, 1, 1] + - [69, 72.0] + - - [1408, 448, 1, 32, 1408, 1408, 32, 32] + - [84, 3456.0] + - - [5056, 128, 1, 1280, 5056, 5056, 1280, 1280] + - [74, 15284.0] + - - [1856, 704, 1, 256, 1856, 1856, 256, 256] + - [81, 11687.0] + - - [128, 6784, 1, 256, 128, 128, 256, 256] + - [74, 11850.0] + - - [256, 3584, 1, 256, 256, 256, 256, 256] + - [98, 12349.0] + - - [448, 704, 1, 1, 448, 448, 1, 1] + - [69, 69.0] + - - [448, 704, 1, 32, 448, 448, 32, 32] + - [61, 1848.0] + - - [448, 704, 1, 3328, 448, 448, 3328, 3328] + - [79, 11722.0] + - - [64, 5888, 1, 1, 64, 64, 1, 1] + - [61, 74.0] + - - [2368, 128, 1, 32, 2368, 2368, 32, 32] + - [69, 1816.0] + - - [2368, 256, 1, 1280, 2368, 2368, 1280, 1280] + - [74, 14306.0] + - - [2368, 128, 1, 3328, 2368, 2368, 3328, 3328] + - [74, 13131.0] + - - [4288, 256, 1, 32, 4288, 4288, 32, 32] + - [76, 5166.0] + - - [448, 1408, 1, 1, 448, 448, 1, 1] + - [61, 127.0] + - - [1408, 256, 1, 256, 1408, 1408, 256, 256] + - [80, 7382.0] + - - [256, 4288, 1, 32, 256, 256, 32, 32] + - [101, 5274.0] + - - [1408, 256, 1, 1280, 1408, 1408, 1280, 1280] + - [97, 10189.0] + - - [448, 1408, 1, 256, 448, 448, 256, 256] + - [96, 9544.0] + - - [128, 2944, 1, 32, 128, 128, 32, 32] + - [78, 2030.0] + - - [1856, 448, 1, 1, 1856, 1856, 1, 1] + - [76, 168.0] + - - [704, 704, 1, 1, 704, 704, 1, 1] + - [69, 105.0] + - - [1856, 448, 1, 1280, 1856, 1856, 1280, 1280] + - [81, 13909.0] + - - [128, 5888, 1, 256, 128, 128, 256, 256] + - [97, 10900.0] + - - [3584, 256, 1, 3328, 3584, 3584, 3328, 3328] + - [81, 16194.0] + - - [448, 2368, 1, 1, 448, 448, 1, 1] + - [98, 191.0] + - - [128, 6784, 1, 1, 128, 128, 1, 1] + - [92, 210.0] + - - [256, 2944, 1, 3328, 256, 256, 3328, 3328] + - [65, 14015.0] + - - [64, 5888, 1, 256, 64, 64, 256, 256] + - [72, 7146.0] + - - [704, 704, 1, 256, 704, 704, 256, 256] + - [72, 8986.0] + - - [448, 1024, 1, 32, 448, 448, 32, 32] + - [93, 2505.0] + - - [256, 2368, 1, 256, 256, 256, 256, 256] + - [73, 10444.0] + - - [448, 704, 1, 1280, 448, 448, 1280, 1280] + - [79, 10737.0] + - - [704, 1856, 1, 1, 704, 704, 1, 1] + - [69, 189.0] + - - [704, 448, 1, 256, 704, 704, 256, 256] + - [102, 7300.0] + - - [2368, 448, 1, 1280, 2368, 2368, 1280, 1280] + - [98, 13705.0] + - - [128, 5056, 1, 1, 128, 128, 1, 1] + - [64, 127.0] + - - [256, 2368, 1, 1280, 256, 256, 1280, 1280] + - [74, 14353.0] + - - [64, 6784, 1, 256, 64, 64, 256, 256] + - [72, 7794.0] + - - [128, 3584, 1, 256, 128, 128, 256, 256] + - [73, 9076.0] + - - [704, 1408, 1, 1, 704, 704, 1, 1] + - [61, 185.0] + - - [4288, 128, 1, 3328, 4288, 4288, 3328, 3328] + - [74, 14006.0] + - - [128, 6784, 1, 1280, 128, 128, 1280, 1280] + - [89, 14663.0] + - - [3584, 256, 1, 32, 3584, 3584, 32, 32] + - [95, 4646.0] + - - [1408, 256, 1, 32, 1408, 1408, 32, 32] + - [73, 3117.0] + - - [5888, 128, 1, 256, 5888, 5888, 256, 256] + - [80, 10767.0] + - - [128, 5056, 1, 3328, 128, 128, 3328, 3328] + - [81, 16411.0] + - - [1024, 448, 1, 3328, 1024, 1024, 3328, 3328] + - [73, 13428.0] + - - [3584, 128, 1, 1, 3584, 3584, 1, 1] + - [61, 94.0] + - - [128, 2368, 1, 256, 128, 128, 256, 256] + - [80, 7362.0] + - - [448, 1856, 1, 256, 448, 448, 256, 256] + - [66, 10622.0] + - - [3584, 128, 1, 256, 3584, 3584, 256, 256] + - [103, 8979.0] + - - [1024, 448, 1, 1280, 1024, 1024, 1280, 1280] + - [97, 12694.0] + - - [128, 5888, 1, 1, 128, 128, 1, 1] + - [97, 144.0] + - - [64, 5056, 1, 1, 64, 64, 1, 1] + - [61, 64.0] + - - [1856, 256, 1, 32, 1856, 1856, 32, 32] + - [93, 4607.0] + - - [64, 5056, 1, 32, 64, 64, 32, 32] + - [100, 2784.0] + - - [1408, 704, 1, 32, 1408, 1408, 32, 32] + - [100, 7516.0] + - - [1408, 704, 1, 1280, 1408, 1408, 1280, 1280] + - [98, 13013.0] + - - [1024, 1024, 1, 32, 1024, 1024, 32, 32] + - [77, 4920.0] + - - [5056, 128, 1, 3328, 5056, 5056, 3328, 3328] + - [98, 16371.0] + - - [128, 4288, 1, 1, 128, 128, 1, 1] + - [87, 104.0] + - - [2944, 128, 1, 3328, 2944, 2944, 3328, 3328] + - [73, 11351.0] + - - [5888, 128, 1, 1280, 5888, 5888, 1280, 1280] + - [103, 13825.0] + - - [2944, 128, 1, 256, 2944, 2944, 256, 256] + - [73, 9205.0] + - - [6784, 128, 1, 1, 6784, 6784, 1, 1] + - [69, 284.0] + - - [1408, 256, 1, 3328, 1408, 1408, 3328, 3328] + - [73, 10917.0] + - - [2944, 256, 1, 256, 2944, 2944, 256, 256] + - [97, 10731.0] + - - [6784, 128, 1, 256, 6784, 6784, 256, 256] + - [89, 11774.0] + - - [6784, 64, 1, 1280, 6784, 6784, 1280, 1280] + - [73, 11967.0] + - - [2944, 448, 1, 1280, 2944, 2944, 1280, 1280] + - [81, 13813.0] + - - [704, 448, 1, 1, 704, 704, 1, 1] + - [87, 67.0] + - - [256, 1408, 1, 3328, 256, 256, 3328, 3328] + - [80, 10957.0] + - - [2944, 128, 1, 1, 2944, 2944, 1, 1] + - [76, 165.0] + - - [704, 1024, 1, 32, 704, 704, 32, 32] + - [72, 5915.0] + - - [3584, 256, 1, 1280, 3584, 3584, 1280, 1280] + - [81, 16053.0] + - - [3584, 128, 1, 1280, 3584, 3584, 1280, 1280] + - [73, 13214.0] + - - [256, 1856, 1, 256, 256, 256, 256, 256] + - [88, 11948.0] + - - [256, 2944, 1, 1280, 256, 256, 1280, 1280] + - [65, 14083.0] + - - [2944, 256, 1, 3328, 2944, 2944, 3328, 3328] + - [97, 14204.0] + - - [704, 1024, 1, 3328, 704, 704, 3328, 3328] + - [97, 13133.0] + - - [448, 2944, 1, 1, 448, 448, 1, 1] + - [100, 154.0] + - - [448, 1856, 1, 1280, 448, 448, 1280, 1280] + - [74, 13931.0] + - - [2368, 448, 1, 1, 2368, 2368, 1, 1] + - [69, 203.0] + - - [448, 2944, 1, 32, 448, 448, 32, 32] + - [76, 5342.0] + - - [448, 2944, 1, 1280, 448, 448, 1280, 1280] + - [81, 13863.0] + - - [128, 2368, 1, 1280, 128, 128, 1280, 1280] + - [104, 11540.0] + - - [448, 2944, 1, 3328, 448, 448, 3328, 3328] + - [66, 14584.0] + - - [2368, 128, 1, 1280, 2368, 2368, 1280, 1280] + - [74, 11445.0] + - - [128, 3584, 1, 3328, 128, 128, 3328, 3328] + - [65, 13624.0] + - - [256, 4288, 1, 3328, 256, 256, 3328, 3328] + - [81, 14823.0] + - - [1856, 704, 1, 32, 1856, 1856, 32, 32] + - [102, 5374.0] + - - [2944, 448, 1, 32, 2944, 2944, 32, 32] + - [85, 5397.0] + - - [5056, 128, 1, 32, 5056, 5056, 32, 32] + - [100, 3395.0] + - - [6784, 128, 1, 1280, 6784, 6784, 1280, 1280] + - [89, 15429.0] + - - [1408, 704, 1, 3328, 1408, 1408, 3328, 3328] + - [66, 13600.0] + - - [1856, 704, 1, 1, 1856, 1856, 1, 1] + - [61, 161.0] + - - [256, 1856, 1, 3328, 256, 256, 3328, 3328] + - [97, 14013.0] + - - [4288, 128, 1, 1280, 4288, 4288, 1280, 1280] + - [81, 13049.0] + - - [128, 4288, 1, 256, 128, 128, 256, 256] + - [65, 9256.0] + - - [6784, 128, 1, 32, 6784, 6784, 32, 32] + - [89, 6230.0] + - - [1408, 448, 1, 1, 1408, 1408, 1, 1] + - [82, 182.0] + - - [64, 5056, 1, 1280, 64, 64, 1280, 1280] + - [102, 10719.0] + - - [448, 1408, 1, 32, 448, 448, 32, 32] + - [84, 3456.0] + - - [128, 5056, 1, 256, 128, 128, 256, 256] + - [88, 12943.0] + - - [1024, 1024, 1, 1, 1024, 1024, 1, 1] + - [80, 179.0] + - - [256, 1408, 1, 1280, 256, 256, 1280, 1280] + - [73, 10685.0] + - - [64, 5888, 1, 3328, 64, 64, 3328, 3328] + - [72, 10440.0] + - - [6784, 64, 1, 3328, 6784, 6784, 3328, 3328] + - [88, 13136.0] + - - [2944, 448, 1, 3328, 2944, 2944, 3328, 3328] + - [98, 14317.0] + - - [448, 1408, 1, 1280, 448, 448, 1280, 1280] + - [64, 11651.0] + - - [2368, 128, 1, 1, 2368, 2368, 1, 1] + - [69, 54.0] + - - [5056, 64, 1, 256, 5056, 5056, 256, 256] + - [72, 7331.0] + - - [2368, 128, 1, 256, 2368, 2368, 256, 256] + - [87, 6879.0] + - - [64, 6784, 1, 32, 64, 64, 32, 32] + - [100, 2227.0] + - - [256, 4288, 1, 1, 256, 256, 1, 1] + - [94, 190.0] + - - [128, 2944, 1, 256, 128, 128, 256, 256] + - [65, 7608.0] + - - [3584, 128, 1, 32, 3584, 3584, 32, 32] + - [98, 3707.0] + - - [3584, 128, 1, 3328, 3584, 3584, 3328, 3328] + - [88, 13535.0] + - - [704, 704, 1, 3328, 704, 704, 3328, 3328] + - [104, 12755.0] + - - [128, 2944, 1, 1, 128, 128, 1, 1] + - [78, 80.0] + - - [704, 1408, 1, 1280, 704, 704, 1280, 1280] + - [66, 13061.0] + - - [6784, 64, 1, 32, 6784, 6784, 32, 32] + - [93, 2412.0] + - - [64, 6784, 1, 1280, 64, 64, 1280, 1280] + - [87, 10450.0] + - - [704, 1408, 1, 256, 704, 704, 256, 256] + - [66, 10919.0] + - - [4288, 128, 1, 32, 4288, 4288, 32, 32] + - [69, 3007.0] + - - [448, 704, 1, 256, 448, 448, 256, 256] + - [72, 7158.0] + - - [1856, 256, 1, 3328, 1856, 1856, 3328, 3328] + - [73, 13934.0] + - - [448, 1024, 1, 1280, 448, 448, 1280, 1280] + - [64, 11772.0] + - - [1024, 1024, 1, 256, 1024, 1024, 256, 256] + - [74, 11878.0] + - - [256, 2944, 1, 32, 256, 256, 32, 32] + - [98, 5558.0] + - - [704, 1024, 1, 1280, 704, 704, 1280, 1280] + - [80, 12668.0] + - - [256, 3584, 1, 1280, 256, 256, 1280, 1280] + - [81, 16097.0] + - - [128, 2368, 1, 32, 128, 128, 32, 32] + - [62, 2679.0] + - - [704, 1856, 1, 256, 704, 704, 256, 256] + - [89, 12690.0] + - - [1856, 448, 1, 32, 1856, 1856, 32, 32] + - [79, 6490.0] + - - [1408, 448, 1, 256, 1408, 1408, 256, 256] + - [97, 10610.0] + - - [448, 1024, 1, 1, 448, 448, 1, 1] + - [85, 108.0] + - - [1024, 704, 1, 256, 1024, 1024, 256, 256] + - [80, 10486.0] + - - [64, 5888, 1, 32, 64, 64, 32, 32] + - [84, 1926.0] + - - [1856, 448, 1, 256, 1856, 1856, 256, 256] + - [81, 10675.0] + - - [128, 5888, 1, 3328, 128, 128, 3328, 3328] + - [80, 13934.0] + - - [2368, 256, 1, 3328, 2368, 2368, 3328, 3328] + - [66, 15521.0] + - - [64, 5888, 1, 1280, 64, 64, 1280, 1280] + - [72, 9647.0] + - - [1408, 256, 1, 1, 1408, 1408, 1, 1] + - [61, 77.0] + - - [704, 704, 1, 1280, 704, 704, 1280, 1280] + - [98, 11920.0] + - - [128, 3584, 1, 1, 128, 128, 1, 1] + - [104, 88.0] + - - [128, 3584, 1, 32, 128, 128, 32, 32] + - [84, 2338.0] + - - [448, 2368, 1, 32, 448, 448, 32, 32] + - [69, 4836.0] + - - [448, 2368, 1, 3328, 448, 448, 3328, 3328] + - [98, 14303.0] + - - [1024, 704, 1, 3328, 1024, 1024, 3328, 3328] + - [97, 13317.0] + - - [2048, 400, 1, 512, 2048, 2048, 512, 512] + - [88, 11149.0] + - - [2560, 128, 1, 2560, 2560, 2560, 2560, 2560] + - [104, 13526.0] + - - [1024, 700, 1, 512, 1024, 1024, 512, 512] + - [88, 11334.0] + - - [4096, 128, 1, 4096, 4096, 4096, 4096, 4096] + - [89, 13450.0] + - - [3072, 128, 1, 1024, 3072, 3072, 1024, 1024] + - [103, 10346.0] + - - [7680, 64, 1, 2560, 7680, 7680, 2560, 2560] + - [73, 13192.0] + - - [7680, 128, 1, 2560, 7680, 7680, 2560, 2560] + - [89, 17004.0] + - - [1024, 1024, 1, 1024, 1024, 1024, 1024, 1024] + - [89, 13537.0] + - - [2049, 512, 1, 2048, 2049, 2049, 2048, 2048] + - [89, 13869.0] + - - [1023, 512, 1, 1024, 1023, 1023, 1024, 1024] + - [104, 11830.0] + - - [1024, 512, 1, 1025, 1024, 1024, 1025, 1025] + - [98, 12451.0] + - - [1024, 1024, 1, 1023, 1024, 1024, 1023, 1023] + - [66, 13658.0] + - - [1024, 1025, 1, 1024, 1024, 1024, 1024, 1024] + - [74, 13375.0] + - - [1024, 1023, 1, 1024, 1024, 1024, 1024, 1024] + - [74, 13172.0] + - - [2048, 511, 1, 2048, 2048, 2048, 2048, 2048] + - [74, 13683.0] + - - [2047, 512, 1, 2048, 2047, 2047, 2048, 2048] + - [74, 13726.0] + - - [1025, 1024, 1, 1024, 1025, 1025, 1024, 1024] + - [74, 13455.0] + - - [1024, 1024, 1, 1025, 1024, 1024, 1025, 1025] + - [66, 13636.0] + - - [1025, 512, 1, 1024, 1025, 1025, 1024, 1024] + - [74, 12457.0] + - - [1024, 512, 1, 1023, 1024, 1024, 1023, 1023] + - [81, 12520.0] + - - [2048, 513, 1, 2048, 2048, 2048, 2048, 2048] + - [74, 13747.0] + - - [1024, 511, 1, 1024, 1024, 1024, 1024, 1024] + - [89, 11598.0] + - - [2048, 512, 1, 2047, 2048, 2048, 2047, 2047] + - [81, 14146.0] + - - [1024, 513, 1, 1024, 1024, 1024, 1024, 1024] + - [74, 11964.0] + - - [2048, 512, 1, 2049, 2048, 2048, 2049, 2049] + - [104, 14225.0] + - - [1023, 1024, 1, 1024, 1023, 1023, 1024, 1024] + - [104, 13585.0] + - - [64, 128, 512, 128, 64, 64, 128, 128] + - [67, 8781.0] + - - [64, 512, 64, 512, 64, 64, 512, 512] + - [105, 9095.0] + - - [256, 1280, 1, 1024, 256, 256, 1024, 1024] + - [74, 11941.0] + - - [256, 1536, 1, 1024, 256, 256, 1024, 1024] + - [73, 10607.0] + - - [256, 2304, 1, 1024, 256, 256, 1024, 1024] + - [74, 13789.0] + - - [256, 2560, 1, 1024, 256, 256, 1024, 1024] + - [74, 15322.0] + - - [256, 2816, 1, 1024, 256, 256, 1024, 1024] + - [73, 12436.0] + - - [256, 3328, 1, 1024, 256, 256, 1024, 1024] + - [104, 14195.0] + - - [256, 3584, 1, 1024, 256, 256, 1024, 1024] + - [74, 15198.0] + - - [512, 1600, 1, 512, 512, 512, 512, 512] + - [103, 13010.0] + - - [1024, 512, 1, 1024, 1024, 1024, 1024, 1024] + - [74, 12561.0] + - - [1024, 512, 1, 1600, 1024, 1024, 1600, 1600] + - [81, 13303.0] + - - [1024, 960, 1, 1024, 1024, 1024, 1024, 1024] + - [73, 14150.0] + - - [1024, 960, 1, 1600, 1024, 1024, 1600, 1600] + - [80, 15037.0] + - - [2048, 215, 1, 512, 2048, 2048, 512, 512] + - [89, 8820.0] + - - [2048, 215, 1, 768, 2048, 2048, 768, 768] + - [81, 9830.0] + - - [2048, 256, 1, 512, 2048, 2048, 512, 512] + - [74, 11279.0] + - - [2048, 256, 1, 768, 2048, 2048, 768, 768] + - [89, 12121.0] + - - [2048, 512, 1, 67, 2048, 2048, 67, 67] + - [61, 7273.0] + - - [2048, 512, 1, 74, 2048, 2048, 74, 74] + - [85, 9724.0] + - - [2048, 512, 1, 100, 2048, 2048, 100, 100] + - [93, 8841.0] + - - [2048, 512, 1, 2048, 2048, 2048, 2048, 2048] + - [74, 13867.0] + - - [1024, 512, 1, 4096, 1024, 1024, 4096, 4096] + - [74, 13427.0] + - - [30522, 77, 1, 1024, 30522, 30522, 1024, 1024] + - [104, 10397.0] + - - [1024, 780, 1, 1024, 1024, 1024, 1024, 1024] + - [89, 13070.0] + - - [1024, 800, 1, 1024, 1024, 1024, 1024, 1024] + - [74, 13392.0] + - - [1024, 820, 1, 1024, 1024, 1024, 1024, 1024] + - [89, 13749.0] + - - [1024, 385, 1, 1024, 1024, 1024, 1024, 1024] + - [88, 10389.0] + - - [1024, 462, 1, 1024, 1024, 1024, 1024, 1024] + - [104, 11383.0] + - - [1024, 1024, 1, 4096, 1024, 1024, 4096, 4096] + - [74, 13693.0] + - - [480, 1024, 1, 1024, 480, 480, 1024, 1024] + - [74, 11491.0] + - - [480, 2048, 1, 2048, 480, 480, 2048, 2048] + - [74, 12987.0] + - - [1024, 480, 1, 1024, 1024, 1024, 1024, 1024] + - [89, 11165.0] + - - [2048, 480, 1, 2048, 2048, 2048, 2048, 2048] + - [89, 12949.0] + - - [64, 1024, 256, 1024, 64, 64, 1024, 1024] + - [75, 9993.0] + - - [64, 512, 40, 512, 64, 64, 512, 512] + - [105, 8910.0] + - - [96, 1024, 64, 1024, 96, 96, 1024, 1024] + - [104, 13172.0] + - - [64, 1024, 128, 1024, 64, 64, 1024, 1024] + - [75, 9904.0] + - - [64, 1024, 32, 1024, 64, 64, 1024, 1024] + - [90, 9196.0] + - - [64, 512, 128, 512, 64, 64, 512, 512] + - [90, 9683.0] + - - [96, 1024, 128, 1024, 96, 96, 1024, 1024] + - [89, 13432.0] + - - [64, 512, 256, 512, 64, 64, 512, 512] + - [90, 9710.0] + - - [64, 1024, 64, 1024, 64, 64, 1024, 1024] + - [75, 9587.0] + - - [960, 1024, 1, 1024, 960, 960, 1024, 1024] + - [74, 13228.0] + - - [64, 128, 1024, 128, 64, 64, 128, 128] + - [96, 9701.0] + - - [1024, 864, 1, 1024, 1024, 1024, 1024, 1024] + - [89, 14990.0] + - - [1024, 864, 1, 480, 1024, 1024, 480, 480] + - [98, 14386.0] + - - [128, 3456, 1, 256, 128, 128, 256, 256] + - [73, 10847.0] + - - [128, 4096, 1, 256, 128, 128, 256, 256] + - [74, 11355.0] + - - [128, 6912, 1, 256, 128, 128, 256, 256] + - [74, 11896.0] + - - [256, 3456, 1, 512, 256, 256, 512, 512] + - [104, 13777.0] + - - [256, 4096, 1, 512, 256, 256, 512, 512] + - [74, 12955.0] + - - [512, 864, 1, 1024, 512, 512, 1024, 1024] + - [73, 11382.0] + - - [512, 864, 1, 13, 512, 512, 13, 13] + - [93, 1203.0] + - - [64, 128, 1280, 128, 64, 64, 128, 128] + - [82, 9108.0] + - - [64, 128, 1312, 128, 64, 64, 128, 128] + - [90, 9127.0] + - - [64, 512, 192, 512, 64, 64, 512, 512] + - [75, 9690.0] + - - [1024, 512, 1, 196, 1024, 1024, 196, 196] + - [94, 10235.0] + - - [2048, 512, 1, 49, 2048, 2048, 49, 49] + - [93, 9515.0] + - - [2304, 256, 1, 196, 2304, 2304, 196, 196] + - [77, 12484.0] + - - [512, 1024, 1, 196, 512, 512, 196, 196] + - [62, 10840.0] + - - [512, 2048, 1, 49, 512, 512, 49, 49] + - [100, 8983.0] + - - [64, 128, 2048, 128, 64, 64, 128, 128] + - [105, 9441.0] + - - [64, 128, 1536, 128, 64, 64, 128, 128] + - [96, 10433.0] + - - [128, 128, 64, 6400, 128, 128, 6400, 6400] + - [92, 11265.0] + - - [64, 128, 192, 128, 64, 64, 128, 128] + - [75, 7655.0] + - - [64, 384, 144, 384, 64, 64, 384, 384] + - [79, 14050.0] + - - [64, 512, 48, 512, 64, 64, 512, 512] + - [75, 9000.0] + - - [64, 128, 256, 128, 64, 64, 128, 128] + - [82, 7877.0] + - - [64, 384, 192, 384, 64, 64, 384, 384] + - [79, 10778.0] + - - [512, 1024, 1, 1024, 512, 512, 1024, 1024] + - [74, 12937.0] + - - [512, 2048, 1, 2048, 512, 512, 2048, 2048] + - [74, 13819.0] + - - [128, 128, 49, 1120, 128, 128, 1120, 1120] + - [80, 14354.0] + - - [128, 128, 49, 1064, 128, 128, 1064, 1064] + - [62, 14434.0] + - - [128, 128, 49, 1040, 128, 128, 1040, 1040] + - [65, 14311.0] + - - [128, 128, 64, 600, 128, 128, 600, 600] + - [97, 12999.0] + - - [128, 128, 64, 616, 128, 128, 616, 616] + - [104, 13044.0] + - - [128, 128, 49, 950, 128, 128, 950, 950] + - [80, 14331.0] + - - [128, 128, 49, 972, 128, 128, 972, 972] + - [65, 14580.0] + - - [128, 128, 64, 560, 128, 128, 560, 560] + - [89, 13921.0] + - - [128, 128, 49, 1008, 128, 128, 1008, 1008] + - [65, 14789.0] + - - [128, 128, 64, 532, 128, 128, 532, 532] + - [89, 13613.0] + - - [128, 128, 49, 1080, 128, 128, 1080, 1080] + - [80, 14686.0] + - - [128, 128, 64, 588, 128, 128, 588, 588] + - [104, 13683.0] + - - [128, 128, 49, 1160, 128, 128, 1160, 1160] + - [65, 14721.0] + - - [128, 128, 49, 988, 128, 128, 988, 988] + - [97, 14656.0] + - - [128, 128, 49, 936, 128, 128, 936, 936] + - [65, 14523.0] + - - [512, 1024, 1, 3800, 512, 512, 3800, 3800] + - [66, 13975.0] + - - [512, 1024, 1, 3400, 512, 512, 3400, 3400] + - [81, 13926.0] + - - [512, 1024, 1, 3456, 512, 512, 3456, 3456] + - [66, 13992.0] + - - [512, 1024, 1, 3072, 512, 512, 3072, 3072] + - [74, 13146.0] + - - [2048, 512, 1, 950, 2048, 2048, 950, 950] + - [77, 14247.0] + - - [512, 1024, 1, 3552, 512, 512, 3552, 3552] + - [66, 13791.0] + - - [512, 1024, 1, 3220, 512, 512, 3220, 3220] + - [66, 13643.0] + - - [2048, 512, 1, 850, 2048, 2048, 850, 850] + - [65, 13784.0] + - - [512, 2048, 1, 864, 512, 512, 864, 864] + - [66, 13702.0] + - - [512, 2048, 1, 768, 512, 512, 768, 768] + - [98, 14025.0] + - - [2048, 512, 1, 805, 2048, 2048, 805, 805] + - [77, 14064.0] + - - [512, 1024, 1, 2852, 512, 512, 2852, 2852] + - [81, 13559.0] + - - [512, 2048, 1, 888, 512, 512, 888, 888] + - [94, 14932.0] + - - [2048, 512, 1, 864, 2048, 2048, 864, 864] + - [81, 14120.0] + - - [2048, 512, 1, 768, 2048, 2048, 768, 768] + - [66, 13489.0] + - - [2048, 512, 1, 888, 2048, 2048, 888, 888] + - [77, 14535.0] + - - [2048, 256, 1, 950, 2048, 2048, 950, 950] + - [66, 13163.0] + - - [2048, 512, 1, 713, 2048, 2048, 713, 713] + - [77, 13549.0] + - - [512, 1024, 1, 2688, 512, 512, 2688, 2688] + - [74, 13811.0] + - - [512, 1024, 1, 2640, 512, 512, 2640, 2640] + - [98, 13586.0] + - - [512, 1024, 1, 2904, 512, 512, 2904, 2904] + - [66, 13587.0] + - - [1024, 512, 1, 950, 1024, 1024, 950, 950] + - [98, 12571.0] + - - [512, 2048, 1, 672, 512, 512, 672, 672] + - [94, 14428.0] + - - [512, 2048, 1, 660, 512, 512, 660, 660] + - [94, 14188.0] + - - [512, 2048, 1, 1008, 512, 512, 1008, 1008] + - [77, 15143.0] + - - [2048, 256, 1, 850, 2048, 2048, 850, 850] + - [98, 12263.0] + - - [2048, 512, 1, 726, 2048, 2048, 726, 726] + - [77, 13989.0] + - - [1024, 512, 1, 850, 1024, 1024, 850, 850] + - [81, 12358.0] + - - [2048, 512, 1, 660, 2048, 2048, 660, 660] + - [77, 13902.0] + - - [2048, 512, 1, 672, 2048, 2048, 672, 672] + - [77, 14247.0] + - - [512, 2048, 1, 840, 512, 512, 840, 840] + - [94, 14764.0] + - - [2048, 512, 1, 1008, 2048, 2048, 1008, 1008] + - [94, 14424.0] + - - [512, 2048, 1, 792, 512, 512, 792, 792] + - [66, 13443.0] + - - [1024, 512, 1, 805, 1024, 1024, 805, 805] + - [81, 12121.0] + - - [512, 2048, 1, 1050, 512, 512, 1050, 1050] + - [62, 13937.0] + - - [2048, 512, 1, 748, 2048, 2048, 748, 748] + - [74, 13394.0] + - - [2048, 256, 1, 864, 2048, 2048, 864, 864] + - [98, 12431.0] + - - [1024, 512, 1, 768, 1024, 1024, 768, 768] + - [74, 12143.0] + - - [1024, 512, 1, 864, 1024, 1024, 864, 864] + - [93, 12689.0] + - - [2048, 512, 1, 875, 2048, 2048, 875, 875] + - [77, 14059.0] + - - [2048, 512, 1, 840, 2048, 2048, 840, 840] + - [62, 14369.0] + - - [2048, 512, 1, 792, 2048, 2048, 792, 792] + - [66, 13513.0] + - - [512, 2048, 1, 736, 512, 512, 736, 736] + - [66, 13544.0] + - - [2048, 256, 1, 888, 2048, 2048, 888, 888] + - [61, 13363.0] + - - [512, 2048, 1, 704, 512, 512, 704, 704] + - [94, 14705.0] + - - [512, 2048, 1, 588, 512, 512, 588, 588] + - [94, 14116.0] + - - [1024, 512, 1, 888, 1024, 1024, 888, 888] + - [93, 13034.0] + - - [512, 2048, 1, 816, 512, 512, 816, 816] + - [94, 14656.0] + - - [1024, 512, 1, 713, 1024, 1024, 713, 713] + - [74, 12784.0] + - - [2048, 512, 1, 736, 2048, 2048, 736, 736] + - [63, 13961.0] + - - [2048, 512, 1, 588, 2048, 2048, 588, 588] + - [77, 13439.0] + - - [2048, 512, 1, 704, 2048, 2048, 704, 704] + - [77, 14234.0] + - - [1024, 512, 1, 660, 1024, 1024, 660, 660] + - [62, 11883.0] + - - [2048, 256, 1, 660, 2048, 2048, 660, 660] + - [63, 11867.0] + - - [2048, 256, 1, 672, 2048, 2048, 672, 672] + - [61, 12674.0] + - - [1024, 512, 1, 672, 1024, 1024, 672, 672] + - [77, 12242.0] + - - [1024, 512, 1, 726, 1024, 1024, 726, 726] + - [77, 12000.0] + - - [512, 2048, 1, 630, 512, 512, 630, 630] + - [76, 14973.0] + - - [512, 2048, 1, 600, 512, 512, 600, 600] + - [76, 14202.0] + - - [2048, 256, 1, 805, 2048, 2048, 805, 805] + - [102, 12561.0] + - - [2048, 256, 1, 713, 2048, 2048, 713, 713] + - [81, 12917.0] + - - [2048, 256, 1, 726, 2048, 2048, 726, 726] + - [61, 13555.0] + - - [320, 1024, 1, 1024, 320, 320, 1024, 1024] + - [102, 10565.0] + - - [1024, 1000, 1, 1024, 1024, 1024, 1024, 1024] + - [74, 13382.0] + - - [320, 1000, 1, 1024, 320, 320, 1024, 1024] + - [87, 10101.0] + - - [128, 128, 49, 1280, 128, 128, 1280, 1280] + - [89, 13669.0] + - - [128, 128, 49, 1360, 128, 128, 1360, 1360] + - [77, 14250.0] + - - [128, 128, 49, 1200, 128, 128, 1200, 1200] + - [65, 14383.0] + - - [128, 128, 49, 1240, 128, 128, 1240, 1240] + - [80, 14185.0] + - - [2304, 256, 1, 704, 2304, 2304, 704, 704] + - [77, 13615.0] + - - [2304, 256, 1, 736, 2304, 2304, 736, 736] + - [81, 13617.0] + - - [2304, 256, 1, 792, 2304, 2304, 792, 792] + - [63, 13691.0] + - - [2304, 256, 1, 748, 2304, 2304, 748, 748] + - [63, 13617.0] + - - [2304, 256, 1, 726, 2304, 2304, 726, 726] + - [81, 13415.0] + - - [2304, 256, 1, 713, 2304, 2304, 713, 713] + - [63, 13402.0] + - - [2304, 256, 1, 768, 2304, 2304, 768, 768] + - [81, 13522.0] + - - [512, 2048, 1, 759, 512, 512, 759, 759] + - [77, 14273.0] + - - [512, 2048, 1, 925, 512, 512, 925, 925] + - [94, 14255.0] + - - [2304, 256, 1, 805, 2304, 2304, 805, 805] + - [63, 13636.0] + - - [512, 2048, 1, 900, 512, 512, 900, 900] + - [94, 13977.0] + - - [512, 2048, 1, 875, 512, 512, 875, 875] + - [77, 14124.0] + - - [512, 2048, 1, 748, 512, 512, 748, 748] + - [81, 13389.0] + - - [512, 2048, 1, 726, 512, 512, 726, 726] + - [94, 14288.0] + - - [512, 2048, 1, 713, 512, 512, 713, 713] + - [77, 14053.0] + - - [512, 2048, 1, 805, 512, 512, 805, 805] + - [77, 13874.0] + - - [512, 2048, 1, 850, 512, 512, 850, 850] + - [77, 14366.0] + - - [512, 2048, 1, 950, 512, 512, 950, 950] + - [77, 14251.0] + - - [128, 128, 49, 1152, 128, 128, 1152, 1152] + - [97, 14198.0] + - - [128, 128, 49, 1216, 128, 128, 1216, 1216] + - [97, 14189.0] + - - [128, 128, 36, 1800, 128, 128, 1800, 1800] + - [66, 14596.0] + - - [128, 128, 36, 1900, 128, 128, 1900, 1900] + - [81, 14630.0] + - - [128, 128, 64, 5880, 128, 128, 5880, 5880] + - [74, 14117.0] + - - [128, 128, 49, 7680, 128, 128, 7680, 7680] + - [92, 11103.0] + - - [128, 128, 64, 882, 128, 128, 882, 882] + - [98, 13962.0] + - - [128, 128, 64, 931, 128, 128, 931, 931] + - [66, 14030.0] + - - [128, 64, 121, 1152, 128, 128, 1152, 1152] + - [88, 11897.0] + - - [128, 64, 81, 12000, 128, 128, 12000, 12000] + - [73, 11125.0] + - - [128, 64, 121, 1216, 128, 128, 1216, 1216] + - [97, 13331.0] + - - [128, 64, 81, 1800, 128, 128, 1800, 1800] + - [64, 12567.0] + - - [128, 64, 81, 1900, 128, 128, 1900, 1900] + - [79, 12565.0] + - - [128, 64, 49, 20280, 128, 128, 20280, 20280] + - [97, 10964.0] + - - [128, 64, 49, 3042, 128, 128, 3042, 3042] + - [65, 12114.0] + - - [128, 64, 49, 3211, 128, 128, 3211, 3211] + - [88, 12112.0] + - - [128, 64, 169, 5880, 128, 128, 5880, 5880] + - [106, 10662.0] + - - [128, 64, 121, 7680, 128, 128, 7680, 7680] + - [91, 8921.0] + - - [128, 64, 169, 882, 128, 128, 882, 882] + - [68, 10222.0] + - - [128, 64, 169, 931, 128, 128, 931, 931] + - [99, 10178.0] + - - [256, 128, 25, 1080, 256, 256, 1080, 1080] + - [65, 15011.0] + - - [256, 128, 25, 162, 256, 256, 162, 162] + - [65, 9165.0] + - - [256, 128, 25, 171, 256, 256, 171, 171] + - [80, 9608.0] + - - [1152, 256, 1, 1, 1152, 1152, 1, 1] + - [65, 64.0] + - - [1152, 256, 1, 1444, 1152, 1152, 1444, 1444] + - [81, 11889.0] + - - [1152, 256, 1, 25, 1152, 1152, 25, 25] + - [86, 1321.0] + - - [1152, 256, 1, 9, 1152, 1152, 9, 9] + - [98, 553.0] + - - [2304, 256, 1, 1444, 2304, 2304, 1444, 1444] + - [63, 14700.0] + - - [2304, 340, 1, 1, 2304, 2304, 1, 1] + - [86, 98.0] + - - [2304, 340, 1, 1444, 2304, 2304, 1444, 1444] + - [81, 13292.0] + - - [2304, 340, 1, 9, 2304, 2304, 9, 9] + - [100, 1183.0] + - - [2304, 510, 1, 25, 2304, 2304, 25, 25] + - [69, 3448.0] + - - [96, 1024, 160, 1024, 96, 96, 1024, 1024] + - [74, 13435.0] + - - [96, 1024, 40, 1024, 96, 96, 1024, 1024] + - [74, 13028.0] + - - [96, 1024, 80, 1024, 96, 96, 1024, 1024] + - [74, 13133.0] + - - [96, 1024, 96, 1024, 96, 96, 1024, 1024] + - [89, 13295.0] + - - [96, 1024, 24, 1024, 96, 96, 1024, 1024] + - [74, 12914.0] + - - [96, 1024, 48, 1024, 96, 96, 1024, 1024] + - [74, 12963.0] + - - [96, 1024, 16, 1024, 96, 96, 1024, 1024] + - [74, 12056.0] + - - [96, 1024, 32, 1024, 96, 96, 1024, 1024] + - [74, 13152.0] + - - [64, 512, 320, 512, 64, 64, 512, 512] + - [75, 9790.0] + - - [64, 512, 80, 512, 64, 64, 512, 512] + - [90, 9297.0] + - - [29000, 109, 1, 2560, 29000, 29000, 2560, 2560] + - [89, 14492.0] + - - [29000, 121, 1, 2560, 29000, 29000, 2560, 2560] + - [89, 15916.0] + - - [29000, 65, 1, 2560, 29000, 29000, 2560, 2560] + - [104, 8790.0] + - - [29000, 66, 1, 2560, 29000, 29000, 2560, 2560] + - [74, 8929.0] + - - [29000, 67, 1, 2560, 29000, 29000, 2560, 2560] + - [74, 9047.0] + - - [29000, 69, 1, 2560, 29000, 29000, 2560, 2560] + - [74, 9316.0] + - - [29000, 70, 1, 2560, 29000, 29000, 2560, 2560] + - [74, 9444.0] + - - [29000, 71, 1, 2560, 29000, 29000, 2560, 2560] + - [74, 9581.0] + - - [29000, 73, 1, 2560, 29000, 29000, 2560, 2560] + - [74, 9830.0] + - - [29000, 74, 1, 2560, 29000, 29000, 2560, 2560] + - [104, 9952.0] + - - [29000, 75, 1, 2560, 29000, 29000, 2560, 2560] + - [74, 10116.0] + - - [29000, 77, 1, 2560, 29000, 29000, 2560, 2560] + - [74, 10375.0] + - - [29000, 78, 1, 2560, 29000, 29000, 2560, 2560] + - [104, 10508.0] + - - [29000, 80, 1, 2560, 29000, 29000, 2560, 2560] + - [74, 10780.0] + - - [29000, 81, 1, 2560, 29000, 29000, 2560, 2560] + - [74, 10919.0] + - - [29000, 82, 1, 2560, 29000, 29000, 2560, 2560] + - [74, 11020.0] + - - [29000, 83, 1, 2560, 29000, 29000, 2560, 2560] + - [74, 11172.0] + - - [29000, 84, 1, 2560, 29000, 29000, 2560, 2560] + - [74, 11300.0] + - - [29000, 88, 1, 2560, 29000, 29000, 2560, 2560] + - [74, 11839.0] + - - [29000, 89, 1, 2560, 29000, 29000, 2560, 2560] + - [74, 11965.0] + - - [29000, 90, 1, 2560, 29000, 29000, 2560, 2560] + - [104, 12069.0] + - - [29000, 92, 1, 2560, 29000, 29000, 2560, 2560] + - [74, 12328.0] + - - [29000, 95, 1, 2560, 29000, 29000, 2560, 2560] + - [74, 12676.0] + - - [29000, 98, 1, 2560, 29000, 29000, 2560, 2560] + - [74, 13075.0] + - - [64, 1024, 512, 1024, 64, 64, 1024, 1024] + - [105, 10015.0] + - - [64, 64, 36, 50176, 64, 64, 50176, 50176] + - [113, 6614.0] + - - [64, 64, 49, 36864, 64, 64, 36864, 36864] + - [113, 6189.0] + - - [64, 64, 64, 25600, 64, 64, 25600, 25600] + - [105, 7681.0] + - - [256, 256, 1, 60800, 256, 256, 60800, 60800] + - [107, 12482.0] + - - [256, 256, 1, 54400, 256, 256, 54400, 54400] + - [110, 12388.0] + - - [256, 256, 1, 51520, 256, 256, 51520, 51520] + - [116, 12330.0] + - - [256, 256, 1, 55296, 256, 256, 55296, 55296] + - [114, 9748.0] + - - [256, 256, 1, 56832, 256, 256, 56832, 56832] + - [119, 11089.0] + - - [256, 256, 1, 45632, 256, 256, 45632, 45632] + - [116, 12259.0] + - - [256, 256, 1, 49152, 256, 256, 49152, 49152] + - [121, 9704.0] + - - [256, 512, 1, 13600, 256, 256, 13600, 13600] + - [107, 12767.0] + - - [256, 256, 1, 43008, 256, 256, 43008, 43008] + - [117, 9621.0] + - - [256, 512, 1, 15200, 256, 256, 15200, 15200] + - [107, 13025.0] + - - [256, 512, 1, 12880, 256, 256, 12880, 12880] + - [107, 12602.0] + - - [256, 512, 1, 13824, 256, 256, 13824, 13824] + - [120, 11908.0] + - - [512, 256, 1, 13824, 512, 512, 13824, 13824] + - [108, 12102.0] + - - [256, 512, 1, 14208, 256, 256, 14208, 14208] + - [107, 12679.0] + - - [512, 256, 1, 14208, 512, 512, 14208, 14208] + - [116, 12761.0] + - - [512, 256, 1, 15200, 512, 512, 15200, 15200] + - [110, 12902.0] + - - [256, 512, 1, 12288, 256, 256, 12288, 12288] + - [112, 11378.0] + - - [512, 256, 1, 12288, 512, 512, 12288, 12288] + - [120, 10897.0] + - - [128, 64, 25, 43320, 128, 128, 43320, 43320] + - [109, 12722.0] + - - [64, 64, 64, 20280, 64, 64, 20280, 20280] + - [111, 9276.0] + - - [64, 64, 49, 27000, 64, 64, 27000, 27000] + - [118, 9212.0] + - - [64, 64, 36, 43320, 64, 64, 43320, 43320] + - [115, 9277.0] + - - [32, 5056, 1, 1280, 32, 32, 1280, 1280] + - [147, 4588.0] + - - [4288, 64, 1, 3328, 4288, 4288, 3328, 3328] + - [193, 5778.0] + - - [2368, 64, 1, 1, 2368, 2368, 1, 1] + - [192, 82.0] + - - [1408, 128, 1, 32, 1408, 1408, 32, 32] + - [191, 2465.0] + - - [32, 2944, 1, 3328, 32, 32, 3328, 3328] + - [126, 3997.0] + - - [2368, 32, 1, 256, 2368, 2368, 256, 256] + - [162, 2425.0] + - - [1024, 128, 1, 32, 1024, 1024, 32, 32] + - [179, 989.0] + - - [32, 4288, 1, 1280, 32, 32, 1280, 1280] + - [168, 4417.0] + - - [32, 5056, 1, 32, 32, 32, 32, 32] + - [145, 1116.0] + - - [5888, 32, 1, 32, 5888, 5888, 32, 32] + - [125, 1272.0] + - - [64, 2368, 1, 1280, 64, 64, 1280, 1280] + - [154, 5449.0] + - - [128, 704, 1, 32, 128, 128, 32, 32] + - [165, 700.0] + - - [32, 4288, 1, 3328, 32, 32, 3328, 3328] + - [126, 4231.0] + - - [1408, 64, 1, 1, 1408, 1408, 1, 1] + - [165, 22.0] + - - [1856, 64, 1, 256, 1856, 1856, 256, 256] + - [164, 3188.0] + - - [1024, 256, 1, 256, 1024, 1024, 256, 256] + - [127, 4341.0] + - - [1856, 128, 1, 32, 1856, 1856, 32, 32] + - [193, 1590.0] + - - [1856, 128, 1, 1280, 1856, 1856, 1280, 1280] + - [154, 6356.0] + - - [4288, 32, 1, 3328, 4288, 4288, 3328, 3328] + - [125, 4190.0] + - - [3584, 64, 1, 1280, 3584, 3584, 1280, 1280] + - [141, 5759.0] + - - [64, 1856, 1, 256, 64, 64, 256, 256] + - [191, 3242.0] + - - [3584, 64, 1, 32, 3584, 3584, 32, 32] + - [137, 2294.0] + - - [1408, 128, 1, 3328, 1408, 1408, 3328, 3328] + - [152, 5174.0] + - - [32, 6784, 1, 3328, 32, 32, 3328, 3328] + - [126, 4323.0] + - - [32, 3584, 1, 256, 32, 32, 256, 256] + - [137, 2924.0] + - - [704, 256, 1, 32, 704, 704, 32, 32] + - [141, 1243.0] + - - [64, 2944, 1, 3328, 64, 64, 3328, 3328] + - [125, 5485.0] + - - [64, 4288, 1, 3328, 64, 64, 3328, 3328] + - [125, 5795.0] + - - [256, 704, 1, 256, 256, 256, 256, 256] + - [162, 3845.0] + - - [5056, 32, 1, 3328, 5056, 5056, 3328, 3328] + - [152, 4652.0] + - - [2944, 32, 1, 1280, 2944, 2944, 1280, 1280] + - [140, 3534.0] + - - [64, 1408, 1, 3328, 64, 64, 3328, 3328] + - [129, 4541.0] + - - [256, 448, 1, 1280, 256, 256, 1280, 1280] + - [152, 4966.0] + - - [448, 448, 1, 256, 448, 448, 256, 256] + - [179, 4847.0] + - - [1024, 256, 1, 1, 1024, 1024, 1, 1] + - [162, 109.0] + - - [1856, 64, 1, 32, 1856, 1856, 32, 32] + - [124, 1776.0] + - - [4288, 64, 1, 256, 4288, 4288, 256, 256] + - [127, 5098.0] + - - [1408, 64, 1, 256, 1408, 1408, 256, 256] + - [165, 3204.0] + - - [6784, 32, 1, 32, 6784, 6784, 32, 32] + - [174, 2481.0] + - - [448, 448, 1, 1280, 448, 448, 1280, 1280] + - [127, 5475.0] + - - [32, 5888, 1, 256, 32, 32, 256, 256] + - [181, 3521.0] + - - [1024, 128, 1, 256, 1024, 1024, 256, 256] + - [141, 4346.0] + - - [1856, 64, 1, 1280, 1856, 1856, 1280, 1280] + - [177, 4722.0] + - - [128, 1408, 1, 1, 128, 128, 1, 1] + - [164, 94.0] + - - [32, 2368, 1, 1280, 32, 32, 1280, 1280] + - [195, 3786.0] + - - [448, 256, 1, 256, 448, 448, 256, 256] + - [166, 3904.0] + - - [2944, 32, 1, 32, 2944, 2944, 32, 32] + - [153, 701.0] + - - [448, 448, 1, 32, 448, 448, 32, 32] + - [125, 1384.0] + - - [704, 256, 1, 3328, 704, 704, 3328, 3328] + - [152, 5171.0] + - - [64, 2944, 1, 1, 64, 64, 1, 1] + - [141, 44.0] + - - [64, 2944, 1, 32, 64, 64, 32, 32] + - [150, 1294.0] + - - [64, 2944, 1, 1280, 64, 64, 1280, 1280] + - [177, 5189.0] + - - [32, 3584, 1, 1280, 32, 32, 1280, 1280] + - [143, 4163.0] + - - [32, 2944, 1, 32, 32, 32, 32, 32] + - [124, 704.0] + - - [32, 6784, 1, 256, 32, 32, 256, 256] + - [140, 3227.0] + - - [448, 448, 1, 3328, 448, 448, 3328, 3328] + - [179, 5495.0] + - - [704, 128, 1, 1280, 704, 704, 1280, 1280] + - [143, 4108.0] + - - [32, 3584, 1, 3328, 32, 32, 3328, 3328] + - [126, 4013.0] + - - [128, 704, 1, 1280, 128, 128, 1280, 1280] + - [156, 4096.0] + - - [64, 4288, 1, 1, 64, 64, 1, 1] + - [162, 65.0] + - - [3584, 32, 1, 32, 3584, 3584, 32, 32] + - [126, 830.0] + - - [3584, 64, 1, 1, 3584, 3584, 1, 1] + - [127, 54.0] + - - [32, 4288, 1, 32, 32, 32, 32, 32] + - [192, 998.0] + - - [64, 1408, 1, 1, 64, 64, 1, 1] + - [151, 23.0] + - - [256, 1024, 1, 256, 256, 256, 256, 256] + - [162, 4335.0] + - - [1408, 128, 1, 1280, 1408, 1408, 1280, 1280] + - [152, 4936.0] + - - [64, 4288, 1, 1280, 64, 64, 1280, 1280] + - [125, 5628.0] + - - [64, 3584, 1, 1, 64, 64, 1, 1] + - [191, 56.0] + - - [1024, 128, 1, 1280, 1024, 1024, 1280, 1280] + - [127, 4934.0] + - - [2368, 32, 1, 32, 2368, 2368, 32, 32] + - [126, 586.0] + - - [128, 1408, 1, 256, 128, 128, 256, 256] + - [152, 3782.0] + - - [256, 448, 1, 3328, 256, 256, 3328, 3328] + - [152, 5408.0] + - - [2368, 64, 1, 256, 2368, 2368, 256, 256] + - [193, 4802.0] + - - [32, 2368, 1, 3328, 32, 32, 3328, 3328] + - [143, 3801.0] + - - [128, 1856, 1, 1, 128, 128, 1, 1] + - [164, 121.0] + - - [128, 1856, 1, 32, 128, 128, 32, 32] + - [154, 1597.0] + - - [3584, 32, 1, 256, 3584, 3584, 256, 256] + - [189, 2913.0] + - - [64, 3584, 1, 256, 64, 64, 256, 256] + - [143, 4829.0] + - - [32, 2944, 1, 1280, 32, 32, 1280, 1280] + - [126, 3540.0] + - - [4288, 32, 1, 32, 4288, 4288, 32, 32] + - [177, 959.0] + - - [1856, 64, 1, 1, 1856, 1856, 1, 1] + - [122, 30.0] + - - [128, 1024, 1, 3328, 128, 128, 3328, 3328] + - [127, 5177.0] + - - [1408, 128, 1, 1, 1408, 1408, 1, 1] + - [122, 43.0] + - - [5056, 32, 1, 256, 5056, 5056, 256, 256] + - [164, 3367.0] + - - [64, 1408, 1, 1280, 64, 64, 1280, 1280] + - [152, 3711.0] + - - [3584, 32, 1, 1280, 3584, 3584, 1280, 1280] + - [164, 4209.0] + - - [1856, 128, 1, 3328, 1856, 1856, 3328, 3328] + - [193, 6335.0] + - - [704, 256, 1, 1280, 704, 704, 1280, 1280] + - [141, 5011.0] + - - [1856, 128, 1, 1, 1856, 1856, 1, 1] + - [152, 59.0] + - - [256, 704, 1, 1, 256, 256, 1, 1] + - [127, 44.0] + - - [1024, 128, 1, 1, 1024, 1024, 1, 1] + - [151, 34.0] + - - [1856, 128, 1, 256, 1856, 1856, 256, 256] + - [141, 4737.0] + - - [1024, 256, 1, 1280, 1024, 1024, 1280, 1280] + - [191, 5440.0] + - - [64, 2368, 1, 32, 64, 64, 32, 32] + - [154, 1123.0] + - - [32, 2368, 1, 256, 32, 32, 256, 256] + - [137, 2413.0] + - - [32, 6784, 1, 1280, 32, 32, 1280, 1280] + - [143, 4386.0] + - - [32, 6784, 1, 32, 32, 32, 32, 32] + - [175, 1389.0] + - - [64, 3584, 1, 3328, 64, 64, 3328, 3328] + - [166, 6205.0] + - - [32, 5888, 1, 1280, 32, 32, 1280, 1280] + - [156, 4053.0] + - - [448, 256, 1, 1, 448, 448, 1, 1] + - [122, 71.0] + - - [448, 256, 1, 3328, 448, 448, 3328, 3328] + - [177, 5317.0] + - - [128, 704, 1, 3328, 128, 128, 3328, 3328] + - [123, 4645.0] + - - [2368, 32, 1, 3328, 2368, 2368, 3328, 3328] + - [177, 3995.0] + - - [2944, 64, 1, 3328, 2944, 2944, 3328, 3328] + - [152, 5191.0] + - - [128, 1024, 1, 32, 128, 128, 32, 32] + - [154, 928.0] + - - [32, 2368, 1, 32, 32, 32, 32, 32] + - [136, 575.0] + - - [64, 1856, 1, 1280, 64, 64, 1280, 1280] + - [177, 4927.0] + - - [32, 3584, 1, 32, 32, 32, 32, 32] + - [175, 838.0] + - - [704, 256, 1, 1, 704, 704, 1, 1] + - [190, 83.0] + - - [1024, 256, 1, 3328, 1024, 1024, 3328, 3328] + - [193, 5521.0] + - - [128, 1856, 1, 1280, 128, 128, 1280, 1280] + - [154, 6079.0] + - - [448, 256, 1, 32, 448, 448, 32, 32] + - [126, 830.0] + - - [64, 4288, 1, 32, 64, 64, 32, 32] + - [154, 1756.0] + - - [128, 704, 1, 1, 128, 128, 1, 1] + - [122, 23.0] + - - [4288, 64, 1, 1280, 4288, 4288, 1280, 1280] + - [193, 5477.0] + - - [448, 448, 1, 1, 448, 448, 1, 1] + - [127, 47.0] + - - [32, 5888, 1, 32, 32, 32, 32, 32] + - [123, 1230.0] + - - [1024, 128, 1, 3328, 1024, 1024, 3328, 3328] + - [179, 5233.0] + - - [4288, 64, 1, 32, 4288, 4288, 32, 32] + - [141, 1756.0] + - - [2368, 64, 1, 32, 2368, 2368, 32, 32] + - [127, 1144.0] + - - [64, 1408, 1, 32, 64, 64, 32, 32] + - [124, 707.0] + - - [32, 2944, 1, 256, 32, 32, 256, 256] + - [140, 3233.0] + - - [2944, 64, 1, 1, 2944, 2944, 1, 1] + - [152, 107.0] + - - [2944, 64, 1, 32, 2944, 2944, 32, 32] + - [125, 1316.0] + - - [64, 2944, 1, 256, 64, 64, 256, 256] + - [125, 4795.0] + - - [64, 2368, 1, 256, 64, 64, 256, 256] + - [191, 4564.0] + - - [1408, 64, 1, 3328, 1408, 1408, 3328, 3328] + - [150, 4079.0] + - - [6784, 32, 1, 1280, 6784, 6784, 1280, 1280] + - [139, 4119.0] + - - [2944, 64, 1, 1280, 2944, 2944, 1280, 1280] + - [152, 4936.0] + - - [2944, 32, 1, 256, 2944, 2944, 256, 256] + - [162, 2610.0] + - - [256, 1024, 1, 3328, 256, 256, 3328, 3328] + - [177, 5529.0] + - - [1856, 64, 1, 3328, 1856, 1856, 3328, 3328] + - [152, 5129.0] + - - [5888, 32, 1, 256, 5888, 5888, 256, 256] + - [164, 3490.0] + - - [128, 704, 1, 256, 128, 128, 256, 256] + - [162, 2695.0] + - - [3584, 64, 1, 256, 3584, 3584, 256, 256] + - [141, 4408.0] + - - [64, 1856, 1, 32, 64, 64, 32, 32] + - [151, 914.0] + - - [64, 1856, 1, 3328, 64, 64, 3328, 3328] + - [152, 5311.0] + - - [5888, 32, 1, 1280, 5888, 5888, 1280, 1280] + - [191, 4090.0] + - - [256, 704, 1, 32, 256, 256, 32, 32] + - [152, 1207.0] + - - [256, 704, 1, 1280, 256, 256, 1280, 1280] + - [125, 5092.0] + - - [1408, 64, 1, 32, 1408, 1408, 32, 32] + - [153, 677.0] + - - [128, 1408, 1, 1280, 128, 128, 1280, 1280] + - [152, 5057.0] + - - [128, 1856, 1, 3328, 128, 128, 3328, 3328] + - [141, 6346.0] + - - [2368, 64, 1, 3328, 2368, 2368, 3328, 3328] + - [179, 5803.0] + - - [32, 5056, 1, 3328, 32, 32, 3328, 3328] + - [126, 4313.0] + - - [64, 1856, 1, 1, 64, 64, 1, 1] + - [124, 29.0] + - - [704, 128, 1, 32, 704, 704, 32, 32] + - [153, 714.0] + - - [4288, 64, 1, 1, 4288, 4288, 1, 1] + - [152, 64.0] + - - [5056, 32, 1, 1280, 5056, 5056, 1280, 1280] + - [139, 4245.0] + - - [128, 1024, 1, 1, 128, 128, 1, 1] + - [126, 33.0] + - - [256, 1024, 1, 1, 256, 256, 1, 1] + - [164, 61.0] + - - [1408, 64, 1, 1280, 1408, 1408, 1280, 1280] + - [177, 3662.0] + - - [1024, 256, 1, 32, 1024, 1024, 32, 32] + - [122, 2741.0] + - - [2368, 32, 1, 1280, 2368, 2368, 1280, 1280] + - [177, 3577.0] + - - [704, 128, 1, 1, 704, 704, 1, 1] + - [165, 23.0] + - - [256, 448, 1, 256, 256, 256, 256, 256] + - [139, 3277.0] + - - [32, 4288, 1, 256, 32, 32, 256, 256] + - [181, 3142.0] + - - [128, 1408, 1, 32, 128, 128, 32, 32] + - [191, 1222.0] + - - [704, 128, 1, 3328, 704, 704, 3328, 3328] + - [150, 4635.0] + - - [64, 4288, 1, 256, 64, 64, 256, 256] + - [152, 4452.0] + - - [4288, 32, 1, 1280, 4288, 4288, 1280, 1280] + - [139, 3845.0] + - - [32, 5056, 1, 256, 32, 32, 256, 256] + - [143, 3077.0] + - - [704, 128, 1, 256, 704, 704, 256, 256] + - [189, 2740.0] + - - [256, 1024, 1, 32, 256, 256, 32, 32] + - [152, 1638.0] + - - [256, 1024, 1, 1280, 256, 256, 1280, 1280] + - [125, 5421.0] + - - [6784, 32, 1, 256, 6784, 6784, 256, 256] + - [191, 3482.0] + - - [64, 2368, 1, 1, 64, 64, 1, 1] + - [126, 38.0] + - - [1408, 128, 1, 256, 1408, 1408, 256, 256] + - [162, 3794.0] + - - [5888, 32, 1, 3328, 5888, 5888, 3328, 3328] + - [152, 4448.0] + - - [64, 2368, 1, 3328, 64, 64, 3328, 3328] + - [166, 5939.0] + - - [256, 704, 1, 3328, 256, 256, 3328, 3328] + - [125, 5387.0] + - - [128, 1408, 1, 3328, 128, 128, 3328, 3328] + - [177, 5336.0] + - - [2944, 32, 1, 3328, 2944, 2944, 3328, 3328] + - [126, 3813.0] + - - [2368, 64, 1, 1280, 2368, 2368, 1280, 1280] + - [127, 5452.0] + - - [128, 1024, 1, 1280, 128, 128, 1280, 1280] + - [166, 4923.0] + - - [128, 1024, 1, 256, 128, 128, 256, 256] + - [175, 3779.0] + - - [3584, 64, 1, 3328, 3584, 3584, 3328, 3328] + - [193, 6060.0] + - - [256, 448, 1, 1, 256, 256, 1, 1] + - [176, 30.0] + - - [256, 448, 1, 32, 256, 256, 32, 32] + - [125, 846.0] + - - [64, 3584, 1, 32, 64, 64, 32, 32] + - [177, 1536.0] + - - [64, 3584, 1, 1280, 64, 64, 1280, 1280] + - [141, 5784.0] + - - [4288, 32, 1, 256, 4288, 4288, 256, 256] + - [191, 2997.0] + - - [448, 256, 1, 1280, 448, 448, 1280, 1280] + - [177, 4864.0] + - - [128, 1856, 1, 256, 128, 128, 256, 256] + - [166, 4693.0] + - - [3584, 32, 1, 3328, 3584, 3584, 3328, 3328] + - [177, 4365.0] + - - [6784, 32, 1, 3328, 6784, 6784, 3328, 3328] + - [177, 4708.0] + - - [2944, 64, 1, 256, 2944, 2944, 256, 256] + - [164, 3810.0] + - - [64, 1408, 1, 256, 64, 64, 256, 256] + - [189, 2633.0] + - - [5056, 32, 1, 32, 5056, 5056, 32, 32] + - [166, 1578.0] + - - [32, 5888, 1, 3328, 32, 32, 3328, 3328] + - [143, 4228.0] + - - [704, 256, 1, 256, 704, 704, 256, 256] + - [189, 4826.0] + - - [1024, 256, 1, 196, 1024, 1024, 196, 196] + - [162, 5008.0] + - - [256, 1024, 1, 196, 256, 256, 196, 196] + - [137, 5159.0] + - - [1760, 64, 1, 1760, 1760, 1760, 1760, 1760] + - [181, 6226.0] + - - [2560, 32, 1, 2560, 2560, 2560, 2560, 2560] + - [195, 4011.0] + - - [4608, 32, 1, 1536, 4608, 4608, 1536, 1536] + - [164, 3898.0] + - - [3072, 64, 1, 1024, 3072, 3072, 1024, 1024] + - [141, 4930.0] + - - [2048, 128, 1, 2048, 2048, 2048, 2048, 2048] + - [141, 5400.0] + - - [4096, 64, 1, 4096, 4096, 4096, 4096, 4096] + - [141, 5459.0] + - - [7680, 32, 1, 2560, 7680, 7680, 2560, 2560] + - [143, 4265.0] + - - [2560, 64, 1, 2560, 2560, 2560, 2560, 2560] + - [141, 5970.0] + - - [1760, 128, 1, 1760, 1760, 1760, 1760, 1760] + - [166, 6178.0] + - - [3072, 32, 1, 1024, 3072, 3072, 1024, 1024] + - [192, 3503.0] + - - [6144, 32, 1, 2560, 6144, 6144, 2560, 2560] + - [195, 4340.0] + - - [4096, 32, 1, 4096, 4096, 4096, 4096, 4096] + - [195, 4501.0] + - - [2048, 64, 1, 2048, 2048, 2048, 2048, 2048] + - [141, 4830.0] + - - [8448, 32, 1, 2816, 8448, 8448, 2816, 2816] + - [177, 4568.0] + - - [512, 512, 1, 512, 512, 512, 512, 512] + - [139, 4766.0] + - - [511, 512, 1, 512, 511, 511, 512, 512] + - [141, 4847.0] + - - [512, 512, 1, 511, 512, 512, 511, 511] + - [175, 5055.0] + - - [512, 513, 1, 512, 512, 512, 512, 512] + - [139, 4739.0] + - - [512, 511, 1, 512, 512, 512, 512, 512] + - [166, 5117.0] + - - [513, 512, 1, 512, 513, 513, 512, 512] + - [141, 4459.0] + - - [512, 512, 1, 513, 512, 512, 513, 513] + - [127, 4951.0] + - - [512, 512, 1, 64, 512, 512, 64, 64] + - [152, 2589.0] + - - [33, 33, 1600, 32, 33, 33, 32, 32] + - [174, 1984.0] + - - [256, 684, 1, 1024, 256, 256, 1024, 1024] + - [191, 4586.0] + - - [1024, 200, 1, 560, 1024, 1024, 560, 560] + - [189, 4595.0] + - - [2048, 114, 1, 512, 2048, 2048, 512, 512] + - [141, 4379.0] + - - [2048, 114, 1, 768, 2048, 2048, 768, 768] + - [141, 4581.0] + - - [32, 32, 4608, 64, 32, 32, 64, 64] + - [195, 4024.0] + - - [32, 35, 4608, 64, 32, 32, 64, 64] + - [181, 3587.0] + - - [34, 34, 4736, 64, 34, 34, 64, 64] + - [189, 2202.0] + - - [35, 35, 4608, 64, 35, 35, 64, 64] + - [189, 2305.0] + - - [33, 33, 1920, 64, 33, 33, 64, 64] + - [150, 2113.0] + - - [480, 512, 1, 512, 480, 480, 512, 512] + - [141, 4599.0] + - - [512, 480, 1, 512, 512, 512, 512, 512] + - [139, 5045.0] + - - [1024, 200, 1, 1024, 1024, 1024, 1024, 1024] + - [191, 4526.0] + - - [1024, 308, 1, 1024, 1024, 1024, 1024, 1024] + - [141, 6147.0] + - - [1024, 160, 1, 1024, 1024, 1024, 1024, 1024] + - [164, 4940.0] + - - [1024, 180, 1, 1024, 1024, 1024, 1024, 1024] + - [191, 4815.0] + - - [128, 864, 1, 256, 128, 128, 256, 256] + - [178, 3478.0] + - - [256, 864, 1, 512, 256, 256, 512, 512] + - [141, 5092.0] + - - [1152, 128, 1, 784, 1152, 1152, 784, 784] + - [179, 5689.0] + - - [256, 512, 1, 784, 256, 256, 784, 784] + - [175, 5386.0] + - - [512, 256, 1, 784, 512, 512, 784, 784] + - [123, 5495.0] + - - [1024, 128, 1, 1024, 1024, 1024, 1024, 1024] + - [141, 4525.0] + - - [1024, 96, 1, 1024, 1024, 1024, 1024, 1024] + - [168, 4295.0] + - - [1024, 256, 1, 3800, 1024, 1024, 3800, 3800] + - [175, 5740.0] + - - [1024, 256, 1, 3400, 1024, 1024, 3400, 3400] + - [189, 5724.0] + - - [256, 1024, 1, 3400, 256, 256, 3400, 3400] + - [175, 5700.0] + - - [1024, 256, 1, 3220, 1024, 1024, 3220, 3220] + - [175, 5700.0] + - - [256, 1024, 1, 3220, 256, 256, 3220, 3220] + - [150, 5794.0] + - - [1024, 256, 1, 3456, 1024, 1024, 3456, 3456] + - [194, 5761.0] + - - [256, 1024, 1, 3456, 256, 256, 3456, 3456] + - [123, 5809.0] + - - [1024, 256, 1, 3072, 1024, 1024, 3072, 3072] + - [193, 5382.0] + - - [256, 1024, 1, 3072, 256, 256, 3072, 3072] + - [141, 5451.0] + - - [1024, 256, 1, 3552, 1024, 1024, 3552, 3552] + - [177, 5687.0] + - - [256, 1024, 1, 3552, 256, 256, 3552, 3552] + - [177, 5692.0] + - - [256, 1024, 1, 2852, 256, 256, 2852, 2852] + - [150, 5851.0] + - - [1024, 256, 1, 2852, 1024, 1024, 2852, 2852] + - [150, 6148.0] + - - [256, 512, 1, 10752, 256, 256, 10752, 10752] + - [141, 5052.0] + - - [256, 1024, 1, 3800, 256, 256, 3800, 3800] + - [123, 5672.0] + - - [256, 512, 1, 10560, 256, 256, 10560, 10560] + - [175, 5511.0] + - - [256, 1024, 1, 2992, 256, 256, 2992, 2992] + - [152, 5652.0] + - - [256, 1024, 1, 2688, 256, 256, 2688, 2688] + - [123, 5931.0] + - - [1024, 256, 1, 2688, 1024, 1024, 2688, 2688] + - [150, 5771.0] + - - [256, 1024, 1, 2904, 256, 256, 2904, 2904] + - [175, 5663.0] + - - [1024, 256, 1, 2904, 1024, 1024, 2904, 2904] + - [150, 5597.0] + - - [256, 1024, 1, 2640, 256, 256, 2640, 2640] + - [150, 5817.0] + - - [1024, 256, 1, 2640, 1024, 1024, 2640, 2640] + - [177, 5535.0] + - - [1024, 256, 1, 4032, 1024, 1024, 4032, 4032] + - [189, 5995.0] + - - [1024, 256, 1, 2992, 1024, 1024, 2992, 2992] + - [152, 5634.0] + - - [256, 1024, 1, 3360, 256, 256, 3360, 3360] + - [125, 5695.0] + - - [1024, 256, 1, 3360, 1024, 1024, 3360, 3360] + - [123, 6242.0] + - - [1024, 256, 1, 3500, 1024, 1024, 3500, 3500] + - [175, 5619.0] + - - [256, 1024, 1, 3500, 256, 256, 3500, 3500] + - [150, 5729.0] + - - [1024, 256, 1, 3168, 1024, 1024, 3168, 3168] + - [123, 5681.0] + - - [256, 1024, 1, 3168, 256, 256, 3168, 3168] + - [152, 5700.0] + - - [256, 1024, 1, 3036, 256, 256, 3036, 3036] + - [150, 5665.0] + - - [1024, 256, 1, 4200, 1024, 1024, 4200, 4200] + - [162, 5729.0] + - - [1024, 256, 1, 3600, 1024, 1024, 3600, 3600] + - [177, 5589.0] + - - [256, 1024, 1, 3600, 256, 256, 3600, 3600] + - [175, 5750.0] + - - [256, 1024, 1, 2944, 256, 256, 2944, 2944] + - [123, 5913.0] + - - [1024, 256, 1, 2944, 1024, 1024, 2944, 2944] + - [177, 5703.0] + - - [1024, 256, 1, 3700, 1024, 1024, 3700, 3700] + - [189, 5995.0] + - - [256, 1024, 1, 2352, 256, 256, 2352, 2352] + - [168, 6852.0] + - - [1024, 256, 1, 2352, 1024, 1024, 2352, 2352] + - [123, 5685.0] + - - [1024, 256, 1, 2816, 1024, 1024, 2816, 2816] + - [193, 5511.0] + - - [256, 1024, 1, 3700, 256, 256, 3700, 3700] + - [175, 5854.0] + - - [256, 1024, 1, 2816, 256, 256, 2816, 2816] + - [141, 5488.0] + - - [256, 512, 1, 11408, 256, 256, 11408, 11408] + - [127, 5545.0] + - - [1024, 256, 1, 3036, 1024, 1024, 3036, 3036] + - [189, 5751.0] + - - [1024, 256, 1, 3264, 1024, 1024, 3264, 3264] + - [177, 5720.0] + - - [256, 1024, 1, 3264, 256, 256, 3264, 3264] + - [125, 5727.0] + - - [1024, 256, 1, 3864, 1024, 1024, 3864, 3864] + - [175, 5567.0] + - - [256, 1024, 1, 4032, 256, 256, 4032, 4032] + - [123, 5980.0] + - - [1024, 256, 1, 3128, 1024, 1024, 3128, 3128] + - [123, 5616.0] + - - [256, 1024, 1, 3128, 256, 256, 3128, 3128] + - [150, 5834.0] + - - [256, 1024, 1, 3200, 256, 256, 3200, 3200] + - [150, 5898.0] + - - [256, 512, 1, 11616, 256, 256, 11616, 11616] + - [123, 5506.0] + - - [1024, 256, 1, 3200, 1024, 1024, 3200, 3200] + - [175, 5869.0] + - - [1024, 256, 1, 4000, 1024, 1024, 4000, 4000] + - [189, 5907.0] + - - [256, 1024, 1, 2520, 256, 256, 2520, 2520] + - [150, 5824.0] + - - [1024, 256, 1, 2520, 1024, 1024, 2520, 2520] + - [175, 6240.0] + - - [256, 1024, 1, 2976, 256, 256, 2976, 2976] + - [125, 5684.0] + - - [256, 1024, 1, 2400, 256, 256, 2400, 2400] + - [150, 5883.0] + - - [1024, 256, 1, 2400, 1024, 1024, 2400, 2400] + - [150, 5926.0] + - - [1024, 256, 1, 3696, 1024, 1024, 3696, 3696] + - [177, 5588.0] + - - [1024, 256, 1, 3900, 1024, 1024, 3900, 3900] + - [175, 5652.0] + - - [1024, 256, 1, 3772, 1024, 1024, 3772, 3772] + - [189, 5665.0] + - - [256, 1024, 1, 3696, 256, 256, 3696, 3696] + - [175, 5814.0] + - - [256, 1024, 1, 2728, 256, 256, 2728, 2728] + - [150, 5854.0] + - - [1024, 256, 1, 2728, 1024, 1024, 2728, 2728] + - [189, 5798.0] + - - [1024, 256, 1, 2480, 1024, 1024, 2480, 2480] + - [123, 6396.0] + - - [256, 1024, 1, 2480, 256, 256, 2480, 2480] + - [177, 5616.0] + - - [1024, 256, 1, 2880, 1024, 1024, 2880, 2880] + - [177, 5708.0] + - - [512, 256, 1, 3220, 512, 512, 3220, 3220] + - [127, 5418.0] + - - [256, 1024, 1, 2880, 256, 256, 2880, 2880] + - [152, 5716.0] + - - [256, 1024, 1, 4200, 256, 256, 4200, 4200] + - [175, 5663.0] + - - [1024, 256, 1, 3648, 1024, 1024, 3648, 3648] + - [177, 5778.0] + - - [1024, 256, 1, 3312, 1024, 1024, 3312, 3312] + - [152, 5641.0] + - - [256, 1024, 1, 3648, 256, 256, 3648, 3648] + - [152, 5731.0] + - - [1024, 256, 1, 3300, 1024, 1024, 3300, 3300] + - [123, 5639.0] + - - [1024, 256, 1, 3528, 1024, 1024, 3528, 3528] + - [175, 5599.0] + - - [256, 1024, 1, 2604, 256, 256, 2604, 2604] + - [152, 5627.0] + - - [1024, 256, 1, 2604, 1024, 1024, 2604, 2604] + - [175, 5612.0] + - - [512, 256, 1, 11408, 512, 512, 11408, 11408] + - [123, 6676.0] + - - [256, 1024, 1, 3312, 256, 256, 3312, 3312] + - [152, 5658.0] + - - [256, 1024, 1, 3300, 256, 256, 3300, 3300] + - [150, 5690.0] + - - [512, 256, 1, 3072, 512, 512, 3072, 3072] + - [141, 4920.0] + - - [256, 1024, 1, 3528, 256, 256, 3528, 3528] + - [150, 5739.0] + - - [1024, 256, 1, 2976, 1024, 1024, 2976, 2976] + - [168, 7152.0] + - - [1024, 256, 1, 2760, 1024, 1024, 2760, 2760] + - [123, 5631.0] + - - [512, 256, 1, 3800, 512, 512, 3800, 3800] + - [123, 5266.0] + - - [256, 1024, 1, 2760, 256, 256, 2760, 2760] + - [150, 5847.0] + - - [1024, 256, 1, 2160, 1024, 1024, 2160, 2160] + - [162, 5890.0] + - - [256, 1024, 1, 2160, 256, 256, 2160, 2160] + - [150, 5853.0] + - - [512, 256, 1, 11616, 512, 512, 11616, 11616] + - [175, 5496.0] + - - [512, 256, 1, 2852, 512, 512, 2852, 2852] + - [127, 5271.0] + - - [256, 1024, 1, 3864, 256, 256, 3864, 3864] + - [150, 5635.0] + - - [512, 256, 1, 2640, 512, 512, 2640, 2640] + - [127, 5338.0] + - - [256, 1024, 1, 4000, 256, 256, 4000, 4000] + - [123, 5924.0] + - - [512, 256, 1, 2904, 512, 512, 2904, 2904] + - [150, 5227.0] + - - [256, 1024, 1, 3900, 256, 256, 3900, 3900] + - [175, 5693.0] + - - [512, 256, 1, 2688, 512, 512, 2688, 2688] + - [150, 5540.0] + - - [256, 1024, 1, 3772, 256, 256, 3772, 3772] + - [177, 5650.0] + - - [512, 256, 1, 3400, 512, 512, 3400, 3400] + - [137, 5359.0] + - - [512, 256, 1, 3456, 512, 512, 3456, 3456] + - [123, 5385.0] + - - [512, 256, 1, 3552, 512, 512, 3552, 3552] + - [154, 5424.0] + - - [128, 64, 25, 6498, 128, 128, 6498, 6498] + - [125, 5971.0] + - - [128, 64, 25, 6859, 128, 128, 6859, 6859] + - [125, 4441.0] + - - [64, 64, 64, 3042, 64, 64, 3042, 3042] + - [150, 5496.0] + - - [64, 64, 64, 3211, 64, 64, 3211, 3211] + - [125, 5511.0] + - - [64, 64, 49, 4050, 64, 64, 4050, 4050] + - [164, 5886.0] + - - [64, 64, 49, 4275, 64, 64, 4275, 4275] + - [125, 5780.0] + - - [64, 64, 36, 6498, 64, 64, 6498, 6498] + - [179, 5719.0] + - - [64, 64, 36, 6859, 64, 64, 6859, 6859] + - [179, 5891.0] + - - [1152, 128, 1, 1444, 1152, 1152, 1444, 1444] + - [179, 5313.0] + - - [512, 256, 1, 361, 512, 512, 361, 361] + - [141, 3810.0] + - - [576, 128, 1, 1444, 576, 576, 1444, 1444] + - [137, 3669.0] + - - [29000, 35, 1, 2560, 29000, 29000, 2560, 2560] + - [166, 3906.0] + - - [29000, 36, 1, 2560, 29000, 29000, 2560, 2560] + - [166, 4019.0] + - - [29000, 39, 1, 2560, 29000, 29000, 2560, 2560] + - [141, 4339.0] + - - [29000, 40, 1, 2560, 29000, 29000, 2560, 2560] + - [166, 4440.0] + - - [29000, 42, 1, 2560, 29000, 29000, 2560, 2560] + - [166, 4608.0] + - - [29000, 43, 1, 2560, 29000, 29000, 2560, 2560] + - [166, 4675.0] + - - [29000, 44, 1, 2560, 29000, 29000, 2560, 2560] + - [141, 4779.0] + - - [29000, 46, 1, 2560, 29000, 29000, 2560, 2560] + - [141, 4899.0] + - - [29000, 48, 1, 2560, 29000, 29000, 2560, 2560] + - [141, 5053.0] + - - [29000, 49, 1, 2560, 29000, 29000, 2560, 2560] + - [193, 5112.0] + - - [29000, 50, 1, 2560, 29000, 29000, 2560, 2560] + - [193, 5239.0] + - - [29000, 51, 1, 2560, 29000, 29000, 2560, 2560] + - [141, 5310.0] + - - [29000, 53, 1, 2560, 29000, 29000, 2560, 2560] + - [166, 5495.0] + - - [29000, 54, 1, 2560, 29000, 29000, 2560, 2560] + - [141, 5475.0] + - - [29000, 55, 1, 2560, 29000, 29000, 2560, 2560] + - [193, 5607.0] + - - [29000, 56, 1, 2560, 29000, 29000, 2560, 2560] + - [166, 5795.0] + - - [29000, 57, 1, 2560, 29000, 29000, 2560, 2560] + - [141, 5788.0] + - - [29000, 58, 1, 2560, 29000, 29000, 2560, 2560] + - [166, 5738.0] + - - [29000, 59, 1, 2560, 29000, 29000, 2560, 2560] + - [193, 5910.0] + - - [29000, 61, 1, 2560, 29000, 29000, 2560, 2560] + - [193, 6048.0] + - - [29000, 63, 1, 2560, 29000, 29000, 2560, 2560] + - [166, 6153.0] + - - [256, 128, 1, 13600, 256, 256, 13600, 13600] + - [201, 5194.0] + - - [256, 128, 1, 12880, 256, 256, 12880, 12880] + - [208, 5059.0] + - - [128, 512, 1, 15200, 128, 128, 15200, 15200] + - [198, 5508.0] + - - [512, 128, 1, 15200, 512, 512, 15200, 15200] + - [214, 5303.0] + - - [128, 512, 1, 11408, 128, 128, 11408, 11408] + - [198, 5335.0] + - - [256, 128, 1, 13824, 256, 256, 13824, 13824] + - [212, 4869.0] + - - [128, 512, 1, 11616, 128, 128, 11616, 11616] + - [198, 5241.0] + - - [256, 128, 1, 14208, 256, 256, 14208, 14208] + - [206, 4916.0] + - - [128, 512, 1, 14208, 128, 128, 14208, 14208] + - [189, 5466.0] + - - [256, 128, 1, 15200, 256, 256, 15200, 15200] + - [208, 5299.0] + - - [512, 128, 1, 11408, 512, 512, 11408, 11408] + - [207, 5245.0] + - - [512, 128, 1, 16800, 512, 512, 16800, 16800] + - [198, 5231.0] + - - [128, 512, 1, 11264, 128, 128, 11264, 11264] + - [201, 4972.0] + - - [512, 128, 1, 11616, 512, 512, 11616, 11616] + - [207, 8931.0] + - - [512, 128, 1, 16128, 512, 512, 16128, 16128] + - [216, 4906.0] + - - [512, 128, 1, 11968, 512, 512, 11968, 11968] + - [123, 5824.0] + - - [128, 512, 1, 11968, 128, 128, 11968, 11968] + - [123, 5655.0] + - - [512, 128, 1, 12288, 512, 512, 12288, 12288] + - [216, 4619.0] + - - [128, 512, 1, 12288, 128, 128, 12288, 12288] + - [212, 4889.0] + - - [128, 512, 1, 12672, 128, 128, 12672, 12672] + - [219, 5391.0] + - - [512, 128, 1, 11776, 512, 512, 11776, 11776] + - [208, 4654.0] + - - [512, 128, 1, 12144, 512, 512, 12144, 12144] + - [214, 5299.0] + - - [512, 128, 1, 11264, 512, 512, 11264, 11264] + - [208, 4691.0] + - - [128, 512, 1, 12144, 128, 128, 12144, 12144] + - [198, 5311.0] + - - [512, 128, 1, 12672, 512, 512, 12672, 12672] + - [215, 5262.0] + - - [128, 512, 1, 12512, 128, 128, 12512, 12512] + - [162, 5377.0] + - - [128, 512, 1, 11776, 128, 128, 11776, 11776] + - [201, 4968.0] + - - [256, 128, 1, 12288, 256, 256, 12288, 12288] + - [206, 4297.0] + - - [40, 40, 1, 1909283, 40, 40, 1909283, 1909283] + - [210, 691.0] + - - [40, 40, 1, 3818566, 40, 40, 3818566, 3818566] + - [218, 743.0] + - - [5888, 1, 1, 3328, 5888, 5888, 3328, 3328] + - [229, 239.0] + - - [5056, 1, 1, 3328, 5056, 5056, 3328, 3328] + - [226, 245.0] + - - [6784, 1, 1, 1280, 6784, 6784, 1280, 1280] + - [226, 220.0] + - - [2944, 1, 1, 3328, 2944, 2944, 3328, 3328] + - [232, 212.0] + - - [3584, 1, 1, 1280, 3584, 3584, 1280, 1280] + - [229, 207.0] + - - [6784, 1, 1, 256, 6784, 6784, 256, 256] + - [226, 159.0] + - - [4288, 1, 1, 1280, 4288, 4288, 1280, 1280] + - [226, 209.0] + - - [5056, 1, 1, 1280, 5056, 5056, 1280, 1280] + - [229, 247.0] + - - [3584, 1, 1, 256, 3584, 3584, 256, 256] + - [225, 190.0] + - - [6784, 1, 1, 3328, 6784, 6784, 3328, 3328] + - [232, 234.0] + - - [1408, 1, 1, 1280, 1408, 1408, 1280, 1280] + - [228, 125.0] + - - [1408, 32, 1, 3328, 1408, 1408, 3328, 3328] + - [163, 3074.0] + - - [4288, 1, 1, 256, 4288, 4288, 256, 256] + - [182, 148.0] + - - [2368, 1, 1, 256, 2368, 2368, 256, 256] + - [229, 92.0] + - - [1856, 32, 1, 32, 1856, 1856, 32, 32] + - [151, 457.0] + - - [5056, 1, 1, 256, 5056, 5056, 256, 256] + - [226, 152.0] + - - [5056, 1, 1, 1, 5056, 5056, 1, 1] + - [222, 1.0] + - - [1408, 1, 1, 256, 1408, 1408, 256, 256] + - [229, 57.0] + - - [1408, 1, 1, 1, 1408, 1408, 1, 1] + - [160, 1.0] + - - [4288, 1, 1, 3328, 4288, 4288, 3328, 3328] + - [226, 228.0] + - - [2368, 1, 1, 1280, 2368, 2368, 1280, 1280] + - [226, 180.0] + - - [1856, 1, 1, 1, 1856, 1856, 1, 1] + - [226, 0.49] + - - [1856, 32, 1, 256, 1856, 1856, 256, 256] + - [190, 2847.0] + - - [1408, 32, 1, 32, 1408, 1408, 32, 32] + - [138, 379.0] + - - [1856, 32, 1, 1280, 1856, 1856, 1280, 1280] + - [144, 2904.0] + - - [1408, 1, 1, 3328, 1408, 1408, 3328, 3328] + - [228, 154.0] + - - [5888, 1, 1, 256, 5888, 5888, 256, 256] + - [225, 148.0] + - - [5888, 1, 1, 1, 5888, 5888, 1, 1] + - [138, 3.0] + - - [1856, 32, 1, 3328, 1856, 1856, 3328, 3328] + - [176, 3541.0] + - - [2368, 1, 1, 3328, 2368, 2368, 3328, 3328] + - [226, 215.0] + - - [6784, 1, 1, 1, 6784, 6784, 1, 1] + - [223, 4.0] + - - [5888, 1, 1, 1280, 5888, 5888, 1280, 1280] + - [229, 224.0] + - - [2944, 1, 1, 256, 2944, 2944, 256, 256] + - [227, 111.0] + - - [2944, 1, 1, 1, 2944, 2944, 1, 1] + - [222, 1.0] + - - [1408, 32, 1, 1280, 1408, 1408, 1280, 1280] + - [176, 2789.0] + - - [1856, 1, 1, 1280, 1856, 1856, 1280, 1280] + - [228, 179.0] + - - [3584, 1, 1, 1, 3584, 3584, 1, 1] + - [224, 2.0] + - - [2944, 1, 1, 1280, 2944, 2944, 1280, 1280] + - [226, 185.0] + - - [3584, 1, 1, 3328, 3584, 3584, 3328, 3328] + - [232, 230.0] + - - [1856, 1, 1, 3328, 1856, 1856, 3328, 3328] + - [228, 195.0] + - - [4288, 1, 1, 1, 4288, 4288, 1, 1] + - [231, 2.0] + - - [1856, 1, 1, 256, 1856, 1856, 256, 256] + - [233, 99.0] + - - [1408, 32, 1, 256, 1408, 1408, 256, 256] + - [163, 1580.0] + - - [2368, 1, 1, 1, 2368, 2368, 1, 1] + - [222, 1.0] + - - [1760, 32, 1, 1760, 1760, 1760, 1760, 1760] + - [124, 3041.0] + - - [3072, 16, 1, 1024, 3072, 3072, 1024, 1024] + - [196, 2831.0] + - - [2560, 16, 1, 2560, 2560, 2560, 2560, 2560] + - [169, 2905.0] + - - [2048, 32, 1, 2048, 2048, 2048, 2048, 2048] + - [163, 3126.0] + - - [1760, 16, 1, 1760, 1760, 1760, 1760, 1760] + - [230, 2056.0] + - - [7680, 16, 1, 2560, 7680, 7680, 2560, 2560] + - [196, 3134.0] + - - [8448, 16, 1, 2816, 8448, 8448, 2816, 2816] + - [169, 3010.0] + - - [4608, 16, 1, 1536, 4608, 4608, 1536, 1536] + - [144, 2619.0] + - - [6144, 16, 1, 2560, 6144, 6144, 2560, 2560] + - [196, 2974.0] + - - [4096, 16, 1, 4096, 4096, 4096, 4096, 4096] + - [196, 2796.0] + - - [2048, 16, 1, 2048, 2048, 2048, 2048, 2048] + - [169, 2281.0] + - - [2048, 2, 1, 2048, 2048, 2048, 2048, 2048] + - [226, 385.0] + - - [2560, 4, 1, 2560, 2560, 2560, 2560, 2560] + - [232, 880.0] + - - [32768, 1, 1, 256, 32768, 32768, 256, 256] + - [226, 250.0] + - - [1600, 1, 1, 1024, 1600, 1600, 1024, 1024] + - [226, 125.0] + - - [3456, 1, 1, 256, 3456, 3456, 256, 256] + - [225, 123.0] + - - [4096, 1, 1, 256, 4096, 4096, 256, 256] + - [226, 134.0] + - - [6912, 1, 1, 256, 6912, 6912, 256, 256] + - [226, 161.0] + - - [2048, 8, 1, 2048, 2048, 2048, 2048, 2048] + - [226, 1392.0] + - - [2560, 2, 1, 2560, 2560, 2560, 2560, 2560] + - [226, 442.0] + - - [29000, 27, 1, 2560, 29000, 29000, 2560, 2560] + - [169, 2844.0] + - - [1, 4288, 1, 1280, 1, 1, 1280, 1280] + - [136, 200.0] + - - [32, 1408, 1, 32, 32, 32, 32, 32] + - [174, 901.0] + - - [1, 1408, 1, 3328, 1, 1, 3328, 3328] + - [142, 173.0] + - - [1, 2368, 1, 1280, 1, 1, 1280, 1280] + - [240, 173.0] + - - [1, 5888, 1, 3328, 1, 1, 3328, 3328] + - [235, 200.0] + - - [1, 1856, 1, 256, 1, 1, 256, 256] + - [136, 75.0] + - - [1, 3584, 1, 3328, 1, 1, 3328, 3328] + - [161, 201.0] + - - [1, 6784, 1, 3328, 1, 1, 3328, 3328] + - [243, 210.0] + - - [1, 2368, 1, 256, 1, 1, 256, 256] + - [161, 89.0] + - - [32, 1856, 1, 3328, 32, 32, 3328, 3328] + - [237, 4042.0] + - - [1, 2944, 1, 1280, 1, 1, 1280, 1280] + - [238, 185.0] + - - [1, 1856, 1, 3328, 1, 1, 3328, 3328] + - [128, 172.0] + - - [1, 1408, 1, 1, 1, 1, 1, 1] + - [131, 0.37] + - - [1, 6784, 1, 256, 1, 1, 256, 256] + - [239, 141.0] + - - [1, 6784, 1, 1, 1, 1, 1, 1] + - [122, 2.0] + - - [1, 4288, 1, 3328, 1, 1, 3328, 3328] + - [136, 194.0] + - - [1, 2368, 1, 3328, 1, 1, 3328, 3328] + - [188, 180.0] + - - [1, 5888, 1, 1280, 1, 1, 1280, 1280] + - [239, 189.0] + - - [1, 2944, 1, 256, 1, 1, 256, 256] + - [235, 101.0] + - - [1, 6784, 1, 1280, 1, 1, 1280, 1280] + - [242, 187.0] + - - [1, 5056, 1, 1, 1, 1, 1, 1] + - [122, 1.0] + - - [32, 1856, 1, 32, 32, 32, 32, 32] + - [174, 503.0] + - - [32, 1408, 1, 256, 32, 32, 256, 256] + - [236, 1648.0] + - - [1, 5888, 1, 1, 1, 1, 1, 1] + - [122, 1.0] + - - [1, 2944, 1, 3328, 1, 1, 3328, 3328] + - [234, 216.0] + - - [1, 3584, 1, 1, 1, 1, 1, 1] + - [122, 1.0] + - - [1, 1408, 1, 256, 1, 1, 256, 256] + - [128, 58.0] + - - [1, 1856, 1, 1, 1, 1, 1, 1] + - [124, 0.47] + - - [1, 5056, 1, 1280, 1, 1, 1280, 1280] + - [242, 198.0] + - - [1, 5888, 1, 256, 1, 1, 256, 256] + - [242, 141.0] + - - [32, 1856, 1, 1280, 32, 32, 1280, 1280] + - [237, 3358.0] + - - [1, 2368, 1, 1, 1, 1, 1, 1] + - [122, 1.0] + - - [1, 1408, 1, 1280, 1, 1, 1280, 1280] + - [167, 127.0] + - - [1, 5056, 1, 256, 1, 1, 256, 256] + - [235, 137.0] + - - [1, 3584, 1, 1280, 1, 1, 1280, 1280] + - [242, 186.0] + - - [1, 4288, 1, 256, 1, 1, 256, 256] + - [235, 125.0] + - - [1, 4288, 1, 1, 1, 1, 1, 1] + - [122, 1.0] + - - [1, 2944, 1, 1, 1, 1, 1, 1] + - [122, 1.0] + - - [32, 1408, 1, 3328, 32, 32, 3328, 3328] + - [241, 3203.0] + - - [1, 5056, 1, 3328, 1, 1, 3328, 3328] + - [235, 216.0] + - - [32, 1856, 1, 256, 32, 32, 256, 256] + - [241, 2077.0] + - - [1, 1856, 1, 1280, 1, 1, 1280, 1280] + - [136, 146.0] + - - [1, 3584, 1, 256, 1, 1, 256, 256] + - [235, 116.0] + - - [32, 1408, 1, 1280, 32, 32, 1280, 1280] + - [180, 3145.0] + - - [2, 2048, 1, 1024, 2, 2, 1024, 1024] + - [167, 269.0] + - - [32, 1600, 1, 512, 32, 32, 512, 512] + - [167, 2632.0] + - - [1, 4096, 1, 256, 1, 1, 256, 256] + - [194, 133.0] + - - [1, 6912, 1, 256, 1, 1, 256, 256] + - [239, 145.0] + - - [2, 2048, 1, 768, 2, 2, 768, 768] + - [136, 257.0] + - - [2, 4608, 1, 768, 2, 2, 768, 768] + - [235, 341.0] + - - [2, 4608, 1, 1024, 2, 2, 1024, 1024] + - [142, 325.0] + - - [512, 16, 1, 500000, 512, 512, 500000, 500000] + - [202, 2456.0] + - - [1024, 8, 1, 500000, 1024, 1024, 500000, 500000] + - [213, 1425.0] + - - [1024, 16, 1, 500000, 1024, 1024, 500000, 500000] + - [204, 2669.0] + - - [512, 8, 1, 500000, 512, 512, 500000, 500000] + - [217, 1231.0] + - - [147, 64, 1, 12544, 147, 147, 12544, 12544] + - [199, 2551.0] + - - [256, 128, 1, 10752, 256, 256, 10752, 10752] + - [200, 3617.0] + - - [256, 128, 1, 10560, 256, 256, 10560, 10560] + - [214, 4921.0] + - - [256, 128, 1, 11408, 256, 256, 11408, 11408] + - [211, 4658.0] + - - [256, 12, 1, 11408, 256, 256, 11408, 11408] + - [209, 999.0] + - - [256, 128, 1, 11616, 256, 256, 11616, 11616] + - [207, 4760.0] + - - [256, 12, 1, 11616, 256, 256, 11616, 11616] + - [202, 1002.0] + - - [256, 12, 1, 12288, 256, 256, 12288, 12288] + - [220, 1000.0] + - - [576, 64, 1, 5625, 576, 576, 5625, 5625] + - [214, 4484.0] + - - [147, 64, 1, 22500, 147, 147, 22500, 22500] + - [203, 2859.0] + - - [11, 11, 1, 1909283, 11, 11, 1909283, 1909283] + - [205, 41.0] + - - [11, 11, 1, 3818566, 11, 11, 3818566, 3818566] + - [221, 53.0] + - - [448, 1, 1, 256, 448, 448, 256, 256] + - [138, 27.0] + - - [704, 64, 1, 3328, 704, 704, 3328, 3328] + - [181, 3476.0] + - - [256, 128, 1, 256, 256, 256, 256, 256] + - [136, 2280.0] + - - [448, 64, 1, 1, 448, 448, 1, 1] + - [122, 7.0] + - - [64, 1024, 1, 1280, 64, 64, 1280, 1280] + - [177, 3591.0] + - - [1024, 1, 1, 3328, 1024, 1024, 3328, 3328] + - [128, 118.0] + - - [1024, 64, 1, 1280, 1024, 1024, 1280, 1280] + - [143, 3644.0] + - - [448, 128, 1, 256, 448, 448, 256, 256] + - [139, 2549.0] + - - [1, 1024, 1, 3328, 1, 1, 3328, 3328] + - [128, 128.0] + - - [704, 64, 1, 32, 704, 704, 32, 32] + - [130, 364.0] + - - [32, 448, 1, 3328, 32, 32, 3328, 3328] + - [180, 1878.0] + - - [448, 1, 1, 1, 448, 448, 1, 1] + - [124, 0.12] + - - [64, 128, 1, 3328, 64, 64, 3328, 3328] + - [155, 1206.0] + - - [64, 128, 1, 1, 64, 64, 1, 1] + - [122, 5.0] + - - [256, 128, 1, 1, 256, 256, 1, 1] + - [135, 18.0] + - - [256, 32, 1, 3328, 256, 256, 3328, 3328] + - [142, 1194.0] + - - [1, 1, 1, 3328, 1, 1, 3328, 3328] + - [132, 0.15] + - - [32, 448, 1, 1280, 32, 32, 1280, 1280] + - [180, 1487.0] + - - [32, 448, 1, 32, 32, 32, 32, 32] + - [124, 114.0] + - - [64, 1024, 1, 32, 64, 64, 32, 32] + - [176, 524.0] + - - [128, 1, 1, 1, 128, 128, 1, 1] + - [122, 0.03] + - - [1024, 32, 1, 3328, 1024, 1024, 3328, 3328] + - [155, 2991.0] + - - [448, 1, 1, 1280, 448, 448, 1280, 1280] + - [155, 49.0] + - - [64, 64, 1, 1280, 64, 64, 1280, 1280] + - [167, 474.0] + - - [448, 128, 1, 3328, 448, 448, 3328, 3328] + - [150, 3890.0] + - - [128, 256, 1, 1280, 128, 128, 1280, 1280] + - [188, 2602.0] + - - [256, 256, 1, 32, 256, 256, 32, 32] + - [179, 962.0] + - - [1024, 1, 1, 256, 1024, 1024, 256, 256] + - [163, 45.0] + - - [128, 32, 1, 32, 128, 128, 32, 32] + - [190, 34.0] + - - [448, 64, 1, 256, 448, 448, 256, 256] + - [163, 1188.0] + - - [128, 256, 1, 3328, 128, 128, 3328, 3328] + - [188, 3121.0] + - - [1, 64, 1, 3328, 1, 1, 3328, 3328] + - [128, 9.0] + - - [64, 1024, 1, 1, 64, 64, 1, 1] + - [165, 17.0] + - - [64, 1024, 1, 3328, 64, 64, 3328, 3328] + - [125, 4009.0] + - - [32, 704, 1, 3328, 32, 32, 3328, 3328] + - [194, 2764.0] + - - [32, 1024, 1, 3328, 32, 32, 3328, 3328] + - [194, 3086.0] + - - [64, 1, 1, 256, 64, 64, 256, 256] + - [122, 3.0] + - - [1024, 64, 1, 32, 1024, 1024, 32, 32] + - [151, 507.0] + - - [1024, 64, 1, 3328, 1024, 1024, 3328, 3328] + - [143, 4077.0] + - - [32, 1024, 1, 256, 32, 32, 256, 256] + - [142, 1362.0] + - - [64, 1, 1, 1, 64, 64, 1, 1] + - [124, 0.02] + - - [256, 1, 1, 256, 256, 256, 256, 256] + - [122, 11.0] + - - [256, 128, 1, 3328, 256, 256, 3328, 3328] + - [126, 3009.0] + - - [64, 64, 1, 1, 64, 64, 1, 1] + - [122, 1.0] + - - [32, 704, 1, 1280, 32, 32, 1280, 1280] + - [180, 2168.0] + - - [256, 1, 1, 1280, 256, 256, 1280, 1280] + - [167, 29.0] + - - [128, 32, 1, 1280, 128, 128, 1280, 1280] + - [167, 485.0] + - - [128, 256, 1, 1, 128, 128, 1, 1] + - [122, 8.0] + - - [1, 256, 1, 256, 1, 1, 256, 256] + - [176, 12.0] + - - [1, 256, 1, 1, 1, 1, 1, 1] + - [122, 0.06] + - - [1024, 1, 1, 1280, 1024, 1024, 1280, 1280] + - [155, 94.0] + - - [64, 448, 1, 256, 64, 64, 256, 256] + - [136, 1232.0] + - - [1024, 32, 1, 1280, 1024, 1024, 1280, 1280] + - [140, 2413.0] + - - [256, 256, 1, 3328, 256, 256, 3328, 3328] + - [123, 4228.0] + - - [704, 32, 1, 1280, 704, 704, 1280, 1280] + - [180, 2028.0] + - - [64, 64, 1, 3328, 64, 64, 3328, 3328] + - [142, 607.0] + - - [32, 32, 1, 32, 32, 32, 32, 32] + - [140, 9.0] + - - [1024, 32, 1, 32, 1024, 1024, 32, 32] + - [192, 262.0] + - - [128, 64, 1, 32, 128, 128, 32, 32] + - [192, 69.0] + - - [64, 1, 1, 1280, 64, 64, 1280, 1280] + - [128, 7.0] + - - [448, 32, 1, 1280, 448, 448, 1280, 1280] + - [128, 1537.0] + - - [704, 32, 1, 3328, 704, 704, 3328, 3328] + - [128, 2398.0] + - - [128, 128, 1, 256, 128, 128, 256, 256] + - [132, 1033.0] + - - [64, 448, 1, 1280, 64, 64, 1280, 1280] + - [188, 2463.0] + - - [64, 256, 1, 1, 64, 64, 1, 1] + - [122, 4.0] + - - [256, 256, 1, 256, 256, 256, 256, 256] + - [162, 2273.0] + - - [448, 1, 1, 3328, 448, 448, 3328, 3328] + - [128, 62.0] + - - [256, 1, 1, 1, 256, 256, 1, 1] + - [187, 0.14] + - - [32, 1024, 1, 1280, 32, 32, 1280, 1280] + - [188, 2542.0] + - - [1, 256, 1, 3328, 1, 1, 3328, 3328] + - [155, 42.0] + - - [256, 32, 1, 256, 256, 256, 256, 256] + - [128, 699.0] + - - [256, 128, 1, 1280, 256, 256, 1280, 1280] + - [188, 2605.0] + - - [256, 64, 1, 256, 256, 256, 256, 256] + - [169, 1172.0] + - - [1, 1, 1, 1, 1, 1, 1, 1] + - [149, 0.0005882352863190291] + - - [32, 1024, 1, 32, 32, 32, 32, 32] + - [183, 350.0] + - - [128, 256, 1, 256, 128, 128, 256, 256] + - [140, 1366.0] + - - [704, 64, 1, 256, 704, 704, 256, 256] + - [170, 1711.0] + - - [704, 1, 1, 1, 704, 704, 1, 1] + - [184, 0.18] + - - [128, 448, 1, 1280, 128, 128, 1280, 1280] + - [126, 3424.0] + - - [448, 32, 1, 32, 448, 448, 32, 32] + - [176, 270.0] + - - [704, 64, 1, 1, 704, 704, 1, 1] + - [126, 29.0] + - - [704, 32, 1, 256, 704, 704, 256, 256] + - [142, 1567.0] + - - [32, 704, 1, 32, 32, 32, 32, 32] + - [130, 419.0] + - - [128, 64, 1, 256, 128, 128, 256, 256] + - [155, 377.0] + - - [448, 32, 1, 3328, 448, 448, 3328, 3328] + - [128, 1986.0] + - - [64, 704, 1, 32, 64, 64, 32, 32] + - [124, 355.0] + - - [64, 704, 1, 3328, 64, 64, 3328, 3328] + - [181, 3547.0] + - - [448, 64, 1, 1280, 448, 448, 1280, 1280] + - [167, 2473.0] + - - [128, 448, 1, 32, 128, 128, 32, 32] + - [122, 1019.0] + - - [64, 256, 1, 256, 64, 64, 256, 256] + - [138, 783.0] + - - [64, 704, 1, 1, 64, 64, 1, 1] + - [133, 23.0] + - - [1, 1024, 1, 1, 1, 1, 1, 1] + - [124, 0.25] + - - [256, 1, 1, 3328, 256, 256, 3328, 3328] + - [132, 39.0] + - - [32, 64, 1, 32, 32, 32, 32, 32] + - [148, 35.0] + - - [256, 256, 1, 1, 256, 256, 1, 1] + - [124, 17.0] + - - [32, 256, 1, 32, 32, 32, 32, 32] + - [176, 67.0] + - - [128, 1, 1, 256, 128, 128, 256, 256] + - [128, 6.0] + - - [32, 64, 1, 3328, 32, 32, 3328, 3328] + - [146, 303.0] + - - [1, 128, 1, 3328, 1, 1, 3328, 3328] + - [128, 19.0] + - - [32, 256, 1, 256, 32, 32, 256, 256] + - [142, 413.0] + - - [1, 448, 1, 1, 1, 1, 1, 1] + - [192, 0.12] + - - [1, 704, 1, 3328, 1, 1, 3328, 3328] + - [180, 88.0] + - - [64, 1, 1, 3328, 64, 64, 3328, 3328] + - [132, 10.0] + - - [448, 64, 1, 3328, 448, 448, 3328, 3328] + - [157, 2756.0] + - - [256, 32, 1, 1280, 256, 256, 1280, 1280] + - [128, 921.0] + - - [128, 448, 1, 3328, 128, 128, 3328, 3328] + - [153, 3903.0] + - - [64, 1024, 1, 256, 64, 64, 256, 256] + - [189, 2162.0] + - - [64, 32, 1, 32, 64, 64, 32, 32] + - [173, 34.0] + - - [1, 448, 1, 3328, 1, 1, 3328, 3328] + - [186, 64.0] + - - [1024, 64, 1, 256, 1024, 1024, 256, 256] + - [137, 2076.0] + - - [64, 704, 1, 1280, 64, 64, 1280, 1280] + - [188, 3061.0] + - - [64, 32, 1, 3328, 64, 64, 3328, 3328] + - [167, 305.0] + - - [64, 448, 1, 1, 64, 64, 1, 1] + - [124, 8.0] + - - [128, 128, 1, 1280, 128, 128, 1280, 1280] + - [167, 1789.0] + - - [64, 128, 1, 256, 64, 64, 256, 256] + - [163, 396.0] + - - [64, 448, 1, 32, 64, 64, 32, 32] + - [188, 240.0] + - - [128, 64, 1, 3328, 128, 128, 3328, 3328] + - [194, 1225.0] + - - [32, 64, 1, 1280, 32, 32, 1280, 1280] + - [155, 240.0] + - - [448, 32, 1, 256, 448, 448, 256, 256] + - [128, 670.0] + - - [1024, 32, 1, 256, 1024, 1024, 256, 256] + - [178, 1340.0] + - - [1, 128, 1, 256, 1, 1, 256, 256] + - [122, 6.0] + - - [32, 256, 1, 1280, 32, 32, 1280, 1280] + - [194, 931.0] + - - [32, 128, 1, 3328, 32, 32, 3328, 3328] + - [146, 613.0] + - - [32, 128, 1, 32, 32, 32, 32, 32] + - [165, 74.0] + - - [1, 128, 1, 1, 1, 1, 1, 1] + - [126, 0.04] + - - [128, 64, 1, 1, 128, 128, 1, 1] + - [122, 2.0] + - - [32, 448, 1, 256, 32, 32, 256, 256] + - [158, 1037.0] + - - [1, 704, 1, 256, 1, 1, 256, 256] + - [149, 51.0] + - - [32, 256, 1, 3328, 32, 32, 3328, 3328] + - [167, 1343.0] + - - [256, 32, 1, 32, 256, 256, 32, 32] + - [151, 154.0] + - - [64, 256, 1, 3328, 64, 64, 3328, 3328] + - [180, 2199.0] + - - [1, 704, 1, 1, 1, 1, 1, 1] + - [184, 0.43] + - - [128, 448, 1, 1, 128, 128, 1, 1] + - [165, 36.0] + - - [64, 128, 1, 32, 64, 64, 32, 32] + - [174, 158.0] + - - [704, 1, 1, 1280, 704, 704, 1280, 1280] + - [128, 84.0] + - - [1024, 1, 1, 1, 1024, 1024, 1, 1] + - [130, 1.0] + - - [256, 128, 1, 32, 256, 256, 32, 32] + - [187, 558.0] + - - [448, 128, 1, 1, 448, 448, 1, 1] + - [139, 15.0] + - - [704, 32, 1, 32, 704, 704, 32, 32] + - [163, 179.0] + - - [128, 32, 1, 256, 128, 128, 256, 256] + - [124, 193.0] + - - [64, 32, 1, 1280, 64, 64, 1280, 1280] + - [167, 236.0] + - - [448, 128, 1, 32, 448, 448, 32, 32] + - [149, 448.0] + - - [128, 448, 1, 256, 128, 128, 256, 256] + - [140, 2073.0] + - - [32, 32, 1, 256, 32, 32, 256, 256] + - [176, 77.0] + - - [256, 64, 1, 32, 256, 256, 32, 32] + - [153, 305.0] + - - [1, 1024, 1, 1280, 1, 1, 1280, 1280] + - [128, 124.0] + - - [32, 32, 1, 3328, 32, 32, 3328, 3328] + - [146, 169.0] + - - [1, 256, 1, 1280, 1, 1, 1280, 1280] + - [142, 29.0] + - - [1, 128, 1, 1280, 1, 1, 1280, 1280] + - [128, 14.0] + - - [1, 64, 1, 256, 1, 1, 256, 256] + - [122, 3.0] + - - [256, 64, 1, 1280, 256, 256, 1280, 1280] + - [142, 1796.0] + - - [32, 704, 1, 256, 32, 32, 256, 256] + - [155, 942.0] + - - [1, 64, 1, 1, 1, 1, 1, 1] + - [126, 0.02] + - - [704, 64, 1, 1280, 704, 704, 1280, 1280] + - [195, 2892.0] + - - [1, 704, 1, 1280, 1, 1, 1280, 1280] + - [180, 72.0] + - - [128, 128, 1, 32, 128, 128, 32, 32] + - [124, 129.0] + - - [1024, 64, 1, 1, 1024, 1024, 1, 1] + - [195, 34.0] + - - [704, 1, 1, 256, 704, 704, 256, 256] + - [142, 31.0] + - - [128, 64, 1, 1280, 128, 128, 1280, 1280] + - [142, 950.0] + - - [64, 64, 1, 32, 64, 64, 32, 32] + - [122, 33.0] + - - [1, 1, 1, 1280, 1, 1, 1280, 1280] + - [128, 0.11] + - - [64, 704, 1, 256, 64, 64, 256, 256] + - [161, 1686.0] + - - [1, 448, 1, 1280, 1, 1, 1280, 1280] + - [128, 49.0] + - - [64, 256, 1, 32, 64, 64, 32, 32] + - [130, 132.0] + - - [32, 128, 1, 1280, 32, 32, 1280, 1280] + - [186, 574.0] + - - [128, 128, 1, 3328, 128, 128, 3328, 3328] + - [155, 2185.0] + - - [64, 448, 1, 3328, 64, 64, 3328, 3328] + - [155, 2943.0] + - - [32, 64, 1, 256, 32, 32, 256, 256] + - [122, 157.0] + - - [128, 256, 1, 32, 128, 128, 32, 32] + - [138, 257.0] + - - [64, 256, 1, 1280, 64, 64, 1280, 1280] + - [167, 1713.0] + - - [64, 64, 1, 256, 64, 64, 256, 256] + - [194, 193.0] + - - [448, 64, 1, 32, 448, 448, 32, 32] + - [146, 229.0] + - - [64, 128, 1, 1280, 64, 64, 1280, 1280] + - [167, 946.0] + - - [1, 1024, 1, 256, 1, 1, 256, 256] + - [128, 43.0] + - - [128, 1, 1, 3328, 128, 128, 3328, 3328] + - [128, 19.0] + - - [128, 128, 1, 1, 128, 128, 1, 1] + - [122, 4.0] + - - [32, 128, 1, 256, 32, 32, 256, 256] + - [142, 195.0] + - - [1, 64, 1, 1280, 1, 1, 1280, 1280] + - [128, 7.0] + - - [448, 128, 1, 1280, 448, 448, 1280, 1280] + - [140, 3411.0] + - - [256, 64, 1, 1, 256, 256, 1, 1] + - [130, 10.0] + - - [256, 256, 1, 1280, 256, 256, 1280, 1280] + - [195, 4018.0] + - - [704, 1, 1, 3328, 704, 704, 3328, 3328] + - [186, 95.0] + - - [128, 32, 1, 3328, 128, 128, 3328, 3328] + - [180, 673.0] + - - [32, 32, 1, 1280, 32, 32, 1280, 1280] + - [194, 150.0] + - - [1, 1, 1, 256, 1, 1, 256, 256] + - [194, 0.09] + - - [1, 448, 1, 256, 1, 1, 256, 256] + - [128, 34.0] + - - [256, 64, 1, 3328, 256, 256, 3328, 3328] + - [142, 2322.0] + - - [64, 32, 1, 256, 64, 64, 256, 256] + - [128, 168.0] + - - [128, 1, 1, 1280, 128, 128, 1280, 1280] + - [186, 15.0] + - - [512, 128, 1, 784, 512, 512, 784, 784] + - [123, 3524.0] + - - [256, 64, 1, 3136, 256, 256, 3136, 3136] + - [128, 2130.0] + - - [64, 256, 1, 3136, 64, 64, 3136, 3136] + - [194, 2190.0] + - - [128, 512, 1, 784, 128, 128, 784, 784] + - [150, 3430.0] + - - [64, 64, 1, 3136, 64, 64, 3136, 3136] + - [180, 602.0] + - - [14, 14, 1, 64, 14, 14, 64, 64] + - [122, 3.0] + - - [15, 14, 1, 64, 15, 15, 64, 64] + - [122, 3.0] + - - [15, 15, 1, 64, 15, 15, 64, 64] + - [122, 3.0] + - - [17, 15, 1, 64, 17, 17, 64, 64] + - [137, 5.0] + - - [17, 17, 1, 64, 17, 17, 64, 64] + - [172, 7.0] + - - [21, 17, 1, 64, 21, 21, 64, 64] + - [196, 9.0] + - - [21, 21, 1, 64, 21, 21, 64, 64] + - [185, 7.0] + - - [24, 24, 1, 64, 24, 24, 64, 64] + - [184, 15.0] + - - [30, 30, 1, 64, 30, 30, 64, 64] + - [151, 13.0] + - - [30, 31, 1, 64, 30, 30, 64, 64] + - [122, 13.0] + - - [31, 31, 1, 64, 31, 31, 64, 64] + - [170, 16.0] + - - [32, 32, 1, 64, 32, 32, 64, 64] + - [140, 16.0] + - - [32, 35, 1, 64, 32, 32, 64, 64] + - [136, 17.0] + - - [34, 24, 1, 64, 34, 34, 64, 64] + - [122, 25.0] + - - [34, 34, 1, 64, 34, 34, 64, 64] + - [149, 17.0] + - - [35, 35, 1, 64, 35, 35, 64, 64] + - [122, 17.0] + - - [27, 27, 1, 64, 27, 27, 64, 64] + - [174, 11.0] + - - [27, 33, 1, 64, 27, 27, 64, 64] + - [138, 13.0] + - - [33, 33, 1, 64, 33, 33, 64, 64] + - [136, 16.0] + - - [2, 4, 1, 1024, 2, 2, 1024, 1024] + - [122, 1.0] + - - [2, 32, 1, 1024, 2, 2, 1024, 1024] + - [180, 7.0] + - - [64, 512, 1, 512, 64, 64, 512, 512] + - [161, 1885.0] + - - [1024, 4, 1, 1024, 1024, 1024, 1024, 1024] + - [142, 365.0] + - - [1024, 32, 1, 1024, 1024, 1024, 1024, 1024] + - [142, 2482.0] + - - [3, 3, 512, 64, 3, 3, 64, 64] + - [122, 52.0] + - - [5, 5, 512, 64, 5, 5, 64, 64] + - [122, 143.0] + - - [5, 5, 960, 64, 5, 5, 64, 64] + - [161, 221.0] + - - [9, 9, 512, 64, 9, 9, 64, 64] + - [142, 417.0] + - - [27, 27, 32768, 128, 27, 27, 128, 128] + - [181, 3493.0] + - - [64, 512, 1, 1024, 64, 64, 1024, 1024] + - [161, 2954.0] + - - [64, 960, 1, 1024, 64, 64, 1024, 1024] + - [140, 3860.0] + - - [14, 14, 10880, 64, 14, 14, 64, 64] + - [128, 2821.0] + - - [15, 14, 10880, 64, 15, 15, 64, 64] + - [155, 2973.0] + - - [15, 15, 7680, 64, 15, 15, 64, 64] + - [128, 2783.0] + - - [15, 15, 10880, 64, 15, 15, 64, 64] + - [194, 2858.0] + - - [17, 15, 7680, 64, 17, 17, 64, 64] + - [151, 2054.0] + - - [17, 17, 7680, 64, 17, 17, 64, 64] + - [137, 1889.0] + - - [21, 17, 6144, 64, 21, 21, 64, 64] + - [175, 2082.0] + - - [21, 21, 6144, 64, 21, 21, 64, 64] + - [195, 2456.0] + - - [24, 24, 4736, 64, 24, 24, 64, 64] + - [195, 2750.0] + - - [30, 30, 2048, 64, 30, 30, 64, 64] + - [194, 3017.0] + - - [30, 31, 2048, 64, 30, 30, 64, 64] + - [178, 3390.0] + - - [31, 31, 2048, 64, 31, 31, 64, 64] + - [153, 3322.0] + - - [34, 24, 4736, 64, 34, 34, 64, 64] + - [181, 2520.0] + - - [27, 27, 1920, 64, 27, 27, 64, 64] + - [181, 2765.0] + - - [27, 33, 1920, 64, 27, 27, 64, 64] + - [150, 2583.0] + - - [2, 8, 1, 1024, 2, 2, 1024, 1024] + - [128, 2.0] + - - [1024, 77, 1, 1024, 1024, 1024, 1024, 1024] + - [189, 3507.0] + - - [2, 10, 1, 1024, 2, 2, 1024, 1024] + - [122, 2.0] + - - [1024, 10, 1, 1024, 1024, 1024, 1024, 1024] + - [167, 909.0] + - - [2, 39, 1, 1024, 2, 2, 1024, 1024] + - [128, 8.0] + - - [1024, 39, 1, 1024, 1024, 1024, 1024, 1024] + - [161, 2383.0] + - - [2, 40, 1, 1024, 2, 2, 1024, 1024] + - [128, 8.0] + - - [1024, 40, 1, 1024, 1024, 1024, 1024, 1024] + - [163, 2467.0] + - - [2, 41, 1, 1024, 2, 2, 1024, 1024] + - [122, 8.0] + - - [1024, 41, 1, 1024, 1024, 1024, 1024, 1024] + - [190, 2505.0] + - - [2, 5, 1, 1024, 2, 2, 1024, 1024] + - [122, 1.0] + - - [1024, 5, 1, 1024, 1024, 1024, 1024, 1024] + - [167, 460.0] + - - [2, 6, 1, 1024, 2, 2, 1024, 1024] + - [122, 1.0] + - - [1024, 6, 1, 1024, 1024, 1024, 1024, 1024] + - [194, 546.0] + - - [1024, 8, 1, 1024, 1024, 1024, 1024, 1024] + - [122, 703.0] + - - [2, 9, 1, 1024, 2, 2, 1024, 1024] + - [122, 2.0] + - - [1024, 9, 1, 1024, 1024, 1024, 1024, 1024] + - [194, 785.0] + - - [4, 4, 32768, 64, 4, 4, 64, 64] + - [188, 317.0] + - - [4, 4, 38400, 64, 4, 4, 64, 64] + - [161, 318.0] + - - [17, 17, 6144, 64, 17, 17, 64, 64] + - [175, 1806.0] + - - [128, 128, 1, 64, 128, 128, 64, 64] + - [136, 265.0] + - - [64, 128, 1, 128, 64, 64, 128, 128] + - [174, 232.0] + - - [2, 1024, 1, 1024, 2, 2, 1024, 1024] + - [142, 193.0] + - - [5, 5, 1, 64, 5, 5, 64, 64] + - [161, 0.37] + - - [33, 33, 1, 32, 33, 33, 32, 32] + - [122, 8.0] + - - [1024, 16, 1, 1024, 1024, 1024, 1024, 1024] + - [194, 1459.0] + - - [2, 4, 1, 2560, 2, 2, 2560, 2560] + - [122, 1.0] + - - [2, 16, 1, 1024, 2, 2, 1024, 1024] + - [122, 3.0] + - - [2, 2, 1, 2048, 2, 2, 2048, 2048] + - [128, 1.0] + - - [1024, 1, 1, 1024, 1024, 1024, 1024, 1024] + - [167, 93.0] + - - [512, 1, 1, 2048, 512, 512, 2048, 2048] + - [142, 66.0] + - - [200, 1, 1, 1024, 200, 200, 1024, 1024] + - [142, 21.0] + - - [960, 1, 1, 2048, 960, 960, 2048, 2048] + - [194, 103.0] + - - [1024, 64, 1, 1024, 1024, 1024, 1024, 1024] + - [141, 3303.0] + - - [864, 1, 1, 256, 864, 864, 256, 256] + - [167, 37.0] + - - [1024, 80, 1, 1024, 1024, 1024, 1024, 1024] + - [162, 3594.0] + - - [1024, 82, 1, 1024, 1024, 1024, 1024, 1024] + - [195, 3650.0] + - - [1024, 12, 1, 1024, 1024, 1024, 1024, 1024] + - [142, 1098.0] + - - [2, 64, 1, 1024, 2, 2, 1024, 1024] + - [128, 13.0] + - - [2, 80, 1, 1024, 2, 2, 1024, 1024] + - [128, 16.0] + - - [2, 82, 1, 1024, 2, 2, 1024, 1024] + - [167, 17.0] + - - [2, 12, 1, 1024, 2, 2, 1024, 1024] + - [194, 3.0] + - - [2, 1, 1, 1024, 2, 2, 1024, 1024] + - [128, 0.2] + - - [24, 24, 6816, 64, 24, 24, 64, 64] + - [156, 3020.0] + - - [256, 128, 1, 3136, 256, 256, 3136, 3136] + - [174, 3553.0] + - - [576, 64, 1, 3136, 576, 576, 3136, 3136] + - [122, 3400.0] + - - [768, 16, 1, 768, 768, 768, 768, 768] + - [180, 1004.0] + - - [768, 12, 1, 768, 768, 768, 768, 768] + - [180, 743.0] + - - [768, 4, 1, 768, 768, 768, 768, 768] + - [128, 245.0] + - - [64, 1024, 1, 1024, 64, 64, 1024, 1024] + - [189, 3577.0] + - - [26, 26, 6272, 64, 26, 26, 64, 64] + - [195, 3113.0] + - - [2, 128, 1, 1024, 2, 2, 1024, 1024] + - [128, 27.0] + - - [2, 96, 1, 1024, 2, 2, 1024, 1024] + - [128, 20.0] + - - [256, 80, 1, 784, 256, 256, 784, 784] + - [194, 1538.0] + - - [256, 12, 1, 3800, 256, 256, 3800, 3800] + - [171, 453.0] + - - [256, 3, 1, 3800, 256, 256, 3800, 3800] + - [146, 115.0] + - - [256, 12, 1, 950, 256, 256, 950, 950] + - [134, 308.0] + - - [256, 3, 1, 950, 256, 256, 950, 950] + - [160, 77.0] + - - [256, 12, 1, 3220, 256, 256, 3220, 3220] + - [171, 443.0] + - - [256, 3, 1, 3220, 256, 256, 3220, 3220] + - [146, 123.0] + - - [256, 12, 1, 3072, 256, 256, 3072, 3072] + - [197, 458.0] + - - [256, 3, 1, 3072, 256, 256, 3072, 3072] + - [132, 112.0] + - - [256, 12, 1, 850, 256, 256, 850, 850] + - [186, 273.0] + - - [256, 3, 1, 850, 256, 256, 850, 850] + - [159, 68.0] + - - [256, 12, 1, 2852, 256, 256, 2852, 2852] + - [171, 432.0] + - - [256, 3, 1, 2852, 256, 256, 2852, 2852] + - [146, 109.0] + - - [256, 12, 1, 805, 256, 256, 805, 805] + - [132, 263.0] + - - [256, 3, 1, 805, 256, 256, 805, 805] + - [159, 67.0] + - - [256, 3, 1, 864, 256, 256, 864, 864] + - [132, 71.0] + - - [256, 3, 1, 768, 256, 256, 768, 768] + - [142, 68.0] + - - [256, 12, 1, 864, 256, 256, 864, 864] + - [171, 282.0] + - - [256, 12, 1, 768, 256, 256, 768, 768] + - [167, 271.0] + - - [256, 12, 1, 2904, 256, 256, 2904, 2904] + - [197, 426.0] + - - [256, 3, 1, 2904, 256, 256, 2904, 2904] + - [132, 107.0] + - - [256, 3, 1, 713, 256, 256, 713, 713] + - [132, 61.0] + - - [256, 12, 1, 888, 256, 256, 888, 888] + - [197, 274.0] + - - [256, 3, 1, 888, 256, 256, 888, 888] + - [186, 70.0] + - - [256, 12, 1, 713, 256, 256, 713, 713] + - [186, 247.0] + - - [256, 3, 1, 660, 256, 256, 660, 660] + - [132, 59.0] + - - [256, 3, 1, 672, 256, 256, 672, 672] + - [196, 75.0] + - - [256, 12, 1, 660, 256, 256, 660, 660] + - [197, 240.0] + - - [256, 3, 1, 726, 256, 256, 726, 726] + - [132, 62.0] + - - [256, 12, 1, 672, 256, 256, 672, 672] + - [146, 249.0] + - - [256, 3, 1, 247, 256, 256, 247, 247] + - [194, 31.0] + - - [256, 12, 1, 726, 256, 256, 726, 726] + - [186, 246.0] + - - [256, 3, 1, 216, 256, 256, 216, 216] + - [128, 28.0] + - - [256, 3, 1, 3400, 256, 256, 3400, 3400] + - [197, 113.0] + - - [256, 3, 1, 221, 256, 256, 221, 221] + - [132, 29.0] + - - [256, 12, 1, 3552, 256, 256, 3552, 3552] + - [197, 451.0] + - - [256, 3, 1, 3456, 256, 256, 3456, 3456] + - [197, 116.0] + - - [256, 3, 1, 204, 256, 256, 204, 204] + - [194, 28.0] + - - [256, 12, 1, 3400, 256, 256, 3400, 3400] + - [146, 448.0] + - - [256, 12, 1, 3456, 256, 256, 3456, 3456] + - [159, 459.0] + - - [256, 12, 1, 221, 256, 256, 221, 221] + - [180, 114.0] + - - [256, 3, 1, 3552, 256, 256, 3552, 3552] + - [146, 115.0] + - - [256, 3, 1, 228, 256, 256, 228, 228] + - [186, 31.0] + - - [256, 3, 1, 234, 256, 256, 234, 234] + - [142, 30.0] + - - [256, 12, 1, 234, 256, 256, 234, 234] + - [122, 174.0] + - - [256, 12, 1, 228, 256, 256, 228, 228] + - [128, 118.0] + - - [256, 3, 1, 252, 256, 256, 252, 252] + - [130, 32.0] + - - [256, 12, 1, 252, 256, 256, 252, 252] + - [184, 127.0] + - - [256, 12, 1, 247, 256, 256, 247, 247] + - [155, 124.0] + - - [128, 256, 1, 1444, 128, 128, 1444, 1444] + - [155, 2701.0] + - - [256, 128, 1, 25, 256, 256, 25, 25] + - [124, 194.0] + - - [256, 128, 1, 9, 256, 256, 9, 9] + - [153, 72.0] + - - [256, 256, 1, 1444, 256, 256, 1444, 1444] + - [174, 3500.0] + - - [512, 128, 1, 100, 512, 512, 100, 100] + - [136, 1187.0] + - - [64, 128, 1, 1444, 64, 64, 1444, 1444] + - [159, 823.0] + - - [81, 1024, 1, 1024, 81, 81, 1024, 1024] + - [162, 3208.0] + - - [81, 1000, 1, 1024, 81, 81, 1024, 1024] + - [189, 3125.0] + - - [1024, 20, 1, 1024, 1024, 1024, 1024, 1024] + - [140, 1408.0] + - - [2, 8, 1, 2048, 2, 2, 2048, 2048] + - [122, 2.0] + - - [2, 20, 1, 1024, 2, 2, 1024, 1024] + - [122, 4.0] + - - [2, 2, 1, 2560, 2, 2, 2560, 2560] + - [128, 1.0] +- null +- null +- DeviceEfficiency +... diff --git a/library/src/blas3/Tensile/Logic/asm_full/navi22_Cijk_Alik_Bljk_HBH_GB.yaml b/library/src/blas3/Tensile/Logic/asm_full/navi22_Cijk_Alik_Bljk_HBH_GB.yaml new file mode 100644 index 000000000..2f6951619 --- /dev/null +++ b/library/src/blas3/Tensile/Logic/asm_full/navi22_Cijk_Alik_Bljk_HBH_GB.yaml @@ -0,0 +1,57466 @@ +--- +- {MinimumRequiredVersion: 4.28.0} +- navi22 +- gfx1031 +- [Device 73df] +- AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] +- - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 128 + LSPB: 64 + LVCA: 1 + LVCB: 2 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 0 + SolutionNameMin: Cijk_Alik_Bljk_HBH_GB_MT128x64x8_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 128 + LSPB: 128 + LVCA: 1 + LVCB: 1 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 1 + SolutionNameMin: Cijk_Alik_Bljk_HBH_GB_MT128x128x8_SN_SU0_SUM0_TT8_16_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 128 + LSPB: 128 + LVCA: 2 + LVCB: 2 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 2 + SolutionNameMin: Cijk_Alik_Bljk_HBH_GB_MT128x128x8_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 128 + LSPB: 256 + LVCA: 2 + LVCB: 1 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 256 + MacroTileA: 128 + MacroTileB: 256 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 3 + SolutionNameMin: Cijk_Alik_Bljk_HBH_GB_MT128x256x8_SN_SU0_SUM0_TT8_16_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 2 + LVCB: 2 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 4 + SolutionNameMin: Cijk_Alik_Bljk_HBH_GB_MT128x64x16_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 2 + LVCB: 2 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 5 + SolutionNameMin: Cijk_Alik_Bljk_HBH_GB_MT128x128x16_SN_SU0_SUM0_TT8_16_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 128 + LSPB: 128 + LVCA: 2 + LVCB: 2 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 6 + SolutionNameMin: Cijk_Alik_Bljk_HBH_GB_MT128x128x16_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 128 + LSPB: 128 + LVCA: 2 + LVCB: 2 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 256 + MacroTileA: 128 + MacroTileB: 256 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 7 + SolutionNameMin: Cijk_Alik_Bljk_HBH_GB_MT128x256x16_SN_SU0_SUM0_TT8_16_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 128 + LSPB: 64 + LVCA: 1 + LVCB: 2 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 8 + SolutionNameMin: Cijk_Alik_Bljk_HBH_GB_MT128x64x8_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 128 + LSPB: 128 + LVCA: 1 + LVCB: 1 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 9 + SolutionNameMin: Cijk_Alik_Bljk_HBH_GB_MT128x128x8_SN_SU32_SUM3_TT8_16_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 128 + LSPB: 128 + LVCA: 2 + LVCB: 2 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 10 + SolutionNameMin: Cijk_Alik_Bljk_HBH_GB_MT128x128x8_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 2 + LVCB: 2 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 11 + SolutionNameMin: Cijk_Alik_Bljk_HBH_GB_MT128x64x16_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 2 + LVCB: 2 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 12 + SolutionNameMin: Cijk_Alik_Bljk_HBH_GB_MT128x128x16_SN_SU32_SUM3_TT8_16_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 128 + LSPB: 128 + LVCA: 2 + LVCB: 2 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 13 + SolutionNameMin: Cijk_Alik_Bljk_HBH_GB_MT128x128x16_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 128 + LSPB: 128 + LVCA: 2 + LVCB: 2 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 256 + MacroTileA: 128 + MacroTileB: 256 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 14 + SolutionNameMin: Cijk_Alik_Bljk_HBH_GB_MT128x256x16_SN_SU32_SUM3_TT8_16_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 15 + SolutionNameMin: Cijk_Alik_Bljk_HBH_GB_MT128x128x32_SN_SU32_SUM3_TT8_16_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 28672 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 8192 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 20480 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 256 + MacroTileA: 128 + MacroTileB: 256 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 16 + SolutionNameMin: Cijk_Alik_Bljk_HBH_GB_MT128x256x32_SN_SU32_SUM3_TT8_16_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 128 + LSPB: 64 + LVCA: 1 + LVCB: 2 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 17 + SolutionNameMin: Cijk_Alik_Bljk_HBH_GB_MT128x64x8_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 128 + LSPB: 128 + LVCA: 1 + LVCB: 1 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 18 + SolutionNameMin: Cijk_Alik_Bljk_HBH_GB_MT128x128x8_SN_SU0_SUM0_TT8_16_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 128 + LSPB: 128 + LVCA: 2 + LVCB: 2 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 19 + SolutionNameMin: Cijk_Alik_Bljk_HBH_GB_MT128x128x8_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 128 + LSPB: 256 + LVCA: 2 + LVCB: 1 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 256 + MacroTileA: 128 + MacroTileB: 256 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 20 + SolutionNameMin: Cijk_Alik_Bljk_HBH_GB_MT128x256x8_SN_SU0_SUM0_TT8_16_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 2 + LVCB: 2 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 21 + SolutionNameMin: Cijk_Alik_Bljk_HBH_GB_MT128x64x16_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 2 + LVCB: 2 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 22 + SolutionNameMin: Cijk_Alik_Bljk_HBH_GB_MT128x128x16_SN_SU0_SUM0_TT8_16_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 128 + LSPB: 128 + LVCA: 2 + LVCB: 2 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 23 + SolutionNameMin: Cijk_Alik_Bljk_HBH_GB_MT128x128x16_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 128 + LSPB: 128 + LVCA: 2 + LVCB: 2 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 256 + MacroTileA: 128 + MacroTileB: 256 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 24 + SolutionNameMin: Cijk_Alik_Bljk_HBH_GB_MT128x256x16_SN_SU0_SUM0_TT8_16_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 25 + SolutionNameMin: Cijk_Alik_Bljk_HBH_GB_MT128x128x32_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 128 + LSPB: 64 + LVCA: 1 + LVCB: 2 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 26 + SolutionNameMin: Cijk_Alik_Bljk_HBH_GB_MT128x64x8_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 128 + LSPB: 128 + LVCA: 2 + LVCB: 2 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 27 + SolutionNameMin: Cijk_Alik_Bljk_HBH_GB_MT128x128x8_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 128 + LSPB: 256 + LVCA: 2 + LVCB: 1 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 256 + MacroTileA: 128 + MacroTileB: 256 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 28 + SolutionNameMin: Cijk_Alik_Bljk_HBH_GB_MT128x256x8_SN_SU32_SUM3_TT8_16_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 2 + LVCB: 2 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 29 + SolutionNameMin: Cijk_Alik_Bljk_HBH_GB_MT128x64x16_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 2 + LVCB: 2 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 30 + SolutionNameMin: Cijk_Alik_Bljk_HBH_GB_MT128x128x16_SN_SU32_SUM3_TT8_16_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 128 + LSPB: 128 + LVCA: 2 + LVCB: 2 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 31 + SolutionNameMin: Cijk_Alik_Bljk_HBH_GB_MT128x128x16_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 128 + LSPB: 128 + LVCA: 2 + LVCB: 2 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 256 + MacroTileA: 128 + MacroTileB: 256 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 32 + SolutionNameMin: Cijk_Alik_Bljk_HBH_GB_MT128x256x16_SN_SU32_SUM3_TT8_16_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 28672 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 8192 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 20480 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 256 + MacroTileA: 128 + MacroTileB: 256 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 33 + SolutionNameMin: Cijk_Alik_Bljk_HBH_GB_MT128x256x32_SN_SU32_SUM3_TT8_16_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 128 + LSPB: 64 + LVCA: 1 + LVCB: 2 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 34 + SolutionNameMin: Cijk_Alik_Bljk_HBH_GB_MT128x64x8_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 128 + LSPB: 128 + LVCA: 1 + LVCB: 1 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 35 + SolutionNameMin: Cijk_Alik_Bljk_HBH_GB_MT128x128x8_SN_SU0_SUM0_TT8_16_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 128 + LSPB: 128 + LVCA: 2 + LVCB: 2 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 36 + SolutionNameMin: Cijk_Alik_Bljk_HBH_GB_MT128x128x8_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 2 + LVCB: 2 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 37 + SolutionNameMin: Cijk_Alik_Bljk_HBH_GB_MT128x64x16_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 2 + LVCB: 2 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 38 + SolutionNameMin: Cijk_Alik_Bljk_HBH_GB_MT128x128x16_SN_SU0_SUM0_TT8_16_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 128 + LSPB: 128 + LVCA: 2 + LVCB: 2 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 39 + SolutionNameMin: Cijk_Alik_Bljk_HBH_GB_MT128x128x16_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 128 + LSPB: 128 + LVCA: 2 + LVCB: 2 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 256 + MacroTileA: 128 + MacroTileB: 256 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 40 + SolutionNameMin: Cijk_Alik_Bljk_HBH_GB_MT128x256x16_SN_SU0_SUM0_TT8_16_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 128 + LSPB: 64 + LVCA: 1 + LVCB: 2 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 41 + SolutionNameMin: Cijk_Alik_Bljk_HBH_GB_MT128x64x8_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 128 + LSPB: 128 + LVCA: 1 + LVCB: 1 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 42 + SolutionNameMin: Cijk_Alik_Bljk_HBH_GB_MT128x128x8_SN_SU32_SUM3_TT8_16_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 128 + LSPB: 128 + LVCA: 2 + LVCB: 2 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 43 + SolutionNameMin: Cijk_Alik_Bljk_HBH_GB_MT128x128x8_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 128 + LSPB: 256 + LVCA: 2 + LVCB: 1 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 256 + MacroTileA: 128 + MacroTileB: 256 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 44 + SolutionNameMin: Cijk_Alik_Bljk_HBH_GB_MT128x256x8_SN_SU32_SUM3_TT8_16_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 2 + LVCB: 2 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 45 + SolutionNameMin: Cijk_Alik_Bljk_HBH_GB_MT128x64x16_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 2 + LVCB: 2 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 46 + SolutionNameMin: Cijk_Alik_Bljk_HBH_GB_MT128x128x16_SN_SU32_SUM3_TT8_16_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 128 + LSPB: 128 + LVCA: 2 + LVCB: 2 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 47 + SolutionNameMin: Cijk_Alik_Bljk_HBH_GB_MT128x128x16_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 128 + LSPB: 128 + LVCA: 2 + LVCB: 2 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 256 + MacroTileA: 128 + MacroTileB: 256 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 48 + SolutionNameMin: Cijk_Alik_Bljk_HBH_GB_MT128x256x16_SN_SU32_SUM3_TT8_16_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 49 + SolutionNameMin: Cijk_Alik_Bljk_HBH_GB_MT128x128x32_SN_SU32_SUM3_TT8_16_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 50 + SolutionNameMin: Cijk_Alik_Bljk_HBH_GB_MT128x128x32_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 28672 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 8192 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 20480 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 256 + MacroTileA: 128 + MacroTileB: 256 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 51 + SolutionNameMin: Cijk_Alik_Bljk_HBH_GB_MT128x256x32_SN_SU32_SUM3_TT8_16_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 128 + LSPB: 128 + LVCA: 2 + LVCB: 2 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 256 + MacroTileA: 128 + MacroTileB: 256 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 52 + SolutionNameMin: Cijk_Alik_Bljk_HBH_GB_MT128x256x16_SN_SU0_SUM0_TT8_16_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 16 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 128 + LSPB: 128 + LVCA: 2 + LVCB: 2 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 256 + MacroTileA: 128 + MacroTileB: 256 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 53 + SolutionNameMin: Cijk_Alik_Bljk_HBH_GB_MT128x256x16_SN_SU32_SUM3_TT8_16_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 16 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 54 + SolutionNameMin: Cijk_Alik_Bljk_HBH_GB_MT128x128x32_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 16 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 2 + LVCB: 2 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 55 + SolutionNameMin: Cijk_Alik_Bljk_HBH_GB_MT128x128x16_SN_SU0_SUM0_TT8_16_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 16 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 56 + SolutionNameMin: Cijk_Alik_Bljk_HBH_GB_MT128x128x32_SN_SU32_SUM3_TT8_16_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 16 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 57 + SolutionNameMin: Cijk_Alik_Bljk_HBH_GB_MT128x128x32_SN_SU32_SUM3_TT8_16_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 16 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 1 + LVCB: 1 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 58 + SolutionNameMin: Cijk_Alik_Bljk_HBH_GB_MT64x64x8_SN_SU0_SUM0_TT8_8_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 128 + LSPB: 64 + LVCA: 1 + LVCB: 2 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 59 + SolutionNameMin: Cijk_Alik_Bljk_HBH_GB_MT128x64x8_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 128 + LSPB: 128 + LVCA: 2 + LVCB: 2 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 60 + SolutionNameMin: Cijk_Alik_Bljk_HBH_GB_MT128x128x8_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 2 + LVCB: 2 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 61 + SolutionNameMin: Cijk_Alik_Bljk_HBH_GB_MT64x64x16_SN_SU0_SUM0_TT8_8_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 2 + LVCB: 2 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 62 + SolutionNameMin: Cijk_Alik_Bljk_HBH_GB_MT128x64x16_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 128 + LSPB: 128 + LVCA: 2 + LVCB: 2 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 63 + SolutionNameMin: Cijk_Alik_Bljk_HBH_GB_MT128x128x16_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 4 + LVCB: 4 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 64 + SolutionNameMin: Cijk_Alik_Bljk_HBH_GB_MT64x64x32_SN_SU0_SUM0_TT8_8_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 1 + LVCB: 1 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 65 + SolutionNameMin: Cijk_Alik_Bljk_HBH_GB_MT64x64x8_SN_SU32_SUM3_TT8_8_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 128 + LSPB: 64 + LVCA: 1 + LVCB: 2 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 66 + SolutionNameMin: Cijk_Alik_Bljk_HBH_GB_MT128x64x8_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 2 + LVCB: 2 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 67 + SolutionNameMin: Cijk_Alik_Bljk_HBH_GB_MT64x64x16_SN_SU32_SUM3_TT8_8_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 2 + LVCB: 2 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 68 + SolutionNameMin: Cijk_Alik_Bljk_HBH_GB_MT128x64x16_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 128 + LSPB: 128 + LVCA: 2 + LVCB: 2 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 69 + SolutionNameMin: Cijk_Alik_Bljk_HBH_GB_MT128x128x16_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 4 + LVCB: 4 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 70 + SolutionNameMin: Cijk_Alik_Bljk_HBH_GB_MT64x64x32_SN_SU32_SUM3_TT8_8_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 71 + SolutionNameMin: Cijk_Alik_Bljk_HBH_GB_MT128x64x32_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 1 + LVCB: 1 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 72 + SolutionNameMin: Cijk_Alik_Bljk_HBH_GB_MT64x64x8_SN_SU0_SUM0_TT8_8_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 128 + LSPB: 64 + LVCA: 1 + LVCB: 2 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 73 + SolutionNameMin: Cijk_Alik_Bljk_HBH_GB_MT128x64x8_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 128 + LSPB: 128 + LVCA: 2 + LVCB: 2 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 74 + SolutionNameMin: Cijk_Alik_Bljk_HBH_GB_MT128x128x8_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 2 + LVCB: 2 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 75 + SolutionNameMin: Cijk_Alik_Bljk_HBH_GB_MT64x64x16_SN_SU0_SUM0_TT8_8_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 2 + LVCB: 2 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 76 + SolutionNameMin: Cijk_Alik_Bljk_HBH_GB_MT128x64x16_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 128 + LSPB: 128 + LVCA: 2 + LVCB: 2 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 77 + SolutionNameMin: Cijk_Alik_Bljk_HBH_GB_MT128x128x16_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 4 + LVCB: 4 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 78 + SolutionNameMin: Cijk_Alik_Bljk_HBH_GB_MT64x64x32_SN_SU0_SUM0_TT8_8_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 1 + LVCB: 1 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 79 + SolutionNameMin: Cijk_Alik_Bljk_HBH_GB_MT64x64x8_SN_SU32_SUM3_TT8_8_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 128 + LSPB: 64 + LVCA: 1 + LVCB: 2 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 80 + SolutionNameMin: Cijk_Alik_Bljk_HBH_GB_MT128x64x8_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 2 + LVCB: 2 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 81 + SolutionNameMin: Cijk_Alik_Bljk_HBH_GB_MT64x64x16_SN_SU32_SUM3_TT8_8_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 2 + LVCB: 2 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 82 + SolutionNameMin: Cijk_Alik_Bljk_HBH_GB_MT128x64x16_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 128 + LSPB: 128 + LVCA: 2 + LVCB: 2 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 83 + SolutionNameMin: Cijk_Alik_Bljk_HBH_GB_MT128x128x16_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 4 + LVCB: 4 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 84 + SolutionNameMin: Cijk_Alik_Bljk_HBH_GB_MT64x64x32_SN_SU32_SUM3_TT8_8_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 85 + SolutionNameMin: Cijk_Alik_Bljk_HBH_GB_MT128x128x32_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 1 + LVCB: 1 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 86 + SolutionNameMin: Cijk_Alik_Bljk_HBH_GB_MT64x64x8_SN_SU0_SUM0_TT8_8_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 128 + LSPB: 64 + LVCA: 1 + LVCB: 2 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 87 + SolutionNameMin: Cijk_Alik_Bljk_HBH_GB_MT128x64x8_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 128 + LSPB: 128 + LVCA: 2 + LVCB: 2 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 88 + SolutionNameMin: Cijk_Alik_Bljk_HBH_GB_MT128x128x8_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 2 + LVCB: 2 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 89 + SolutionNameMin: Cijk_Alik_Bljk_HBH_GB_MT64x64x16_SN_SU0_SUM0_TT8_8_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 2 + LVCB: 2 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 90 + SolutionNameMin: Cijk_Alik_Bljk_HBH_GB_MT128x64x16_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 128 + LSPB: 128 + LVCA: 2 + LVCB: 2 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 91 + SolutionNameMin: Cijk_Alik_Bljk_HBH_GB_MT128x128x16_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 4 + LVCB: 4 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 92 + SolutionNameMin: Cijk_Alik_Bljk_HBH_GB_MT64x64x32_SN_SU0_SUM0_TT8_8_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 93 + SolutionNameMin: Cijk_Alik_Bljk_HBH_GB_MT128x64x32_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 1 + LVCB: 1 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 94 + SolutionNameMin: Cijk_Alik_Bljk_HBH_GB_MT64x64x8_SN_SU32_SUM3_TT8_8_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 128 + LSPB: 64 + LVCA: 1 + LVCB: 2 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 95 + SolutionNameMin: Cijk_Alik_Bljk_HBH_GB_MT128x64x8_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 128 + LSPB: 128 + LVCA: 2 + LVCB: 2 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 96 + SolutionNameMin: Cijk_Alik_Bljk_HBH_GB_MT128x128x8_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 2 + LVCB: 2 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 97 + SolutionNameMin: Cijk_Alik_Bljk_HBH_GB_MT64x64x16_SN_SU32_SUM3_TT8_8_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 2 + LVCB: 2 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 98 + SolutionNameMin: Cijk_Alik_Bljk_HBH_GB_MT128x64x16_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 128 + LSPB: 128 + LVCA: 2 + LVCB: 2 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 99 + SolutionNameMin: Cijk_Alik_Bljk_HBH_GB_MT128x128x16_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 4 + LVCB: 4 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 100 + SolutionNameMin: Cijk_Alik_Bljk_HBH_GB_MT64x64x32_SN_SU32_SUM3_TT8_8_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 101 + SolutionNameMin: Cijk_Alik_Bljk_HBH_GB_MT128x64x32_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 102 + SolutionNameMin: Cijk_Alik_Bljk_HBH_GB_MT128x128x32_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 128 + LSPB: 128 + LVCA: 2 + LVCB: 2 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 103 + SolutionNameMin: Cijk_Alik_Bljk_HBH_GB_MT128x128x16_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 16 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 2 + LVCB: 2 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 104 + SolutionNameMin: Cijk_Alik_Bljk_HBH_GB_MT64x64x16_SN_SU32_SUM3_TT8_8_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 16 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 128 + LSPB: 128 + LVCA: 2 + LVCB: 2 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 105 + SolutionNameMin: Cijk_Alik_Bljk_HBH_GB_MT128x128x16_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 16 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 2 + LVCB: 2 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 106 + SolutionNameMin: Cijk_Alik_Bljk_HBH_GB_MT128x64x16_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 16 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 128 + LSPB: 128 + LVCA: 2 + LVCB: 2 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 107 + SolutionNameMin: Cijk_Alik_Bljk_HBH_GB_MT128x128x16_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 16 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 128 + LSPB: 128 + LVCA: 2 + LVCB: 2 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 108 + SolutionNameMin: Cijk_Alik_Bljk_HBH_GB_MT128x128x16_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 16 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 2 + LVCB: 2 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 109 + SolutionNameMin: Cijk_Alik_Bljk_HBH_GB_MT64x64x16_SN_SU0_SUM0_TT8_8_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 16 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 128 + LSPB: 128 + LVCA: 2 + LVCB: 2 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 110 + SolutionNameMin: Cijk_Alik_Bljk_HBH_GB_MT128x128x16_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 16 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 128 + LSPB: 128 + LVCA: 2 + LVCB: 2 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 111 + SolutionNameMin: Cijk_Alik_Bljk_HBH_GB_MT128x128x16_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 16 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 4 + LVCB: 4 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 112 + SolutionNameMin: Cijk_Alik_Bljk_HBH_GB_MT64x64x32_SN_SU32_SUM3_TT8_8_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 16 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 113 + SolutionNameMin: Cijk_Alik_Bljk_HBH_GB_MT128x128x32_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 16 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 16 + LSPB: 16 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 819 + LdsNumElementsAlignedA: 128 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 256 + LdsOffsetB: 128 + LdsOffsetB_Blk: 384 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 114 + SolutionNameMin: Cijk_Alik_Bljk_HBH_GB_MT16x16x8_SN_SU0_SUM0_TT2_2_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 16 + LSPB: 16 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 115 + SolutionNameMin: Cijk_Alik_Bljk_HBH_GB_MT32x32x8_SN_SU0_SUM0_TT4_4_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 32 + LSPB: 16 + LVCA: 4 + LVCB: 8 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 896 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 116 + SolutionNameMin: Cijk_Alik_Bljk_HBH_GB_MT32x16x8_SN_SU0_SUM0_TT2_2_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 117 + SolutionNameMin: Cijk_Alik_Bljk_HBH_GB_MT64x32x8_SN_SU0_SUM0_TT4_4_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 118 + SolutionNameMin: Cijk_Alik_Bljk_HBH_GB_MT32x32x8_SN_SU0_SUM0_TT2_2_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 119 + SolutionNameMin: Cijk_Alik_Bljk_HBH_GB_MT64x64x8_SN_SU0_SUM0_TT4_4_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 120 + SolutionNameMin: Cijk_Alik_Bljk_HBH_GB_MT16x16x16_SN_SU0_SUM0_TT2_2_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 121 + SolutionNameMin: Cijk_Alik_Bljk_HBH_GB_MT32x32x16_SN_SU0_SUM0_TT4_4_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 122 + SolutionNameMin: Cijk_Alik_Bljk_HBH_GB_MT32x16x16_SN_SU0_SUM0_TT2_2_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 123 + SolutionNameMin: Cijk_Alik_Bljk_HBH_GB_MT32x32x16_SN_SU0_SUM0_TT2_2_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 124 + SolutionNameMin: Cijk_Alik_Bljk_HBH_GB_MT16x16x32_SN_SU0_SUM0_TT2_2_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 125 + SolutionNameMin: Cijk_Alik_Bljk_HBH_GB_MT64x32x32_SN_SU0_SUM0_TT4_4_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 126 + SolutionNameMin: Cijk_Alik_Bljk_HBH_GB_MT32x32x32_SN_SU0_SUM0_TT2_2_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 16 + LSPB: 16 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 819 + LdsNumElementsAlignedA: 128 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 256 + LdsOffsetB: 128 + LdsOffsetB_Blk: 384 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 127 + SolutionNameMin: Cijk_Alik_Bljk_HBH_GB_MT16x16x8_SN_SU32_SUM3_TT2_2_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 16 + LSPB: 16 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 128 + SolutionNameMin: Cijk_Alik_Bljk_HBH_GB_MT32x32x8_SN_SU32_SUM3_TT4_4_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 32 + LSPB: 16 + LVCA: 4 + LVCB: 8 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 896 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 129 + SolutionNameMin: Cijk_Alik_Bljk_HBH_GB_MT32x16x8_SN_SU32_SUM3_TT2_2_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 130 + SolutionNameMin: Cijk_Alik_Bljk_HBH_GB_MT64x32x8_SN_SU32_SUM3_TT4_4_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 131 + SolutionNameMin: Cijk_Alik_Bljk_HBH_GB_MT32x32x8_SN_SU32_SUM3_TT2_2_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 132 + SolutionNameMin: Cijk_Alik_Bljk_HBH_GB_MT64x64x8_SN_SU32_SUM3_TT4_4_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 133 + SolutionNameMin: Cijk_Alik_Bljk_HBH_GB_MT16x16x16_SN_SU32_SUM3_TT2_2_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 134 + SolutionNameMin: Cijk_Alik_Bljk_HBH_GB_MT32x32x16_SN_SU32_SUM3_TT4_4_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 135 + SolutionNameMin: Cijk_Alik_Bljk_HBH_GB_MT32x16x16_SN_SU32_SUM3_TT2_2_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 136 + SolutionNameMin: Cijk_Alik_Bljk_HBH_GB_MT64x32x16_SN_SU32_SUM3_TT4_4_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 137 + SolutionNameMin: Cijk_Alik_Bljk_HBH_GB_MT16x16x32_SN_SU32_SUM3_TT2_2_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 138 + SolutionNameMin: Cijk_Alik_Bljk_HBH_GB_MT32x32x32_SN_SU32_SUM3_TT4_4_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 139 + SolutionNameMin: Cijk_Alik_Bljk_HBH_GB_MT32x32x32_SN_SU32_SUM3_TT2_2_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 16 + LSPB: 16 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 819 + LdsNumElementsAlignedA: 128 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 256 + LdsOffsetB: 128 + LdsOffsetB_Blk: 384 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 140 + SolutionNameMin: Cijk_Alik_Bljk_HBH_GB_MT16x16x8_SN_SU0_SUM0_TT2_2_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 16 + LSPB: 16 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 141 + SolutionNameMin: Cijk_Alik_Bljk_HBH_GB_MT32x32x8_SN_SU0_SUM0_TT4_4_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 32 + LSPB: 16 + LVCA: 4 + LVCB: 8 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 896 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 142 + SolutionNameMin: Cijk_Alik_Bljk_HBH_GB_MT32x16x8_SN_SU0_SUM0_TT2_2_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 143 + SolutionNameMin: Cijk_Alik_Bljk_HBH_GB_MT64x32x8_SN_SU0_SUM0_TT4_4_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 144 + SolutionNameMin: Cijk_Alik_Bljk_HBH_GB_MT32x32x8_SN_SU0_SUM0_TT2_2_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 145 + SolutionNameMin: Cijk_Alik_Bljk_HBH_GB_MT64x64x8_SN_SU0_SUM0_TT4_4_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 146 + SolutionNameMin: Cijk_Alik_Bljk_HBH_GB_MT16x16x16_SN_SU0_SUM0_TT2_2_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 147 + SolutionNameMin: Cijk_Alik_Bljk_HBH_GB_MT32x32x16_SN_SU0_SUM0_TT4_4_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 148 + SolutionNameMin: Cijk_Alik_Bljk_HBH_GB_MT32x16x16_SN_SU0_SUM0_TT2_2_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 149 + SolutionNameMin: Cijk_Alik_Bljk_HBH_GB_MT32x32x16_SN_SU0_SUM0_TT2_2_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 150 + SolutionNameMin: Cijk_Alik_Bljk_HBH_GB_MT16x16x32_SN_SU0_SUM0_TT2_2_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 151 + SolutionNameMin: Cijk_Alik_Bljk_HBH_GB_MT64x64x32_SN_SU0_SUM0_TT4_4_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 16 + LSPB: 16 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 819 + LdsNumElementsAlignedA: 128 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 256 + LdsOffsetB: 128 + LdsOffsetB_Blk: 384 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 152 + SolutionNameMin: Cijk_Alik_Bljk_HBH_GB_MT16x16x8_SN_SU32_SUM3_TT2_2_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 16 + LSPB: 16 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 153 + SolutionNameMin: Cijk_Alik_Bljk_HBH_GB_MT32x32x8_SN_SU32_SUM3_TT4_4_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 32 + LSPB: 16 + LVCA: 4 + LVCB: 8 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 896 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 154 + SolutionNameMin: Cijk_Alik_Bljk_HBH_GB_MT32x16x8_SN_SU32_SUM3_TT2_2_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 155 + SolutionNameMin: Cijk_Alik_Bljk_HBH_GB_MT64x32x8_SN_SU32_SUM3_TT4_4_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 156 + SolutionNameMin: Cijk_Alik_Bljk_HBH_GB_MT32x32x8_SN_SU32_SUM3_TT2_2_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 157 + SolutionNameMin: Cijk_Alik_Bljk_HBH_GB_MT64x64x8_SN_SU32_SUM3_TT4_4_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 158 + SolutionNameMin: Cijk_Alik_Bljk_HBH_GB_MT16x16x16_SN_SU32_SUM3_TT2_2_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 159 + SolutionNameMin: Cijk_Alik_Bljk_HBH_GB_MT32x32x16_SN_SU32_SUM3_TT4_4_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 160 + SolutionNameMin: Cijk_Alik_Bljk_HBH_GB_MT32x16x16_SN_SU32_SUM3_TT2_2_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 161 + SolutionNameMin: Cijk_Alik_Bljk_HBH_GB_MT32x32x16_SN_SU32_SUM3_TT2_2_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 162 + SolutionNameMin: Cijk_Alik_Bljk_HBH_GB_MT64x64x16_SN_SU32_SUM3_TT4_4_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 163 + SolutionNameMin: Cijk_Alik_Bljk_HBH_GB_MT16x16x32_SN_SU32_SUM3_TT2_2_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 16 + LSPB: 16 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 819 + LdsNumElementsAlignedA: 128 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 256 + LdsOffsetB: 128 + LdsOffsetB_Blk: 384 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 164 + SolutionNameMin: Cijk_Alik_Bljk_HBH_GB_MT16x16x8_SN_SU0_SUM0_TT2_2_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 16 + LSPB: 16 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 165 + SolutionNameMin: Cijk_Alik_Bljk_HBH_GB_MT32x32x8_SN_SU0_SUM0_TT4_4_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 32 + LSPB: 16 + LVCA: 4 + LVCB: 8 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 896 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 166 + SolutionNameMin: Cijk_Alik_Bljk_HBH_GB_MT32x16x8_SN_SU0_SUM0_TT2_2_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 167 + SolutionNameMin: Cijk_Alik_Bljk_HBH_GB_MT64x32x8_SN_SU0_SUM0_TT4_4_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 168 + SolutionNameMin: Cijk_Alik_Bljk_HBH_GB_MT32x32x8_SN_SU0_SUM0_TT2_2_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 169 + SolutionNameMin: Cijk_Alik_Bljk_HBH_GB_MT64x64x8_SN_SU0_SUM0_TT4_4_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 170 + SolutionNameMin: Cijk_Alik_Bljk_HBH_GB_MT16x16x16_SN_SU0_SUM0_TT2_2_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 171 + SolutionNameMin: Cijk_Alik_Bljk_HBH_GB_MT32x32x16_SN_SU0_SUM0_TT4_4_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 172 + SolutionNameMin: Cijk_Alik_Bljk_HBH_GB_MT32x16x16_SN_SU0_SUM0_TT2_2_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 173 + SolutionNameMin: Cijk_Alik_Bljk_HBH_GB_MT64x64x16_SN_SU0_SUM0_TT4_4_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 174 + SolutionNameMin: Cijk_Alik_Bljk_HBH_GB_MT16x16x32_SN_SU0_SUM0_TT2_2_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 175 + SolutionNameMin: Cijk_Alik_Bljk_HBH_GB_MT32x16x32_SN_SU0_SUM0_TT2_2_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 16 + LSPB: 16 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 819 + LdsNumElementsAlignedA: 128 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 256 + LdsOffsetB: 128 + LdsOffsetB_Blk: 384 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 176 + SolutionNameMin: Cijk_Alik_Bljk_HBH_GB_MT16x16x8_SN_SU32_SUM3_TT2_2_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 16 + LSPB: 16 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 177 + SolutionNameMin: Cijk_Alik_Bljk_HBH_GB_MT32x32x8_SN_SU32_SUM3_TT4_4_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 32 + LSPB: 16 + LVCA: 4 + LVCB: 8 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 896 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 178 + SolutionNameMin: Cijk_Alik_Bljk_HBH_GB_MT32x16x8_SN_SU32_SUM3_TT2_2_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 179 + SolutionNameMin: Cijk_Alik_Bljk_HBH_GB_MT64x32x8_SN_SU32_SUM3_TT4_4_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 180 + SolutionNameMin: Cijk_Alik_Bljk_HBH_GB_MT32x32x8_SN_SU32_SUM3_TT2_2_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 181 + SolutionNameMin: Cijk_Alik_Bljk_HBH_GB_MT64x64x8_SN_SU32_SUM3_TT4_4_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 182 + SolutionNameMin: Cijk_Alik_Bljk_HBH_GB_MT16x16x16_SN_SU32_SUM3_TT2_2_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 183 + SolutionNameMin: Cijk_Alik_Bljk_HBH_GB_MT32x32x16_SN_SU32_SUM3_TT4_4_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 184 + SolutionNameMin: Cijk_Alik_Bljk_HBH_GB_MT32x16x16_SN_SU32_SUM3_TT2_2_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 185 + SolutionNameMin: Cijk_Alik_Bljk_HBH_GB_MT32x32x16_SN_SU32_SUM3_TT2_2_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 186 + SolutionNameMin: Cijk_Alik_Bljk_HBH_GB_MT16x16x32_SN_SU32_SUM3_TT2_2_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 16 + LSPB: 16 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 187 + SolutionNameMin: Cijk_Alik_Bljk_HBH_GB_MT32x32x8_SN_SU0_SUM0_TT4_4_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 16 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 32 + LSPB: 16 + LVCA: 4 + LVCB: 8 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 896 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 188 + SolutionNameMin: Cijk_Alik_Bljk_HBH_GB_MT32x16x8_SN_SU0_SUM0_TT2_2_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 16 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 189 + SolutionNameMin: Cijk_Alik_Bljk_HBH_GB_MT64x32x8_SN_SU0_SUM0_TT4_4_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 16 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 190 + SolutionNameMin: Cijk_Alik_Bljk_HBH_GB_MT16x16x32_SN_SU0_SUM0_TT2_2_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 16 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 16 + LSPB: 16 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 191 + SolutionNameMin: Cijk_Alik_Bljk_HBH_GB_MT32x32x8_SN_SU32_SUM3_TT4_4_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 16 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 192 + SolutionNameMin: Cijk_Alik_Bljk_HBH_GB_MT64x32x8_SN_SU32_SUM3_TT4_4_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 16 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 193 + SolutionNameMin: Cijk_Alik_Bljk_HBH_GB_MT32x32x16_SN_SU32_SUM3_TT4_4_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 16 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 194 + SolutionNameMin: Cijk_Alik_Bljk_HBH_GB_MT16x16x32_SN_SU32_SUM3_TT2_2_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 16 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 16 + LSPB: 16 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 195 + SolutionNameMin: Cijk_Alik_Bljk_HBH_GB_MT32x32x8_SN_SU0_SUM0_TT4_4_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 16 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 196 + SolutionNameMin: Cijk_Alik_Bljk_HBH_GB_MT64x32x8_SN_SU0_SUM0_TT4_4_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 16 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 197 + SolutionNameMin: Cijk_Alik_Bljk_HBH_GB_MT64x64x8_SN_SU0_SUM0_TT4_4_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 16 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 198 + SolutionNameMin: Cijk_Alik_Bljk_HBH_GB_MT16x16x16_SN_SU0_SUM0_TT2_2_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 16 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 199 + SolutionNameMin: Cijk_Alik_Bljk_HBH_GB_MT32x32x16_SN_SU0_SUM0_TT4_4_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 16 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 16 + LSPB: 16 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 200 + SolutionNameMin: Cijk_Alik_Bljk_HBH_GB_MT32x32x8_SN_SU32_SUM3_TT4_4_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 16 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 201 + SolutionNameMin: Cijk_Alik_Bljk_HBH_GB_MT64x64x8_SN_SU32_SUM3_TT4_4_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 16 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 202 + SolutionNameMin: Cijk_Alik_Bljk_HBH_GB_MT16x16x32_SN_SU32_SUM3_TT2_2_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 16 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 16 + LSPB: 16 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 203 + SolutionNameMin: Cijk_Alik_Bljk_HBH_GB_MT32x32x8_SN_SU0_SUM0_TT4_4_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 16 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 204 + SolutionNameMin: Cijk_Alik_Bljk_HBH_GB_MT64x32x8_SN_SU0_SUM0_TT4_4_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 16 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 205 + SolutionNameMin: Cijk_Alik_Bljk_HBH_GB_MT64x64x8_SN_SU0_SUM0_TT4_4_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 16 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 206 + SolutionNameMin: Cijk_Alik_Bljk_HBH_GB_MT16x16x16_SN_SU0_SUM0_TT2_2_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 16 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 207 + SolutionNameMin: Cijk_Alik_Bljk_HBH_GB_MT64x64x8_SN_SU32_SUM3_TT4_4_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 16 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 208 + SolutionNameMin: Cijk_Alik_Bljk_HBH_GB_MT32x16x16_SN_SU32_SUM3_TT2_2_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 16 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 209 + SolutionNameMin: Cijk_Alik_Bljk_HBH_GB_MT16x16x32_SN_SU32_SUM3_TT2_2_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 16 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 16 + LSPB: 8 + LVCA: 4 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 832 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 210 + SolutionNameMin: Cijk_Alik_Bljk_HBH_GB_MT32x8x8_SN_SU0_SUM0_TT2_2_WG16_4_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1664 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 211 + SolutionNameMin: Cijk_Alik_Bljk_HBH_GB_MT32x8x16_SN_SU0_SUM0_TT2_2_WG16_4_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3328 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 212 + SolutionNameMin: Cijk_Alik_Bljk_HBH_GB_MT64x16x16_SN_SU0_SUM0_TT4_2_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 16 + LSPB: 8 + LVCA: 4 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 832 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 213 + SolutionNameMin: Cijk_Alik_Bljk_HBH_GB_MT32x8x8_SN_SU32_SUM3_TT2_2_WG16_4_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1664 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 214 + SolutionNameMin: Cijk_Alik_Bljk_HBH_GB_MT32x8x16_SN_SU32_SUM3_TT2_2_WG16_4_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 8 + LVCA: 8 + LVCB: 16 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 6272 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 8 + MacroTileA: 128 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 215 + SolutionNameMin: Cijk_Alik_Bljk_HBH_GB_MT128x8x16_SN_SU32_SUM3_TT4_2_WG32_4_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 32 + SubGroup1: 4 + SubGroupA: 32 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3328 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 8 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 216 + SolutionNameMin: Cijk_Alik_Bljk_HBH_GB_MT32x8x32_SN_SU32_SUM3_TT2_2_WG16_4_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1664 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 217 + SolutionNameMin: Cijk_Alik_Bljk_HBH_GB_MT32x8x16_SN_SU0_SUM0_TT2_2_WG16_4_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 32 + LSPB: 16 + LVCA: 4 + LVCB: 8 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1664 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 218 + SolutionNameMin: Cijk_Alik_Bljk_HBH_GB_MT64x16x8_SN_SU32_SUM3_TT4_2_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1664 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 219 + SolutionNameMin: Cijk_Alik_Bljk_HBH_GB_MT32x8x16_SN_SU32_SUM3_TT2_2_WG16_4_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1664 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 220 + SolutionNameMin: Cijk_Alik_Bljk_HBH_GB_MT32x8x16_SN_SU0_SUM0_TT2_2_WG16_4_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1664 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 221 + SolutionNameMin: Cijk_Alik_Bljk_HBH_GB_MT32x8x16_SN_SU32_SUM3_TT2_2_WG16_4_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 16 + LSPB: 16 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 896 + LdsNumElementsAlignedA: 128 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 128 + LdsOffsetB_Blk: 640 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 32 + MacroTileA: 16 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 222 + SolutionNameMin: Cijk_Alik_Bljk_HBH_GB_MT16x32x8_SN_SU0_SUM0_TT2_4_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 223 + SolutionNameMin: Cijk_Alik_Bljk_HBH_GB_MT32x32x8_SN_SU0_SUM0_TT2_4_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 256 + LdsOffsetB_Blk: 1280 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 32 + MacroTileA: 16 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 224 + SolutionNameMin: Cijk_Alik_Bljk_HBH_GB_MT16x32x16_SN_SU0_SUM0_TT2_4_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 16 + LSPB: 16 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 896 + LdsNumElementsAlignedA: 128 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 128 + LdsOffsetB_Blk: 640 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 32 + MacroTileA: 16 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 225 + SolutionNameMin: Cijk_Alik_Bljk_HBH_GB_MT16x32x8_SN_SU32_SUM3_TT2_4_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 226 + SolutionNameMin: Cijk_Alik_Bljk_HBH_GB_MT32x32x8_SN_SU32_SUM3_TT2_4_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 227 + SolutionNameMin: Cijk_Alik_Bljk_HBH_GB_MT32x32x8_SN_SU0_SUM0_TT2_4_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 228 + SolutionNameMin: Cijk_Alik_Bljk_HBH_GB_MT32x32x16_SN_SU0_SUM0_TT2_4_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 16 + LSPB: 16 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 896 + LdsNumElementsAlignedA: 128 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 128 + LdsOffsetB_Blk: 640 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 32 + MacroTileA: 16 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 229 + SolutionNameMin: Cijk_Alik_Bljk_HBH_GB_MT16x32x8_SN_SU32_SUM3_TT2_4_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 230 + SolutionNameMin: Cijk_Alik_Bljk_HBH_GB_MT32x32x8_SN_SU0_SUM0_TT2_4_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 16 + LSPB: 16 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 896 + LdsNumElementsAlignedA: 128 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 128 + LdsOffsetB_Blk: 640 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 2 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 4 + LoopTail: true + LoopUnroll: 4 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 32 + MacroTileA: 16 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 231 + SolutionNameMin: Cijk_Alik_Bljk_HBH_GB_MT16x32x8_SN_SU32_SUM3_TT2_4_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 +- [2, 3, 0, 1] +- - - [2368, 1024, 1, 1, 2368, 2368, 1, 1] + - [34, 324.0] + - - [5056, 1408, 1, 3328, 5056, 5056, 3328, 3328] + - [5, 20505.0] + - - [5056, 1856, 1, 3328, 5056, 5056, 3328, 3328] + - [5, 19815.0] + - - [448, 3584, 1, 3328, 448, 448, 3328, 3328] + - [24, 16927.0] + - - [5056, 4288, 1, 32, 5056, 5056, 32, 32] + - [17, 14913.0] + - - [3584, 1024, 1, 256, 3584, 3584, 256, 256] + - [22, 17794.0] + - - [1408, 3584, 1, 3328, 1408, 1408, 3328, 3328] + - [7, 20550.0] + - - [1024, 2368, 1, 3328, 1024, 1024, 3328, 3328] + - [7, 19388.0] + - - [448, 3584, 1, 32, 448, 448, 32, 32] + - [34, 6328.0] + - - [4288, 6784, 1, 3328, 4288, 4288, 3328, 3328] + - [32, 21108.0] + - - [5888, 4288, 1, 3328, 5888, 5888, 3328, 3328] + - [7, 21463.0] + - - [2368, 1408, 1, 32, 2368, 2368, 32, 32] + - [41, 7765.0] + - - [1024, 2944, 1, 1, 1024, 1024, 1, 1] + - [21, 306.0] + - - [2944, 3584, 1, 3328, 2944, 2944, 3328, 3328] + - [32, 20916.0] + - - [2368, 2944, 1, 1280, 2368, 2368, 1280, 1280] + - [30, 20065.0] + - - [6784, 5888, 1, 3328, 6784, 6784, 3328, 3328] + - [7, 21894.0] + - - [3584, 1408, 1, 32, 3584, 3584, 32, 32] + - [2, 8307.0] + - - [5056, 256, 1, 256, 5056, 5056, 256, 256] + - [6, 15527.0] + - - [1856, 2368, 1, 256, 1856, 1856, 256, 256] + - [5, 17368.0] + - - [2368, 1024, 1, 3328, 2368, 2368, 3328, 3328] + - [24, 19853.0] + - - [3584, 4288, 1, 32, 3584, 3584, 32, 32] + - [34, 14948.0] + - - [3584, 3584, 1, 1, 3584, 3584, 1, 1] + - [26, 535.0] + - - [1408, 2368, 1, 1, 1408, 1408, 1, 1] + - [41, 426.0] + - - [5056, 6784, 1, 1280, 5056, 5056, 1280, 1280] + - [32, 21272.0] + - - [4288, 5056, 1, 1, 4288, 4288, 1, 1] + - [21, 641.0] + - - [5056, 4288, 1, 1, 5056, 5056, 1, 1] + - [34, 646.0] + - - [1408, 4288, 1, 1280, 1408, 1408, 1280, 1280] + - [46, 20321.0] + - - [4288, 1024, 1, 3328, 4288, 4288, 3328, 3328] + - [7, 20140.0] + - - [1024, 5056, 1, 1, 1024, 1024, 1, 1] + - [26, 375.0] + - - [704, 3584, 1, 1280, 704, 704, 1280, 1280] + - [12, 16986.0] + - - [1856, 5056, 1, 256, 1856, 1856, 256, 256] + - [14, 18967.0] + - - [1408, 1024, 1, 1280, 1408, 1408, 1280, 1280] + - [12, 17123.0] + - - [5056, 5888, 1, 3328, 5056, 5056, 3328, 3328] + - [14, 21758.0] + - - [3584, 3584, 1, 1280, 3584, 3584, 1280, 1280] + - [24, 21606.0] + - - [2368, 3584, 1, 32, 2368, 2368, 32, 32] + - [10, 10726.0] + - - [2944, 2368, 1, 1, 2944, 2944, 1, 1] + - [29, 329.0] + - - [704, 4288, 1, 1, 704, 704, 1, 1] + - [45, 306.0] + - - [1024, 6784, 1, 1280, 1024, 1024, 1280, 1280] + - [7, 20627.0] + - - [1024, 3584, 1, 1, 1024, 1024, 1, 1] + - [26, 328.0] + - - [256, 5056, 1, 32, 256, 256, 32, 32] + - [0, 8218.0] + - - [2368, 5056, 1, 32, 2368, 2368, 32, 32] + - [34, 11759.0] + - - [6784, 1856, 1, 32, 6784, 6784, 32, 32] + - [17, 10854.0] + - - [5056, 704, 1, 1, 5056, 5056, 1, 1] + - [26, 311.0] + - - [2944, 4288, 1, 256, 2944, 2944, 256, 256] + - [14, 19834.0] + - - [5056, 704, 1, 32, 5056, 5056, 32, 32] + - [17, 8940.0] + - - [1856, 4288, 1, 3328, 1856, 1856, 3328, 3328] + - [24, 20422.0] + - - [6784, 4288, 1, 32, 6784, 6784, 32, 32] + - [17, 15116.0] + - - [5888, 5056, 1, 256, 5888, 5888, 256, 256] + - [24, 20624.0] + - - [3584, 2368, 1, 3328, 3584, 3584, 3328, 3328] + - [5, 20481.0] + - - [4288, 1856, 1, 1, 4288, 4288, 1, 1] + - [0, 410.0] + - - [1856, 2944, 1, 1, 1856, 1856, 1, 1] + - [8, 394.0] + - - [1856, 2368, 1, 32, 1856, 1856, 32, 32] + - [41, 9314.0] + - - [4288, 1856, 1, 32, 4288, 4288, 32, 32] + - [34, 12150.0] + - - [5056, 2368, 1, 256, 5056, 5056, 256, 256] + - [38, 19282.0] + - - [1408, 5888, 1, 256, 1408, 1408, 256, 256] + - [14, 20090.0] + - - [5056, 6784, 1, 1, 5056, 5056, 1, 1] + - [21, 693.0] + - - [1024, 1408, 1, 3328, 1024, 1024, 3328, 3328] + - [12, 17431.0] + - - [256, 5056, 1, 1280, 256, 256, 1280, 1280] + - [48, 18645.0] + - - [704, 2368, 1, 1, 704, 704, 1, 1] + - [45, 194.0] + - - [3584, 4288, 1, 1280, 3584, 3584, 1280, 1280] + - [7, 21459.0] + - - [3584, 2368, 1, 1, 3584, 3584, 1, 1] + - [26, 389.0] + - - [4288, 448, 1, 3328, 4288, 4288, 3328, 3328] + - [30, 16889.0] + - - [704, 6784, 1, 1280, 704, 704, 1280, 1280] + - [30, 18561.0] + - - [2368, 4288, 1, 32, 2368, 2368, 32, 32] + - [34, 12925.0] + - - [704, 5056, 1, 1280, 704, 704, 1280, 1280] + - [32, 18645.0] + - - [3584, 6784, 1, 32, 3584, 3584, 32, 32] + - [41, 15334.0] + - - [3584, 6784, 1, 1280, 3584, 3584, 1280, 1280] + - [7, 21573.0] + - - [4288, 4288, 1, 3328, 4288, 4288, 3328, 3328] + - [14, 19766.0] + - - [1408, 3584, 1, 1, 1408, 1408, 1, 1] + - [41, 353.0] + - - [4288, 1856, 1, 3328, 4288, 4288, 3328, 3328] + - [46, 19709.0] + - - [1856, 2944, 1, 1280, 1856, 1856, 1280, 1280] + - [46, 19452.0] + - - [5056, 1024, 1, 3328, 5056, 5056, 3328, 3328] + - [7, 21013.0] + - - [3584, 704, 1, 1, 3584, 3584, 1, 1] + - [26, 394.0] + - - [448, 5056, 1, 1, 448, 448, 1, 1] + - [21, 216.0] + - - [5888, 5888, 1, 256, 5888, 5888, 256, 256] + - [24, 21314.0] + - - [3584, 704, 1, 32, 3584, 3584, 32, 32] + - [34, 11030.0] + - - [448, 6784, 1, 3328, 448, 448, 3328, 3328] + - [12, 17726.0] + - - [6784, 4288, 1, 1, 6784, 6784, 1, 1] + - [30, 672.0] + - - [3584, 6784, 1, 1, 3584, 3584, 1, 1] + - [35, 575.0] + - - [1408, 2368, 1, 32, 1408, 1408, 32, 32] + - [41, 10142.0] + - - [448, 5056, 1, 32, 448, 448, 32, 32] + - [19, 6437.0] + - - [4288, 4288, 1, 1280, 4288, 4288, 1280, 1280] + - [24, 21351.0] + - - [6784, 1408, 1, 1, 6784, 6784, 1, 1] + - [34, 386.0] + - - [1856, 5888, 1, 3328, 1856, 1856, 3328, 3328] + - [14, 19951.0] + - - [3584, 1856, 1, 3328, 3584, 3584, 3328, 3328] + - [12, 20101.0] + - - [5056, 5888, 1, 1, 5056, 5056, 1, 1] + - [36, 769.0] + - - [2944, 1024, 1, 256, 2944, 2944, 256, 256] + - [5, 17007.0] + - - [2368, 4288, 1, 3328, 2368, 2368, 3328, 3328] + - [14, 20276.0] + - - [2944, 6784, 1, 256, 2944, 2944, 256, 256] + - [22, 20722.0] + - - [2368, 2368, 1, 1280, 2368, 2368, 1280, 1280] + - [12, 19066.0] + - - [3584, 3584, 1, 32, 3584, 3584, 32, 32] + - [25, 10755.0] + - - [2944, 2944, 1, 1280, 2944, 2944, 1280, 1280] + - [12, 20835.0] + - - [1408, 5056, 1, 1, 1408, 1408, 1, 1] + - [41, 377.0] + - - [2368, 6784, 1, 1, 2368, 2368, 1, 1] + - [23, 409.0] + - - [6784, 4288, 1, 1280, 6784, 6784, 1280, 1280] + - [32, 21620.0] + - - [2944, 704, 1, 256, 2944, 2944, 256, 256] + - [22, 15838.0] + - - [2368, 6784, 1, 1280, 2368, 2368, 1280, 1280] + - [7, 20932.0] + - - [704, 2944, 1, 3328, 704, 704, 3328, 3328] + - [12, 18297.0] + - - [5888, 256, 1, 1, 5888, 5888, 1, 1] + - [19, 331.0] + - - [5056, 6784, 1, 32, 5056, 5056, 32, 32] + - [26, 14623.0] + - - [448, 5056, 1, 1280, 448, 448, 1280, 1280] + - [7, 17405.0] + - - [256, 5888, 1, 3328, 256, 256, 3328, 3328] + - [30, 17954.0] + - - [5888, 1024, 1, 1, 5888, 5888, 1, 1] + - [13, 406.0] + - - [5888, 448, 1, 32, 5888, 5888, 32, 32] + - [17, 8116.0] + - - [6784, 2944, 1, 256, 6784, 6784, 256, 256] + - [38, 20633.0] + - - [4288, 2944, 1, 256, 4288, 4288, 256, 256] + - [46, 19805.0] + - - [448, 5888, 1, 3328, 448, 448, 3328, 3328] + - [7, 17586.0] + - - [1408, 4288, 1, 1, 1408, 1408, 1, 1] + - [0, 403.0] + - - [1408, 1856, 1, 3328, 1408, 1408, 3328, 3328] + - [46, 18229.0] + - - [3584, 1024, 1, 3328, 3584, 3584, 3328, 3328] + - [7, 19529.0] + - - [2944, 5888, 1, 3328, 2944, 2944, 3328, 3328] + - [14, 19913.0] + - - [448, 4288, 1, 3328, 448, 448, 3328, 3328] + - [5, 16596.0] + - - [704, 2368, 1, 256, 704, 704, 256, 256] + - [32, 14283.0] + - - [4288, 3584, 1, 3328, 4288, 4288, 3328, 3328] + - [32, 21716.0] + - - [1408, 1024, 1, 1, 1408, 1408, 1, 1] + - [45, 162.0] + - - [1408, 1024, 1, 256, 1408, 1408, 256, 256] + - [6, 10612.0] + - - [5056, 3584, 1, 1, 5056, 5056, 1, 1] + - [23, 576.0] + - - [6784, 6784, 1, 3328, 6784, 6784, 3328, 3328] + - [14, 21720.0] + - - [2368, 2944, 1, 3328, 2368, 2368, 3328, 3328] + - [12, 20129.0] + - - [5056, 3584, 1, 32, 5056, 5056, 32, 32] + - [8, 15308.0] + - - [5056, 3584, 1, 1280, 5056, 5056, 1280, 1280] + - [24, 21651.0] + - - [1856, 1856, 1, 256, 1856, 1856, 256, 256] + - [38, 16836.0] + - - [5888, 4288, 1, 1, 5888, 5888, 1, 1] + - [34, 628.0] + - - [5056, 704, 1, 256, 5056, 5056, 256, 256] + - [32, 16448.0] + - - [2368, 5056, 1, 256, 2368, 2368, 256, 256] + - [22, 19490.0] + - - [1024, 5056, 1, 256, 1024, 1024, 256, 256] + - [30, 18891.0] + - - [5888, 448, 1, 256, 5888, 5888, 256, 256] + - [22, 15196.0] + - - [6784, 5056, 1, 1280, 6784, 6784, 1280, 1280] + - [32, 21617.0] + - - [4288, 6784, 1, 1280, 4288, 4288, 1280, 1280] + - [7, 21254.0] + - - [704, 6784, 1, 3328, 704, 704, 3328, 3328] + - [5, 18795.0] + - - [2944, 1856, 1, 1, 2944, 2944, 1, 1] + - [34, 353.0] + - - [5888, 4288, 1, 1280, 5888, 5888, 1280, 1280] + - [7, 21545.0] + - - [5888, 3584, 1, 1280, 5888, 5888, 1280, 1280] + - [7, 21828.0] + - - [3584, 1408, 1, 1280, 3584, 3584, 1280, 1280] + - [12, 20147.0] + - - [1024, 2944, 1, 256, 1024, 1024, 256, 256] + - [12, 18006.0] + - - [2944, 1856, 1, 1280, 2944, 2944, 1280, 1280] + - [12, 19454.0] + - - [1024, 2368, 1, 1, 1024, 1024, 1, 1] + - [26, 315.0] + - - [2944, 3584, 1, 1280, 2944, 2944, 1280, 1280] + - [7, 21134.0] + - - [1856, 4288, 1, 256, 1856, 1856, 256, 256] + - [38, 18812.0] + - - [448, 3584, 1, 1, 448, 448, 1, 1] + - [21, 244.0] + - - [2368, 2944, 1, 32, 2368, 2368, 32, 32] + - [0, 10405.0] + - - [4288, 704, 1, 256, 4288, 4288, 256, 256] + - [30, 16127.0] + - - [1856, 1024, 1, 256, 1856, 1856, 256, 256] + - [22, 16526.0] + - - [704, 6784, 1, 32, 704, 704, 32, 32] + - [43, 9481.0] + - - [1024, 4288, 1, 1, 1024, 1024, 1, 1] + - [26, 335.0] + - - [1408, 5888, 1, 1280, 1408, 1408, 1280, 1280] + - [24, 21333.0] + - - [5056, 1856, 1, 256, 5056, 5056, 256, 256] + - [38, 18765.0] + - - [6784, 704, 1, 1280, 6784, 6784, 1280, 1280] + - [40, 19335.0] + - - [5888, 1024, 1, 256, 5888, 5888, 256, 256] + - [22, 19583.0] + - - [6784, 1856, 1, 3328, 6784, 6784, 3328, 3328] + - [12, 19769.0] + - - [2368, 5888, 1, 1280, 2368, 2368, 1280, 1280] + - [24, 21222.0] + - - [5888, 6784, 1, 32, 5888, 5888, 32, 32] + - [39, 12369.0] + - - [6784, 6784, 1, 32, 6784, 6784, 32, 32] + - [37, 14976.0] + - - [6784, 256, 1, 256, 6784, 6784, 256, 256] + - [38, 16238.0] + - - [2368, 3584, 1, 3328, 2368, 2368, 3328, 3328] + - [7, 20465.0] + - - [5888, 1024, 1, 1280, 5888, 5888, 1280, 1280] + - [46, 20450.0] + - - [5888, 6784, 1, 3328, 5888, 5888, 3328, 3328] + - [7, 21739.0] + - - [6784, 448, 1, 1, 6784, 6784, 1, 1] + - [4, 292.0] + - - [6784, 1856, 1, 1, 6784, 6784, 1, 1] + - [37, 422.0] + - - [2944, 2368, 1, 1280, 2944, 2944, 1280, 1280] + - [12, 20269.0] + - - [6784, 448, 1, 32, 6784, 6784, 32, 32] + - [34, 8955.0] + - - [6784, 448, 1, 3328, 6784, 6784, 3328, 3328] + - [30, 17498.0] + - - [448, 3584, 1, 1280, 448, 448, 1280, 1280] + - [40, 15954.0] + - - [1408, 6784, 1, 1280, 1408, 1408, 1280, 1280] + - [24, 21117.0] + - - [5056, 5888, 1, 1280, 5056, 5056, 1280, 1280] + - [32, 21739.0] + - - [5888, 704, 1, 1, 5888, 5888, 1, 1] + - [26, 314.0] + - - [3584, 1856, 1, 1, 3584, 3584, 1, 1] + - [17, 322.0] + - - [5056, 2944, 1, 32, 5056, 5056, 32, 32] + - [4, 11715.0] + - - [4288, 6784, 1, 1, 4288, 4288, 1, 1] + - [19, 831.0] + - - [1024, 6784, 1, 1, 1024, 1024, 1, 1] + - [22, 352.0] + - - [2368, 5888, 1, 32, 2368, 2368, 32, 32] + - [41, 12654.0] + - - [3584, 4288, 1, 1, 3584, 3584, 1, 1] + - [45, 379.0] + - - [5888, 1024, 1, 3328, 5888, 5888, 3328, 3328] + - [7, 20586.0] + - - [6784, 5888, 1, 256, 6784, 6784, 256, 256] + - [24, 21276.0] + - - [5056, 1024, 1, 1, 5056, 5056, 1, 1] + - [26, 584.0] + - - [4288, 2368, 1, 32, 4288, 4288, 32, 32] + - [17, 15141.0] + - - [704, 3584, 1, 1, 704, 704, 1, 1] + - [34, 389.0] + - - [6784, 704, 1, 32, 6784, 6784, 32, 32] + - [34, 13152.0] + - - [704, 5888, 1, 256, 704, 704, 256, 256] + - [22, 17385.0] + - - [2368, 3584, 1, 1280, 2368, 2368, 1280, 1280] + - [7, 20364.0] + - - [3584, 5056, 1, 32, 3584, 3584, 32, 32] + - [26, 12106.0] + - - [6784, 1856, 1, 1280, 6784, 6784, 1280, 1280] + - [12, 20452.0] + - - [5056, 5056, 1, 3328, 5056, 5056, 3328, 3328] + - [14, 21492.0] + - - [2368, 5056, 1, 1, 2368, 2368, 1, 1] + - [11, 370.0] + - - [5888, 1408, 1, 256, 5888, 5888, 256, 256] + - [30, 19611.0] + - - [2368, 1024, 1, 32, 2368, 2368, 32, 32] + - [0, 8398.0] + - - [4288, 1024, 1, 256, 4288, 4288, 256, 256] + - [14, 18066.0] + - - [4288, 5888, 1, 1280, 4288, 4288, 1280, 1280] + - [14, 21616.0] + - - [1856, 2944, 1, 3328, 1856, 1856, 3328, 3328] + - [30, 19542.0] + - - [5056, 5888, 1, 256, 5056, 5056, 256, 256] + - [14, 20709.0] + - - [5056, 256, 1, 3328, 5056, 5056, 3328, 3328] + - [24, 20563.0] + - - [1024, 5888, 1, 1280, 1024, 1024, 1280, 1280] + - [30, 20353.0] + - - [5888, 5056, 1, 1280, 5888, 5888, 1280, 1280] + - [24, 21660.0] + - - [5888, 2944, 1, 1, 5888, 5888, 1, 1] + - [29, 417.0] + - - [1408, 4288, 1, 3328, 1408, 1408, 3328, 3328] + - [7, 20312.0] + - - [704, 2944, 1, 32, 704, 704, 32, 32] + - [19, 8010.0] + - - [2944, 4288, 1, 3328, 2944, 2944, 3328, 3328] + - [24, 20841.0] + - - [5056, 2944, 1, 256, 5056, 5056, 256, 256] + - [12, 20032.0] + - - [2368, 1856, 1, 256, 2368, 2368, 256, 256] + - [22, 17411.0] + - - [2368, 4288, 1, 1280, 2368, 2368, 1280, 1280] + - [32, 20260.0] + - - [3584, 448, 1, 256, 3584, 3584, 256, 256] + - [29, 14504.0] + - - [256, 6784, 1, 256, 256, 256, 256, 256] + - [12, 16345.0] + - - [1024, 1408, 1, 1, 1024, 1024, 1, 1] + - [41, 196.0] + - - [256, 5888, 1, 1, 256, 256, 1, 1] + - [28, 277.0] + - - [2944, 2944, 1, 1, 2944, 2944, 1, 1] + - [0, 475.0] + - - [6784, 3584, 1, 256, 6784, 6784, 256, 256] + - [14, 21041.0] + - - [1408, 1856, 1, 256, 1408, 1408, 256, 256] + - [38, 16470.0] + - - [2944, 2944, 1, 32, 2944, 2944, 32, 32] + - [34, 10750.0] + - - [2944, 2944, 1, 3328, 2944, 2944, 3328, 3328] + - [5, 20716.0] + - - [6784, 1408, 1, 32, 6784, 6784, 32, 32] + - [26, 12746.0] + - - [2368, 6784, 1, 3328, 2368, 2368, 3328, 3328] + - [14, 19265.0] + - - [4288, 3584, 1, 32, 4288, 4288, 32, 32] + - [34, 13327.0] + - - [3584, 704, 1, 1280, 3584, 3584, 1280, 1280] + - [46, 16927.0] + - - [448, 5056, 1, 3328, 448, 448, 3328, 3328] + - [24, 18252.0] + - - [4288, 448, 1, 256, 4288, 4288, 256, 256] + - [30, 13451.0] + - - [5056, 256, 1, 1280, 5056, 5056, 1280, 1280] + - [24, 19056.0] + - - [2944, 5888, 1, 32, 2944, 2944, 32, 32] + - [41, 15803.0] + - - [3584, 5056, 1, 256, 3584, 3584, 256, 256] + - [38, 20182.0] + - - [3584, 2368, 1, 256, 3584, 3584, 256, 256] + - [30, 19237.0] + - - [4288, 4288, 1, 256, 4288, 4288, 256, 256] + - [14, 19947.0] + - - [448, 5056, 1, 256, 448, 448, 256, 256] + - [12, 15471.0] + - - [4288, 704, 1, 1280, 4288, 4288, 1280, 1280] + - [12, 17286.0] + - - [2368, 704, 1, 1, 2368, 2368, 1, 1] + - [34, 190.0] + - - [1408, 1856, 1, 1280, 1408, 1408, 1280, 1280] + - [46, 17672.0] + - - [3584, 4288, 1, 3328, 3584, 3584, 3328, 3328] + - [14, 19794.0] + - - [448, 4288, 1, 32, 448, 448, 32, 32] + - [41, 5799.0] + - - [448, 4288, 1, 1280, 448, 448, 1280, 1280] + - [12, 16286.0] + - - [5056, 1024, 1, 256, 5056, 5056, 256, 256] + - [30, 18327.0] + - - [4288, 3584, 1, 1280, 4288, 4288, 1280, 1280] + - [7, 21407.0] + - - [1856, 3584, 1, 32, 1856, 1856, 32, 32] + - [0, 11262.0] + - - [5056, 3584, 1, 3328, 5056, 5056, 3328, 3328] + - [14, 20063.0] + - - [4288, 5056, 1, 256, 4288, 4288, 256, 256] + - [24, 20147.0] + - - [1856, 5888, 1, 256, 1856, 1856, 256, 256] + - [5, 19265.0] + - - [2368, 3584, 1, 1, 2368, 2368, 1, 1] + - [47, 456.0] + - - [4288, 2368, 1, 256, 4288, 4288, 256, 256] + - [38, 19173.0] + - - [1408, 2944, 1, 3328, 1408, 1408, 3328, 3328] + - [30, 20119.0] + - - [5888, 3584, 1, 1, 5888, 5888, 1, 1] + - [26, 833.0] + - - [6784, 5056, 1, 3328, 6784, 6784, 3328, 3328] + - [14, 21749.0] + - - [6784, 5056, 1, 1, 6784, 6784, 1, 1] + - [36, 791.0] + - - [5888, 3584, 1, 32, 5888, 5888, 32, 32] + - [0, 15595.0] + - - [5888, 3584, 1, 3328, 5888, 5888, 3328, 3328] + - [14, 19555.0] + - - [1024, 6784, 1, 256, 1024, 1024, 256, 256] + - [30, 16815.0] + - - [6784, 5888, 1, 32, 6784, 6784, 32, 32] + - [13, 11667.0] + - - [2368, 6784, 1, 32, 2368, 2368, 32, 32] + - [8, 13154.0] + - - [5056, 1408, 1, 1280, 5056, 5056, 1280, 1280] + - [5, 20533.0] + - - [3584, 1408, 1, 3328, 3584, 3584, 3328, 3328] + - [12, 20100.0] + - - [2944, 3584, 1, 1, 2944, 2944, 1, 1] + - [27, 454.0] + - - [2944, 1408, 1, 1280, 2944, 2944, 1280, 1280] + - [12, 19755.0] + - - [3584, 1024, 1, 1, 3584, 3584, 1, 1] + - [27, 344.0] + - - [2944, 1856, 1, 3328, 2944, 2944, 3328, 3328] + - [30, 19523.0] + - - [2944, 3584, 1, 32, 2944, 2944, 32, 32] + - [34, 13087.0] + - - [5888, 256, 1, 32, 5888, 5888, 32, 32] + - [2, 5648.0] + - - [6784, 5056, 1, 256, 6784, 6784, 256, 256] + - [14, 20727.0] + - - [1856, 3584, 1, 1280, 1856, 1856, 1280, 1280] + - [7, 20155.0] + - - [256, 5888, 1, 256, 256, 256, 256, 256] + - [30, 16365.0] + - - [1024, 4288, 1, 3328, 1024, 1024, 3328, 3328] + - [40, 20391.0] + - - [2368, 1408, 1, 1, 2368, 2368, 1, 1] + - [43, 276.0] + - - [1024, 1856, 1, 32, 1024, 1024, 32, 32] + - [8, 6470.0] + - - [5888, 2368, 1, 1, 5888, 5888, 1, 1] + - [17, 584.0] + - - [2368, 2368, 1, 1, 2368, 2368, 1, 1] + - [41, 391.0] + - - [704, 4288, 1, 256, 704, 704, 256, 256] + - [30, 14988.0] + - - [5888, 2368, 1, 32, 5888, 5888, 32, 32] + - [26, 15406.0] + - - [5888, 2368, 1, 1280, 5888, 5888, 1280, 1280] + - [30, 20536.0] + - - [2944, 5056, 1, 3328, 2944, 2944, 3328, 3328] + - [14, 20772.0] + - - [6784, 704, 1, 3328, 6784, 6784, 3328, 3328] + - [40, 18958.0] + - - [1856, 1856, 1, 32, 1856, 1856, 32, 32] + - [17, 10341.0] + - - [4288, 2944, 1, 32, 4288, 4288, 32, 32] + - [34, 15442.0] + - - [256, 5056, 1, 1, 256, 256, 1, 1] + - [4, 360.0] + - - [5056, 5056, 1, 256, 5056, 5056, 256, 256] + - [40, 20383.0] + - - [5888, 256, 1, 256, 5888, 5888, 256, 256] + - [30, 16676.0] + - - [6784, 6784, 1, 256, 6784, 6784, 256, 256] + - [22, 21040.0] + - - [3584, 704, 1, 3328, 3584, 3584, 3328, 3328] + - [12, 17606.0] + - - [4288, 704, 1, 3328, 4288, 4288, 3328, 3328] + - [46, 17427.0] + - - [4288, 2944, 1, 1280, 4288, 4288, 1280, 1280] + - [12, 20627.0] + - - [448, 3584, 1, 256, 448, 448, 256, 256] + - [22, 15156.0] + - - [6784, 256, 1, 32, 6784, 6784, 32, 32] + - [17, 8472.0] + - - [6784, 1408, 1, 1280, 6784, 6784, 1280, 1280] + - [12, 20751.0] + - - [2368, 5056, 1, 1280, 2368, 2368, 1280, 1280] + - [24, 20636.0] + - - [1408, 1408, 1, 1280, 1408, 1408, 1280, 1280] + - [12, 17607.0] + - - [5888, 1856, 1, 32, 5888, 5888, 32, 32] + - [34, 13216.0] + - - [5888, 704, 1, 3328, 5888, 5888, 3328, 3328] + - [24, 19426.0] + - - [448, 6784, 1, 256, 448, 448, 256, 256] + - [22, 15084.0] + - - [2944, 5888, 1, 256, 2944, 2944, 256, 256] + - [38, 20801.0] + - - [1856, 1408, 1, 32, 1856, 1856, 32, 32] + - [34, 8516.0] + - - [5888, 2944, 1, 1280, 5888, 5888, 1280, 1280] + - [12, 21384.0] + - - [448, 5888, 1, 1, 448, 448, 1, 1] + - [41, 262.0] + - - [3584, 1408, 1, 1, 3584, 3584, 1, 1] + - [26, 371.0] + - - [448, 5888, 1, 32, 448, 448, 32, 32] + - [26, 7845.0] + - - [5056, 704, 1, 1280, 5056, 5056, 1280, 1280] + - [7, 18470.0] + - - [1856, 6784, 1, 1, 1856, 1856, 1, 1] + - [22, 482.0] + - - [2368, 1024, 1, 256, 2368, 2368, 256, 256] + - [5, 15404.0] + - - [1856, 6784, 1, 32, 1856, 1856, 32, 32] + - [34, 14127.0] + - - [1856, 6784, 1, 1280, 1856, 1856, 1280, 1280] + - [30, 20504.0] + - - [5888, 5056, 1, 3328, 5888, 5888, 3328, 3328] + - [14, 21766.0] + - - [1408, 6784, 1, 32, 1408, 1408, 32, 32] + - [0, 9552.0] + - - [3584, 5888, 1, 3328, 3584, 3584, 3328, 3328] + - [40, 21710.0] + - - [4288, 1408, 1, 256, 4288, 4288, 256, 256] + - [13, 15672.0] + - - [6784, 2368, 1, 256, 6784, 6784, 256, 256] + - [38, 19865.0] + - - [1856, 1408, 1, 1280, 1856, 1856, 1280, 1280] + - [12, 17836.0] + - - [1856, 2368, 1, 1, 1856, 1856, 1, 1] + - [26, 318.0] + - - [1408, 5056, 1, 3328, 1408, 1408, 3328, 3328] + - [14, 20792.0] + - - [5056, 4288, 1, 256, 5056, 5056, 256, 256] + - [48, 20074.0] + - - [5056, 5056, 1, 32, 5056, 5056, 32, 32] + - [41, 16192.0] + - - [448, 5888, 1, 1280, 448, 448, 1280, 1280] + - [7, 17106.0] + - - [5056, 448, 1, 256, 5056, 5056, 256, 256] + - [12, 14922.0] + - - [4288, 5888, 1, 1, 4288, 4288, 1, 1] + - [8, 801.0] + - - [1856, 5056, 1, 1280, 1856, 1856, 1280, 1280] + - [14, 20210.0] + - - [2368, 4288, 1, 1, 2368, 2368, 1, 1] + - [31, 386.0] + - - [3584, 1856, 1, 256, 3584, 3584, 256, 256] + - [22, 19091.0] + - - [4288, 5888, 1, 32, 4288, 4288, 32, 32] + - [34, 16309.0] + - - [4288, 5888, 1, 3328, 4288, 4288, 3328, 3328] + - [14, 21649.0] + - - [1024, 2944, 1, 3328, 1024, 1024, 3328, 3328] + - [24, 19174.0] + - - [2944, 2368, 1, 256, 2944, 2944, 256, 256] + - [7, 17243.0] + - - [1024, 1856, 1, 256, 1024, 1024, 256, 256] + - [31, 15436.0] + - - [1024, 5888, 1, 32, 1024, 1024, 32, 32] + - [17, 10486.0] + - - [1024, 5888, 1, 3328, 1024, 1024, 3328, 3328] + - [12, 20303.0] + - - [5056, 2368, 1, 32, 5056, 5056, 32, 32] + - [8, 11666.0] + - - [1408, 2368, 1, 1280, 1408, 1408, 1280, 1280] + - [30, 18741.0] + - - [5056, 6784, 1, 3328, 5056, 5056, 3328, 3328] + - [14, 21378.0] + - - [1408, 2944, 1, 256, 1408, 1408, 256, 256] + - [38, 18462.0] + - - [704, 5056, 1, 32, 704, 704, 32, 32] + - [8, 8450.0] + - - [5056, 4288, 1, 1280, 5056, 5056, 1280, 1280] + - [24, 21300.0] + - - [4288, 448, 1, 1, 4288, 4288, 1, 1] + - [4, 299.0] + - - [5888, 5888, 1, 1, 5888, 5888, 1, 1] + - [5, 631.0] + - - [2944, 704, 1, 1280, 2944, 2944, 1280, 1280] + - [12, 17857.0] + - - [1024, 3584, 1, 1280, 1024, 1024, 1280, 1280] + - [24, 19731.0] + - - [2368, 2944, 1, 1, 2368, 2368, 1, 1] + - [41, 423.0] + - - [5056, 256, 1, 32, 5056, 5056, 32, 32] + - [6, 4360.0] + - - [5056, 1024, 1, 1280, 5056, 5056, 1280, 1280] + - [24, 20838.0] + - - [3584, 6784, 1, 256, 3584, 3584, 256, 256] + - [30, 20720.0] + - - [1856, 1408, 1, 256, 1856, 1856, 256, 256] + - [38, 16478.0] + - - [4288, 4288, 1, 32, 4288, 4288, 32, 32] + - [41, 14828.0] + - - [5888, 448, 1, 1, 5888, 5888, 1, 1] + - [0, 374.0] + - - [5056, 5056, 1, 1280, 5056, 5056, 1280, 1280] + - [24, 21355.0] + - - [6784, 1408, 1, 3328, 6784, 6784, 3328, 3328] + - [12, 20117.0] + - - [5888, 5888, 1, 3328, 5888, 5888, 3328, 3328] + - [7, 22099.0] + - - [5888, 1408, 1, 32, 5888, 5888, 32, 32] + - [43, 8951.0] + - - [256, 6784, 1, 3328, 256, 256, 3328, 3328] + - [7, 18436.0] + - - [6784, 2368, 1, 1280, 6784, 6784, 1280, 1280] + - [46, 20595.0] + - - [2944, 1408, 1, 1, 2944, 2944, 1, 1] + - [43, 339.0] + - - [6784, 1024, 1, 256, 6784, 6784, 256, 256] + - [32, 19440.0] + - - [5056, 1408, 1, 32, 5056, 5056, 32, 32] + - [26, 11470.0] + - - [1408, 6784, 1, 3328, 1408, 1408, 3328, 3328] + - [12, 19484.0] + - - [2944, 1408, 1, 3328, 2944, 2944, 3328, 3328] + - [30, 20074.0] + - - [704, 2368, 1, 32, 704, 704, 32, 32] + - [28, 4746.0] + - - [704, 6784, 1, 1, 704, 704, 1, 1] + - [0, 309.0] + - - [2368, 6784, 1, 256, 2368, 2368, 256, 256] + - [22, 19944.0] + - - [1856, 3584, 1, 3328, 1856, 1856, 3328, 3328] + - [7, 20256.0] + - - [704, 6784, 1, 256, 704, 704, 256, 256] + - [22, 16958.0] + - - [6784, 2944, 1, 32, 6784, 6784, 32, 32] + - [26, 15812.0] + - - [5888, 2368, 1, 3328, 5888, 5888, 3328, 3328] + - [38, 20060.0] + - - [2368, 704, 1, 1280, 2368, 2368, 1280, 1280] + - [30, 16151.0] + - - [1024, 1408, 1, 1280, 1024, 1024, 1280, 1280] + - [30, 16638.0] + - - [2944, 5056, 1, 32, 2944, 2944, 32, 32] + - [17, 15637.0] + - - [704, 2368, 1, 3328, 704, 704, 3328, 3328] + - [24, 17579.0] + - - [3584, 2944, 1, 256, 3584, 3584, 256, 256] + - [30, 20110.0] + - - [3584, 1024, 1, 1280, 3584, 3584, 1280, 1280] + - [12, 19405.0] + - - [5056, 3584, 1, 256, 5056, 5056, 256, 256] + - [48, 20411.0] + - - [2368, 704, 1, 256, 2368, 2368, 256, 256] + - [30, 14788.0] + - - [1856, 1856, 1, 1280, 1856, 1856, 1280, 1280] + - [30, 17851.0] + - - [4288, 704, 1, 1, 4288, 4288, 1, 1] + - [0, 285.0] + - - [1856, 1024, 1, 1, 1856, 1856, 1, 1] + - [41, 228.0] + - - [4288, 2944, 1, 3328, 4288, 4288, 3328, 3328] + - [30, 20462.0] + - - [4288, 704, 1, 32, 4288, 4288, 32, 32] + - [17, 7828.0] + - - [1856, 1024, 1, 32, 1856, 1856, 32, 32] + - [2, 6070.0] + - - [2944, 6784, 1, 1, 2944, 2944, 1, 1] + - [34, 871.0] + - - [6784, 2368, 1, 32, 6784, 6784, 32, 32] + - [26, 16278.0] + - - [5888, 5056, 1, 1, 5888, 5888, 1, 1] + - [17, 864.0] + - - [704, 5888, 1, 1, 704, 704, 1, 1] + - [23, 318.0] + - - [6784, 6784, 1, 1, 6784, 6784, 1, 1] + - [43, 950.0] + - - [5888, 448, 1, 3328, 5888, 5888, 3328, 3328] + - [7, 17282.0] + - - [704, 5888, 1, 32, 704, 704, 32, 32] + - [19, 8396.0] + - - [704, 5888, 1, 1280, 704, 704, 1280, 1280] + - [14, 19181.0] + - - [1024, 6784, 1, 3328, 1024, 1024, 3328, 3328] + - [7, 20148.0] + - - [704, 2944, 1, 1280, 704, 704, 1280, 1280] + - [30, 17382.0] + - - [4288, 6784, 1, 256, 4288, 4288, 256, 256] + - [30, 20384.0] + - - [1408, 1408, 1, 32, 1408, 1408, 32, 32] + - [41, 9248.0] + - - [1408, 1408, 1, 3328, 1408, 1408, 3328, 3328] + - [46, 17795.0] + - - [2944, 1856, 1, 256, 2944, 2944, 256, 256] + - [38, 17626.0] + - - [4288, 2944, 1, 1, 4288, 4288, 1, 1] + - [2, 501.0] + - - [6784, 5056, 1, 32, 6784, 6784, 32, 32] + - [34, 16630.0] + - - [2944, 4288, 1, 1280, 2944, 2944, 1280, 1280] + - [32, 21178.0] + - - [1024, 4288, 1, 256, 1024, 1024, 256, 256] + - [30, 18160.0] + - - [2368, 5888, 1, 1, 2368, 2368, 1, 1] + - [50, 402.0] + - - [1408, 1856, 1, 32, 1408, 1408, 32, 32] + - [41, 8896.0] + - - [1856, 6784, 1, 3328, 1856, 1856, 3328, 3328] + - [38, 19396.0] + - - [1024, 2368, 1, 32, 1024, 1024, 32, 32] + - [4, 7080.0] + - - [2368, 2368, 1, 3328, 2368, 2368, 3328, 3328] + - [30, 19039.0] + - - [3584, 5888, 1, 32, 3584, 3584, 32, 32] + - [34, 14423.0] + - - [3584, 5888, 1, 1280, 3584, 3584, 1280, 1280] + - [32, 21797.0] + - - [6784, 704, 1, 256, 6784, 6784, 256, 256] + - [12, 17235.0] + - - [3584, 1024, 1, 32, 3584, 3584, 32, 32] + - [17, 9291.0] + - - [2368, 5888, 1, 256, 2368, 2368, 256, 256] + - [22, 19810.0] + - - [5888, 5888, 1, 32, 5888, 5888, 32, 32] + - [45, 12267.0] + - - [1856, 1408, 1, 3328, 1856, 1856, 3328, 3328] + - [5, 18096.0] + - - [4288, 1024, 1, 1, 4288, 4288, 1, 1] + - [4, 321.0] + - - [704, 4288, 1, 3328, 704, 704, 3328, 3328] + - [12, 17430.0] + - - [2944, 5056, 1, 1280, 2944, 2944, 1280, 1280] + - [32, 21184.0] + - - [6784, 2944, 1, 1280, 6784, 6784, 1280, 1280] + - [12, 21247.0] + - - [6784, 256, 1, 3328, 6784, 6784, 3328, 3328] + - [30, 17851.0] + - - [1408, 5056, 1, 32, 1408, 1408, 32, 32] + - [41, 11682.0] + - - [5888, 1856, 1, 1280, 5888, 5888, 1280, 1280] + - [12, 20310.0] + - - [5888, 256, 1, 1280, 5888, 5888, 1280, 1280] + - [12, 17991.0] + - - [1856, 5056, 1, 1, 1856, 1856, 1, 1] + - [26, 413.0] + - - [3584, 1856, 1, 1280, 3584, 3584, 1280, 1280] + - [30, 20156.0] + - - [6784, 448, 1, 256, 6784, 6784, 256, 256] + - [38, 15743.0] + - - [704, 3584, 1, 256, 704, 704, 256, 256] + - [12, 15988.0] + - - [1856, 5056, 1, 32, 1856, 1856, 32, 32] + - [34, 11954.0] + - - [1856, 5056, 1, 3328, 1856, 1856, 3328, 3328] + - [14, 20209.0] + - - [1024, 2944, 1, 32, 1024, 1024, 32, 32] + - [26, 7986.0] + - - [1408, 6784, 1, 256, 1408, 1408, 256, 256] + - [22, 20043.0] + - - [1024, 2368, 1, 1280, 1024, 1024, 1280, 1280] + - [7, 19009.0] + - - [1856, 3584, 1, 1, 1856, 1856, 1, 1] + - [41, 416.0] + - - [2944, 5888, 1, 1280, 2944, 2944, 1280, 1280] + - [24, 21720.0] + - - [3584, 3584, 1, 256, 3584, 3584, 256, 256] + - [22, 20306.0] + - - [1856, 2368, 1, 3328, 1856, 1856, 3328, 3328] + - [5, 18782.0] + - - [5888, 704, 1, 256, 5888, 5888, 256, 256] + - [46, 17204.0] + - - [6784, 4288, 1, 256, 6784, 6784, 256, 256] + - [24, 20528.0] + - - [1408, 2368, 1, 3328, 1408, 1408, 3328, 3328] + - [30, 19139.0] + - - [1024, 3584, 1, 256, 1024, 1024, 256, 256] + - [5, 17201.0] + - - [4288, 1024, 1, 32, 4288, 4288, 32, 32] + - [26, 9418.0] + - - [5888, 1856, 1, 3328, 5888, 5888, 3328, 3328] + - [38, 19850.0] + - - [2368, 3584, 1, 256, 2368, 2368, 256, 256] + - [30, 18720.0] + - - [4288, 1408, 1, 3328, 4288, 4288, 3328, 3328] + - [30, 20356.0] + - - [256, 5056, 1, 256, 256, 256, 256, 256] + - [13, 12953.0] + - - [5888, 2944, 1, 3328, 5888, 5888, 3328, 3328] + - [22, 20042.0] + - - [2368, 1408, 1, 3328, 2368, 2368, 3328, 3328] + - [12, 18959.0] + - - [5888, 704, 1, 32, 5888, 5888, 32, 32] + - [8, 9943.0] + - - [2944, 704, 1, 1, 2944, 2944, 1, 1] + - [0, 250.0] + - - [6784, 1856, 1, 256, 6784, 6784, 256, 256] + - [22, 19543.0] + - - [1856, 1856, 1, 1, 1856, 1856, 1, 1] + - [8, 340.0] + - - [2944, 704, 1, 3328, 2944, 2944, 3328, 3328] + - [5, 18173.0] + - - [2368, 1856, 1, 32, 2368, 2368, 32, 32] + - [0, 11092.0] + - - [5056, 4288, 1, 3328, 5056, 5056, 3328, 3328] + - [14, 18961.0] + - - [3584, 448, 1, 3328, 3584, 3584, 3328, 3328] + - [5, 16215.0] + - - [256, 6784, 1, 1, 256, 256, 1, 1] + - [0, 204.0] + - - [1024, 3584, 1, 32, 1024, 1024, 32, 32] + - [34, 8610.0] + - - [256, 6784, 1, 32, 256, 256, 32, 32] + - [0, 6134.0] + - - [2944, 1408, 1, 32, 2944, 2944, 32, 32] + - [34, 9516.0] + - - [4288, 3584, 1, 1, 4288, 4288, 1, 1] + - [21, 580.0] + - - [5056, 448, 1, 3328, 5056, 5056, 3328, 3328] + - [24, 17675.0] + - - [6784, 3584, 1, 32, 6784, 6784, 32, 32] + - [8, 15437.0] + - - [4288, 1856, 1, 256, 4288, 4288, 256, 256] + - [30, 18705.0] + - - [1856, 2944, 1, 256, 1856, 1856, 256, 256] + - [5, 18271.0] + - - [2944, 5888, 1, 1, 2944, 2944, 1, 1] + - [10, 421.0] + - - [1024, 1856, 1, 3328, 1024, 1024, 3328, 3328] + - [46, 18596.0] + - - [5888, 1024, 1, 32, 5888, 5888, 32, 32] + - [41, 12432.0] + - - [1408, 5056, 1, 1280, 1408, 1408, 1280, 1280] + - [30, 20687.0] + - - [5056, 6784, 1, 256, 5056, 5056, 256, 256] + - [22, 20644.0] + - - [2944, 5056, 1, 1, 2944, 2944, 1, 1] + - [34, 524.0] + - - [5888, 5888, 1, 1280, 5888, 5888, 1280, 1280] + - [7, 22002.0] + - - [5056, 2944, 1, 1280, 5056, 5056, 1280, 1280] + - [30, 20728.0] + - - [2368, 1856, 1, 1280, 2368, 2368, 1280, 1280] + - [12, 18593.0] + - - [6784, 2944, 1, 1, 6784, 6784, 1, 1] + - [22, 530.0] + - - [2944, 1024, 1, 32, 2944, 2944, 32, 32] + - [0, 11114.0] + - - [2944, 1024, 1, 1280, 2944, 2944, 1280, 1280] + - [7, 19103.0] + - - [5056, 5056, 1, 1, 5056, 5056, 1, 1] + - [21, 813.0] + - - [2368, 4288, 1, 256, 2368, 2368, 256, 256] + - [30, 18952.0] + - - [2944, 6784, 1, 1280, 2944, 2944, 1280, 1280] + - [7, 21327.0] + - - [256, 6784, 1, 1280, 256, 256, 1280, 1280] + - [7, 17830.0] + - - [3584, 2368, 1, 32, 3584, 3584, 32, 32] + - [17, 12071.0] + - - [6784, 3584, 1, 3328, 6784, 6784, 3328, 3328] + - [14, 21839.0] + - - [2944, 2944, 1, 256, 2944, 2944, 256, 256] + - [38, 19439.0] + - - [1408, 1024, 1, 3328, 1408, 1408, 3328, 3328] + - [30, 17138.0] + - - [5056, 2368, 1, 1280, 5056, 5056, 1280, 1280] + - [30, 20128.0] + - - [2944, 1024, 1, 1, 2944, 2944, 1, 1] + - [2, 322.0] + - - [3584, 704, 1, 256, 3584, 3584, 256, 256] + - [46, 15731.0] + - - [2368, 5888, 1, 3328, 2368, 2368, 3328, 3328] + - [14, 20358.0] + - - [4288, 2368, 1, 1, 4288, 4288, 1, 1] + - [34, 527.0] + - - [1408, 3584, 1, 32, 1408, 1408, 32, 32] + - [34, 10391.0] + - - [2944, 4288, 1, 32, 2944, 2944, 32, 32] + - [34, 13722.0] + - - [5888, 1408, 1, 1280, 5888, 5888, 1280, 1280] + - [46, 20573.0] + - - [3584, 5056, 1, 1280, 3584, 3584, 1280, 1280] + - [7, 21438.0] + - - [5888, 6784, 1, 1280, 5888, 5888, 1280, 1280] + - [32, 21478.0] + - - [3584, 2944, 1, 1, 3584, 3584, 1, 1] + - [13, 336.0] + - - [1024, 1856, 1, 1, 1024, 1024, 1, 1] + - [26, 321.0] + - - [704, 5056, 1, 3328, 704, 704, 3328, 3328] + - [7, 19401.0] + - - [1024, 3584, 1, 3328, 1024, 1024, 3328, 3328] + - [7, 20000.0] + - - [5888, 256, 1, 3328, 5888, 5888, 3328, 3328] + - [30, 17946.0] + - - [1856, 1408, 1, 1, 1856, 1856, 1, 1] + - [0, 279.0] + - - [4288, 5056, 1, 1280, 4288, 4288, 1280, 1280] + - [7, 21254.0] + - - [1856, 1856, 1, 3328, 1856, 1856, 3328, 3328] + - [5, 18250.0] + - - [1024, 2368, 1, 256, 1024, 1024, 256, 256] + - [30, 15155.0] + - - [4288, 2368, 1, 3328, 4288, 4288, 3328, 3328] + - [30, 19819.0] + - - [5888, 3584, 1, 256, 5888, 5888, 256, 256] + - [24, 20767.0] + - - [1024, 5056, 1, 32, 1024, 1024, 32, 32] + - [4, 9769.0] + - - [5888, 448, 1, 1280, 5888, 5888, 1280, 1280] + - [24, 16620.0] + - - [704, 5888, 1, 3328, 704, 704, 3328, 3328] + - [7, 19431.0] + - - [1024, 1408, 1, 256, 1024, 1024, 256, 256] + - [30, 13783.0] + - - [3584, 2944, 1, 1280, 3584, 3584, 1280, 1280] + - [12, 20709.0] + - - [4288, 1856, 1, 1280, 4288, 4288, 1280, 1280] + - [30, 19659.0] + - - [3584, 5888, 1, 1, 3584, 3584, 1, 1] + - [34, 494.0] + - - [5888, 4288, 1, 256, 5888, 5888, 256, 256] + - [14, 20210.0] + - - [1024, 2944, 1, 1280, 1024, 1024, 1280, 1280] + - [24, 18945.0] + - - [2944, 3584, 1, 256, 2944, 2944, 256, 256] + - [22, 20011.0] + - - [5888, 1856, 1, 1, 5888, 5888, 1, 1] + - [11, 382.0] + - - [6784, 2368, 1, 3328, 6784, 6784, 3328, 3328] + - [12, 19571.0] + - - [1408, 4288, 1, 32, 1408, 1408, 32, 32] + - [26, 10523.0] + - - [1856, 1024, 1, 1280, 1856, 1856, 1280, 1280] + - [40, 19005.0] + - - [5888, 1856, 1, 256, 5888, 5888, 256, 256] + - [46, 18741.0] + - - [5056, 1856, 1, 1, 5056, 5056, 1, 1] + - [34, 427.0] + - - [5888, 2368, 1, 256, 5888, 5888, 256, 256] + - [38, 19683.0] + - - [1408, 1024, 1, 32, 1408, 1408, 32, 32] + - [19, 8389.0] + - - [5056, 1856, 1, 32, 5056, 5056, 32, 32] + - [4, 11392.0] + - - [5056, 1856, 1, 1280, 5056, 5056, 1280, 1280] + - [12, 19924.0] + - - [1408, 5888, 1, 3328, 1408, 1408, 3328, 3328] + - [38, 20108.0] + - - [5056, 704, 1, 3328, 5056, 5056, 3328, 3328] + - [7, 18871.0] + - - [5888, 6784, 1, 1, 5888, 5888, 1, 1] + - [39, 750.0] + - - [5888, 4288, 1, 32, 5888, 5888, 32, 32] + - [26, 14290.0] + - - [1408, 3584, 1, 256, 1408, 1408, 256, 256] + - [12, 18771.0] + - - [6784, 256, 1, 1, 6784, 6784, 1, 1] + - [10, 220.0] + - - [6784, 256, 1, 1280, 6784, 6784, 1280, 1280] + - [30, 17187.0] + - - [2368, 704, 1, 3328, 2368, 2368, 3328, 3328] + - [46, 16776.0] + - - [2944, 1856, 1, 32, 2944, 2944, 32, 32] + - [26, 10688.0] + - - [2368, 1408, 1, 256, 2368, 2368, 256, 256] + - [30, 16452.0] + - - [2368, 1856, 1, 1, 2368, 2368, 1, 1] + - [11, 439.0] + - - [4288, 1408, 1, 1, 4288, 4288, 1, 1] + - [26, 398.0] + - - [3584, 2368, 1, 1280, 3584, 3584, 1280, 1280] + - [12, 20094.0] + - - [1408, 2944, 1, 1, 1408, 1408, 1, 1] + - [45, 367.0] + - - [4288, 1408, 1, 32, 4288, 4288, 32, 32] + - [8, 10052.0] + - - [5888, 2944, 1, 256, 5888, 5888, 256, 256] + - [38, 20512.0] + - - [1408, 2944, 1, 32, 1408, 1408, 32, 32] + - [34, 9668.0] + - - [5888, 6784, 1, 256, 5888, 5888, 256, 256] + - [30, 20863.0] + - - [6784, 5888, 1, 1, 6784, 6784, 1, 1] + - [35, 943.0] + - - [6784, 5888, 1, 1280, 6784, 6784, 1280, 1280] + - [24, 21841.0] + - - [1024, 4288, 1, 32, 1024, 1024, 32, 32] + - [8, 8394.0] + - - [3584, 5888, 1, 256, 3584, 3584, 256, 256] + - [22, 20514.0] + - - [5056, 2368, 1, 1, 5056, 5056, 1, 1] + - [34, 512.0] + - - [5056, 448, 1, 1, 5056, 5056, 1, 1] + - [26, 332.0] + - - [2368, 1024, 1, 1280, 2368, 2368, 1280, 1280] + - [24, 19197.0] + - - [1856, 6784, 1, 256, 1856, 1856, 256, 256] + - [22, 19500.0] + - - [5056, 448, 1, 32, 5056, 5056, 32, 32] + - [26, 7965.0] + - - [3584, 2944, 1, 32, 3584, 3584, 32, 32] + - [17, 10281.0] + - - [3584, 1856, 1, 32, 3584, 3584, 32, 32] + - [0, 9641.0] + - - [4288, 1408, 1, 1280, 4288, 4288, 1280, 1280] + - [12, 20079.0] + - - [6784, 2368, 1, 1, 6784, 6784, 1, 1] + - [0, 395.0] + - - [704, 5056, 1, 1, 704, 704, 1, 1] + - [34, 322.0] + - - [2368, 1408, 1, 1280, 2368, 2368, 1280, 1280] + - [12, 18922.0] + - - [5888, 1408, 1, 1, 5888, 5888, 1, 1] + - [19, 455.0] + - - [1024, 4288, 1, 1280, 1024, 1024, 1280, 1280] + - [7, 19461.0] + - - [1856, 4288, 1, 1, 1856, 1856, 1, 1] + - [0, 491.0] + - - [3584, 4288, 1, 256, 3584, 3584, 256, 256] + - [38, 19848.0] + - - [2368, 2944, 1, 256, 2368, 2368, 256, 256] + - [5, 19030.0] + - - [704, 5056, 1, 256, 704, 704, 256, 256] + - [38, 17038.0] + - - [1856, 4288, 1, 32, 1856, 1856, 32, 32] + - [17, 10585.0] + - - [4288, 1024, 1, 1280, 4288, 4288, 1280, 1280] + - [32, 20044.0] + - - [4288, 6784, 1, 32, 4288, 4288, 32, 32] + - [26, 15494.0] + - - [3584, 1408, 1, 256, 3584, 3584, 256, 256] + - [46, 18205.0] + - - [704, 3584, 1, 3328, 704, 704, 3328, 3328] + - [5, 17293.0] + - - [5056, 448, 1, 1280, 5056, 5056, 1280, 1280] + - [40, 16957.0] + - - [1408, 2944, 1, 1280, 1408, 1408, 1280, 1280] + - [46, 19701.0] + - - [5888, 704, 1, 1280, 5888, 5888, 1280, 1280] + - [40, 18635.0] + - - [4288, 5888, 1, 256, 4288, 4288, 256, 256] + - [48, 20537.0] + - - [3584, 3584, 1, 3328, 3584, 3584, 3328, 3328] + - [14, 20767.0] + - - [2944, 6784, 1, 32, 2944, 2944, 32, 32] + - [41, 14939.0] + - - [5056, 256, 1, 1, 5056, 5056, 1, 1] + - [41, 285.0] + - - [2944, 2368, 1, 3328, 2944, 2944, 3328, 3328] + - [5, 20460.0] + - - [1024, 1856, 1, 1280, 1024, 1024, 1280, 1280] + - [46, 18231.0] + - - [448, 5888, 1, 256, 448, 448, 256, 256] + - [22, 14234.0] + - - [1024, 5888, 1, 256, 1024, 1024, 256, 256] + - [12, 18410.0] + - - [6784, 2944, 1, 3328, 6784, 6784, 3328, 3328] + - [22, 19681.0] + - - [1408, 2368, 1, 256, 1408, 1408, 256, 256] + - [21, 13459.0] + - - [1408, 5056, 1, 256, 1408, 1408, 256, 256] + - [22, 18800.0] + - - [1024, 1408, 1, 32, 1024, 1024, 32, 32] + - [50, 6373.0] + - - [6784, 704, 1, 1, 6784, 6784, 1, 1] + - [34, 393.0] + - - [704, 3584, 1, 32, 704, 704, 32, 32] + - [47, 8026.0] + - - [4288, 4288, 1, 1, 4288, 4288, 1, 1] + - [35, 672.0] + - - [5056, 2944, 1, 1, 5056, 5056, 1, 1] + - [34, 626.0] + - - [6784, 4288, 1, 3328, 6784, 6784, 3328, 3328] + - [14, 21662.0] + - - [5056, 2944, 1, 3328, 5056, 5056, 3328, 3328] + - [22, 20343.0] + - - [2368, 1856, 1, 3328, 2368, 2368, 3328, 3328] + - [12, 18702.0] + - - [1856, 4288, 1, 1280, 1856, 1856, 1280, 1280] + - [24, 20355.0] + - - [3584, 448, 1, 1, 3584, 3584, 1, 1] + - [29, 302.0] + - - [2944, 1024, 1, 3328, 2944, 2944, 3328, 3328] + - [24, 19418.0] + - - [5888, 5056, 1, 32, 5888, 5888, 32, 32] + - [17, 14960.0] + - - [704, 2944, 1, 1, 704, 704, 1, 1] + - [9, 260.0] + - - [3584, 448, 1, 32, 3584, 3584, 32, 32] + - [4, 7019.0] + - - [3584, 448, 1, 1280, 3584, 3584, 1280, 1280] + - [12, 15763.0] + - - [2944, 6784, 1, 3328, 2944, 2944, 3328, 3328] + - [40, 21456.0] + - - [1856, 2368, 1, 1280, 1856, 1856, 1280, 1280] + - [46, 18009.0] + - - [6784, 1024, 1, 1280, 6784, 6784, 1280, 1280] + - [48, 21030.0] + - - [6784, 3584, 1, 1280, 6784, 6784, 1280, 1280] + - [7, 21898.0] + - - [1408, 1408, 1, 1, 1408, 1408, 1, 1] + - [10, 324.0] + - - [1408, 4288, 1, 256, 1408, 1408, 256, 256] + - [22, 19086.0] + - - [256, 5056, 1, 3328, 256, 256, 3328, 3328] + - [7, 20005.0] + - - [448, 6784, 1, 1, 448, 448, 1, 1] + - [27, 252.0] + - - [704, 2944, 1, 256, 704, 704, 256, 256] + - [12, 14689.0] + - - [1408, 1408, 1, 256, 1408, 1408, 256, 256] + - [22, 15069.0] + - - [448, 6784, 1, 32, 448, 448, 32, 32] + - [41, 8531.0] + - - [1408, 1856, 1, 1, 1408, 1408, 1, 1] + - [11, 417.0] + - - [4288, 448, 1, 32, 4288, 4288, 32, 32] + - [17, 5980.0] + - - [4288, 448, 1, 1280, 4288, 4288, 1280, 1280] + - [46, 16289.0] + - - [2944, 704, 1, 32, 2944, 2944, 32, 32] + - [17, 6810.0] + - - [448, 4288, 1, 1, 448, 448, 1, 1] + - [43, 195.0] + - - [3584, 5056, 1, 1, 3584, 3584, 1, 1] + - [29, 798.0] + - - [1408, 3584, 1, 1280, 1408, 1408, 1280, 1280] + - [24, 20299.0] + - - [6784, 448, 1, 1280, 6784, 6784, 1280, 1280] + - [46, 16961.0] + - - [3584, 5056, 1, 3328, 3584, 3584, 3328, 3328] + - [14, 19576.0] + - - [2368, 2368, 1, 32, 2368, 2368, 32, 32] + - [45, 9892.0] + - - [5888, 2944, 1, 32, 5888, 5888, 32, 32] + - [9, 15794.0] + - - [1856, 2944, 1, 32, 1856, 1856, 32, 32] + - [0, 10358.0] + - - [5056, 1408, 1, 1, 5056, 5056, 1, 1] + - [41, 417.0] + - - [5888, 1408, 1, 3328, 5888, 5888, 3328, 3328] + - [12, 20155.0] + - - [448, 4288, 1, 256, 448, 448, 256, 256] + - [38, 13555.0] + - - [6784, 1024, 1, 1, 6784, 6784, 1, 1] + - [42, 487.0] + - - [6784, 1024, 1, 32, 6784, 6784, 32, 32] + - [43, 12674.0] + - - [6784, 3584, 1, 1, 6784, 6784, 1, 1] + - [34, 859.0] + - - [2944, 2368, 1, 32, 2944, 2944, 32, 32] + - [26, 11571.0] + - - [3584, 6784, 1, 3328, 3584, 3584, 3328, 3328] + - [7, 21521.0] + - - [6784, 1408, 1, 256, 6784, 6784, 256, 256] + - [38, 19628.0] + - - [5056, 1024, 1, 32, 5056, 5056, 32, 32] + - [41, 9655.0] + - - [1024, 5056, 1, 1280, 1024, 1024, 1280, 1280] + - [7, 20663.0] + - - [4288, 3584, 1, 256, 4288, 4288, 256, 256] + - [14, 20332.0] + - - [448, 6784, 1, 1280, 448, 448, 1280, 1280] + - [12, 17367.0] + - - [1856, 5888, 1, 1, 1856, 1856, 1, 1] + - [4, 453.0] + - - [256, 5888, 1, 32, 256, 256, 32, 32] + - [8, 5688.0] + - - [4288, 5056, 1, 32, 4288, 4288, 32, 32] + - [17, 15949.0] + - - [4288, 5056, 1, 3328, 4288, 4288, 3328, 3328] + - [32, 19736.0] + - - [1856, 5888, 1, 32, 1856, 1856, 32, 32] + - [27, 10316.0] + - - [1856, 5888, 1, 1280, 1856, 1856, 1280, 1280] + - [7, 20562.0] + - - [704, 2368, 1, 1280, 704, 704, 1280, 1280] + - [24, 17261.0] + - - [4288, 2368, 1, 1280, 4288, 4288, 1280, 1280] + - [30, 20018.0] + - - [2944, 5056, 1, 256, 2944, 2944, 256, 256] + - [5, 20136.0] + - - [2944, 4288, 1, 1, 2944, 2944, 1, 1] + - [34, 378.0] + - - [5056, 5888, 1, 32, 5056, 5056, 32, 32] + - [34, 16295.0] + - - [2368, 5056, 1, 3328, 2368, 2368, 3328, 3328] + - [7, 20832.0] + - - [1024, 5056, 1, 3328, 1024, 1024, 3328, 3328] + - [7, 21239.0] + - - [1024, 6784, 1, 32, 1024, 1024, 32, 32] + - [17, 11027.0] + - - [3584, 2944, 1, 3328, 3584, 3584, 3328, 3328] + - [38, 20407.0] + - - [1408, 5888, 1, 1, 1408, 1408, 1, 1] + - [2, 437.0] + - - [704, 4288, 1, 32, 704, 704, 32, 32] + - [41, 7318.0] + - - [1408, 5888, 1, 32, 1408, 1408, 32, 32] + - [41, 12259.0] + - - [6784, 1024, 1, 3328, 6784, 6784, 3328, 3328] + - [48, 21329.0] + - - [5056, 1408, 1, 256, 5056, 5056, 256, 256] + - [46, 19634.0] + - - [2944, 1408, 1, 256, 2944, 2944, 256, 256] + - [38, 19231.0] + - - [2368, 2368, 1, 256, 2368, 2368, 256, 256] + - [30, 18043.0] + - - [1408, 6784, 1, 1, 1408, 1408, 1, 1] + - [8, 353.0] + - - [6784, 6784, 1, 1280, 6784, 6784, 1280, 1280] + - [7, 21466.0] + - - [1024, 5888, 1, 1, 1024, 1024, 1, 1] + - [9, 300.0] + - - [1856, 3584, 1, 256, 1856, 1856, 256, 256] + - [38, 18514.0] + - - [2368, 704, 1, 32, 2368, 2368, 32, 32] + - [26, 7578.0] + - - [256, 5888, 1, 1280, 256, 256, 1280, 1280] + - [30, 17363.0] + - - [1856, 1024, 1, 3328, 1856, 1856, 3328, 3328] + - [24, 19949.0] + - - [5056, 2368, 1, 3328, 5056, 5056, 3328, 3328] + - [22, 20019.0] + - - [704, 4288, 1, 1280, 704, 704, 1280, 1280] + - [12, 16940.0] + - - [2560, 7000, 1, 2560, 2560, 2560, 2560, 2560] + - [7, 20163.0] + - - [7680, 12000, 1, 2560, 7680, 7680, 2560, 2560] + - [14, 20994.0] + - - [5124, 9124, 1, 1760, 5124, 5124, 1760, 1760] + - [20, 21637.0] + - - [512, 24000, 1, 1536, 512, 512, 1536, 1536] + - [14, 20471.0] + - - [3072, 24000, 1, 1024, 3072, 3072, 1024, 1024] + - [14, 21144.0] + - - [512, 48000, 1, 2816, 512, 512, 2816, 2816] + - [14, 21251.0] + - - [512, 48000, 1, 2048, 512, 512, 2048, 2048] + - [12, 20188.0] + - - [2048, 1600, 1, 2048, 2048, 2048, 2048, 2048] + - [12, 17075.0] + - - [512, 48000, 1, 1536, 512, 512, 1536, 1536] + - [14, 20784.0] + - - [8448, 5984, 1, 2816, 8448, 8448, 2816, 2816] + - [32, 21584.0] + - - [4096, 3200, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 20505.0] + - - [1024, 24000, 1, 2560, 1024, 1024, 2560, 2560] + - [14, 20457.0] + - - [1760, 6400, 1, 1760, 1760, 1760, 1760, 1760] + - [20, 21517.0] + - - [5124, 9124, 1, 2048, 5124, 5124, 2048, 2048] + - [48, 20678.0] + - - [16384, 3200, 1, 4096, 16384, 16384, 4096, 4096] + - [14, 19126.0] + - - [1024, 48000, 1, 2560, 1024, 1024, 2560, 2560] + - [14, 20457.0] + - - [8448, 48000, 1, 2816, 8448, 8448, 2816, 2816] + - [14, 22113.0] + - - [2560, 3200, 1, 2560, 2560, 2560, 2560, 2560] + - [14, 20580.0] + - - [16384, 800, 1, 4096, 16384, 16384, 4096, 4096] + - [22, 16684.0] + - - [4608, 24000, 1, 1536, 4608, 4608, 1536, 1536] + - [14, 21770.0] + - - [7680, 48000, 1, 2560, 7680, 7680, 2560, 2560] + - [14, 21803.0] + - - [3072, 48000, 1, 1024, 3072, 3072, 1024, 1024] + - [14, 21401.0] + - - [8192, 3200, 1, 2048, 8192, 8192, 2048, 2048] + - [12, 17808.0] + - - [512, 24000, 1, 2816, 512, 512, 2816, 2816] + - [40, 21451.0] + - - [4096, 400, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 14189.0] + - - [6144, 48000, 1, 2560, 6144, 6144, 2560, 2560] + - [14, 21137.0] + - - [4608, 48000, 1, 1536, 4608, 4608, 1536, 1536] + - [14, 21261.0] + - - [2048, 800, 1, 512, 2048, 2048, 512, 512] + - [31, 12505.0] + - - [4608, 5984, 1, 1536, 4608, 4608, 1536, 1536] + - [14, 21211.0] + - - [4096, 1600, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 19140.0] + - - [6144, 5984, 1, 2048, 6144, 6144, 2048, 2048] + - [12, 21005.0] + - - [7680, 24000, 1, 2560, 7680, 7680, 2560, 2560] + - [14, 21546.0] + - - [6144, 48000, 1, 2048, 6144, 6144, 2048, 2048] + - [12, 20151.0] + - - [2048, 3200, 1, 2048, 2048, 2048, 2048, 2048] + - [12, 19650.0] + - - [5124, 9124, 1, 2560, 5124, 5124, 2560, 2560] + - [14, 21176.0] + - - [1024, 24000, 1, 1536, 1024, 1024, 1536, 1536] + - [14, 20576.0] + - - [2560, 6400, 1, 2560, 2560, 2560, 2560, 2560] + - [14, 21058.0] + - - [512, 24000, 1, 2560, 512, 512, 2560, 2560] + - [14, 20450.0] + - - [1024, 24000, 1, 2816, 1024, 1024, 2816, 2816] + - [14, 21399.0] + - - [7680, 5984, 1, 2560, 7680, 7680, 2560, 2560] + - [14, 21391.0] + - - [2048, 1600, 1, 512, 2048, 2048, 512, 512] + - [46, 14985.0] + - - [2048, 7000, 1, 2048, 2048, 2048, 2048, 2048] + - [12, 19754.0] + - - [1760, 800, 1, 1760, 1760, 1760, 1760, 1760] + - [22, 16674.0] + - - [2560, 1600, 1, 2560, 2560, 2560, 2560, 2560] + - [48, 18922.0] + - - [2048, 3200, 1, 512, 2048, 2048, 512, 512] + - [12, 19944.0] + - - [2560, 800, 1, 2560, 2560, 2560, 2560, 2560] + - [30, 16877.0] + - - [4608, 12000, 1, 1536, 4608, 4608, 1536, 1536] + - [14, 21674.0] + - - [6144, 24000, 1, 2048, 6144, 6144, 2048, 2048] + - [12, 19072.0] + - - [8192, 800, 1, 2048, 8192, 8192, 2048, 2048] + - [12, 17826.0] + - - [5124, 9124, 1, 4096, 5124, 5124, 4096, 4096] + - [48, 18374.0] + - - [8448, 24000, 1, 2816, 8448, 8448, 2816, 2816] + - [32, 21879.0] + - - [1024, 48000, 1, 1536, 1024, 1024, 1536, 1536] + - [14, 21016.0] + - - [8192, 1600, 1, 2048, 8192, 8192, 2048, 2048] + - [12, 19709.0] + - - [4096, 800, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 16422.0] + - - [2048, 800, 1, 2048, 2048, 2048, 2048, 2048] + - [12, 15604.0] + - - [1760, 3200, 1, 1760, 1760, 1760, 1760, 1760] + - [38, 20038.0] + - - [512, 48000, 1, 2560, 512, 512, 2560, 2560] + - [14, 21015.0] + - - [512, 24000, 1, 2048, 512, 512, 2048, 2048] + - [12, 20235.0] + - - [16384, 1600, 1, 4096, 16384, 16384, 4096, 4096] + - [14, 18272.0] + - - [1024, 24000, 1, 2048, 1024, 1024, 2048, 2048] + - [12, 19949.0] + - - [8192, 400, 1, 2048, 8192, 8192, 2048, 2048] + - [12, 14926.0] + - - [2048, 6400, 1, 2048, 2048, 2048, 2048, 2048] + - [12, 20055.0] + - - [6144, 12000, 1, 2048, 6144, 6144, 2048, 2048] + - [14, 20261.0] + - - [1760, 7000, 1, 1760, 1760, 1760, 1760, 1760] + - [20, 21327.0] + - - [1024, 48000, 1, 2816, 1024, 1024, 2816, 2816] + - [14, 21547.0] + - - [4096, 7000, 1, 4096, 4096, 4096, 4096, 4096] + - [15, 12961.0] + - - [6144, 24000, 1, 2560, 6144, 6144, 2560, 2560] + - [14, 20830.0] + - - [8448, 12000, 1, 2816, 8448, 8448, 2816, 2816] + - [14, 21578.0] + - - [16384, 400, 1, 4096, 16384, 16384, 4096, 4096] + - [14, 15899.0] + - - [1760, 1600, 1, 1760, 1760, 1760, 1760, 1760] + - [3, 18970.0] + - - [1024, 48000, 1, 2048, 1024, 1024, 2048, 2048] + - [12, 20153.0] + - - [4096, 4096, 1, 4096, 4096, 4096, 4096, 4096] + - [16, 16082.0] + - - [2048, 2048, 1, 2049, 2048, 2048, 2049, 2049] + - [22, 19802.0] + - - [8192, 8191, 1, 8192, 8192, 8192, 8192, 8192] + - [14, 18346.0] + - - [8192, 8192, 1, 8192, 8192, 8192, 8192, 8192] + - [14, 18444.0] + - - [2047, 2048, 1, 2048, 2047, 2047, 2048, 2048] + - [14, 18086.0] + - - [2048, 2049, 1, 2048, 2048, 2048, 2048, 2048] + - [12, 17489.0] + - - [8192, 8192, 1, 8191, 8192, 8192, 8191, 8191] + - [7, 21993.0] + - - [3072, 513, 1, 3072, 3072, 3072, 3072, 3072] + - [30, 14516.0] + - - [8191, 8192, 1, 8192, 8191, 8191, 8192, 8192] + - [14, 18180.0] + - - [8192, 8193, 1, 8192, 8192, 8192, 8192, 8192] + - [14, 17986.0] + - - [4096, 4097, 1, 4096, 4096, 4096, 4096, 4096] + - [16, 15468.0] + - - [8192, 8192, 1, 8193, 8192, 8192, 8193, 8193] + - [14, 21941.0] + - - [4096, 4095, 1, 4096, 4096, 4096, 4096, 4096] + - [16, 16317.0] + - - [4096, 4096, 1, 4097, 4096, 4096, 4097, 4097] + - [32, 20336.0] + - - [2048, 2048, 1, 2048, 2048, 2048, 2048, 2048] + - [14, 18274.0] + - - [4095, 4096, 1, 4096, 4095, 4095, 4096, 4096] + - [16, 16260.0] + - - [8193, 8192, 1, 8192, 8193, 8193, 8192, 8192] + - [14, 18198.0] + - - [4096, 4096, 1, 4095, 4096, 4096, 4095, 4095] + - [24, 20160.0] + - - [3072, 511, 1, 3072, 3072, 3072, 3072, 3072] + - [13, 16595.0] + - - [2049, 2048, 1, 2048, 2049, 2049, 2048, 2048] + - [14, 18739.0] + - - [2048, 2047, 1, 2048, 2048, 2048, 2048, 2048] + - [14, 18228.0] + - - [2048, 2048, 1, 2047, 2048, 2048, 2047, 2047] + - [5, 20172.0] + - - [4097, 4096, 1, 4096, 4097, 4097, 4096, 4096] + - [48, 19036.0] + - - [128, 128, 512, 64, 128, 128, 64, 64] + - [39, 11360.0] + - - [512, 512, 64, 64, 512, 512, 64, 64] + - [31, 16973.0] + - - [1024, 2048, 1, 1024, 1024, 1024, 1024, 1024] + - [12, 17462.0] + - - [1024, 2048, 1, 4096, 1024, 1024, 4096, 4096] + - [30, 16872.0] + - - [1024, 4096, 1, 1024, 1024, 1024, 1024, 1024] + - [12, 19731.0] + - - [1024, 4096, 1, 4096, 1024, 1024, 4096, 4096] + - [14, 19104.0] + - - [4096, 2048, 1, 1024, 4096, 4096, 1024, 1024] + - [14, 20696.0] + - - [4096, 4096, 1, 1024, 4096, 4096, 1024, 1024] + - [14, 21315.0] + - - [30528, 2048, 1, 1024, 30528, 30528, 1024, 1024] + - [14, 21379.0] + - - [30528, 4096, 1, 1024, 30528, 30528, 1024, 1024] + - [14, 21452.0] + - - [128, 32768, 1, 256, 128, 128, 256, 256] + - [46, 18392.0] + - - [256, 4608, 1, 1024, 256, 256, 1024, 1024] + - [14, 17859.0] + - - [256, 4864, 1, 1024, 256, 256, 1024, 1024] + - [48, 17828.0] + - - [256, 5376, 1, 1024, 256, 256, 1024, 1024] + - [12, 15655.0] + - - [256, 5888, 1, 1024, 256, 256, 1024, 1024] + - [12, 16847.0] + - - [256, 6144, 1, 1024, 256, 256, 1024, 1024] + - [12, 17549.0] + - - [256, 6400, 1, 1024, 256, 256, 1024, 1024] + - [12, 18102.0] + - - [256, 6656, 1, 1024, 256, 256, 1024, 1024] + - [12, 16739.0] + - - [256, 7168, 1, 1024, 256, 256, 1024, 1024] + - [14, 17896.0] + - - [256, 7424, 1, 1024, 256, 256, 1024, 1024] + - [14, 18496.0] + - - [256, 7936, 1, 1024, 256, 256, 1024, 1024] + - [12, 17188.0] + - - [256, 8192, 1, 1024, 256, 256, 1024, 1024] + - [12, 17637.0] + - - [256, 8448, 1, 1024, 256, 256, 1024, 1024] + - [12, 18248.0] + - - [256, 8960, 1, 1024, 256, 256, 1024, 1024] + - [12, 18594.0] + - - [256, 9984, 1, 1024, 256, 256, 1024, 1024] + - [14, 19093.0] + - - [256, 10496, 1, 1024, 256, 256, 1024, 1024] + - [12, 17871.0] + - - [256, 11264, 1, 1024, 256, 256, 1024, 1024] + - [12, 18786.0] + - - [256, 11520, 1, 1024, 256, 256, 1024, 1024] + - [12, 19101.0] + - - [256, 11776, 1, 1024, 256, 256, 1024, 1024] + - [14, 18621.0] + - - [256, 12544, 1, 1024, 256, 256, 1024, 1024] + - [32, 19495.0] + - - [256, 13312, 1, 1024, 256, 256, 1024, 1024] + - [12, 18814.0] + - - [256, 14336, 1, 1024, 256, 256, 1024, 1024] + - [32, 18940.0] + - - [256, 14592, 1, 1024, 256, 256, 1024, 1024] + - [48, 19078.0] + - - [256, 14848, 1, 1024, 256, 256, 1024, 1024] + - [32, 19481.0] + - - [256, 15104, 1, 1024, 256, 256, 1024, 1024] + - [48, 19758.0] + - - [256, 16128, 1, 1024, 256, 256, 1024, 1024] + - [12, 19207.0] + - - [256, 18176, 1, 1024, 256, 256, 1024, 1024] + - [46, 19512.0] + - - [256, 18944, 1, 1024, 256, 256, 1024, 1024] + - [12, 20018.0] + - - [256, 19200, 1, 1024, 256, 256, 1024, 1024] + - [12, 20256.0] + - - [256, 20480, 1, 1024, 256, 256, 1024, 1024] + - [32, 20846.0] + - - [256, 20992, 1, 1024, 256, 256, 1024, 1024] + - [46, 19873.0] + - - [256, 21248, 1, 1024, 256, 256, 1024, 1024] + - [12, 20077.0] + - - [256, 21504, 1, 1024, 256, 256, 1024, 1024] + - [46, 20218.0] + - - [256, 22016, 1, 1024, 256, 256, 1024, 1024] + - [32, 20236.0] + - - [256, 22344, 1, 1024, 256, 256, 1024, 1024] + - [14, 20009.0] + - - [256, 23296, 1, 1024, 256, 256, 1024, 1024] + - [30, 19682.0] + - - [256, 23552, 1, 1024, 256, 256, 1024, 1024] + - [12, 20159.0] + - - [256, 31488, 1, 1024, 256, 256, 1024, 1024] + - [12, 20330.0] + - - [256, 32768, 1, 512, 256, 256, 512, 512] + - [32, 20417.0] + - - [256, 33536, 1, 1024, 256, 256, 1024, 1024] + - [32, 20297.0] + - - [256, 44505, 1, 1024, 256, 256, 1024, 1024] + - [46, 20359.0] + - - [512, 32768, 1, 13, 512, 512, 13, 13] + - [0, 6242.0] + - - [512, 32768, 1, 1024, 512, 512, 1024, 1024] + - [14, 21101.0] + - - [684, 8976, 1, 256, 684, 684, 256, 256] + - [32, 16678.0] + - - [1024, 1600, 1, 560, 1024, 1024, 560, 560] + - [20, 16361.0] + - - [1024, 1600, 1, 1024, 1024, 1024, 1024, 1024] + - [14, 16529.0] + - - [1024, 32768, 1, 480, 1024, 1024, 480, 480] + - [7, 21522.0] + - - [1024, 32768, 1, 1024, 1024, 1024, 1024, 1024] + - [14, 20921.0] + - - [1280, 8976, 1, 256, 1280, 1280, 256, 256] + - [38, 19510.0] + - - [1792, 8976, 1, 256, 1792, 1792, 256, 256] + - [5, 19937.0] + - - [2048, 684, 1, 512, 2048, 2048, 512, 512] + - [30, 15208.0] + - - [2048, 684, 1, 768, 2048, 2048, 768, 768] + - [46, 15971.0] + - - [2048, 960, 1, 74, 2048, 2048, 74, 74] + - [4, 9185.0] + - - [2048, 960, 1, 2048, 2048, 2048, 2048, 2048] + - [12, 16368.0] + - - [2048, 1536, 1, 512, 2048, 2048, 512, 512] + - [14, 18134.0] + - - [2048, 1536, 1, 768, 2048, 2048, 768, 768] + - [7, 18845.0] + - - [2048, 8976, 1, 256, 2048, 2048, 256, 256] + - [5, 19988.0] + - - [2304, 8976, 1, 256, 2304, 2304, 256, 256] + - [30, 20150.0] + - - [2560, 8976, 1, 256, 2560, 2560, 256, 256] + - [30, 20227.0] + - - [2816, 8976, 1, 256, 2816, 2816, 256, 256] + - [22, 20260.0] + - - [3072, 8976, 1, 256, 3072, 3072, 256, 256] + - [12, 20281.0] + - - [3328, 8976, 1, 256, 3328, 3328, 256, 256] + - [14, 20330.0] + - - [3840, 8976, 1, 256, 3840, 3840, 256, 256] + - [32, 20582.0] + - - [4096, 8976, 1, 256, 4096, 4096, 256, 256] + - [5, 20467.0] + - - [4352, 8976, 1, 256, 4352, 4352, 256, 256] + - [14, 20606.0] + - - [4608, 8976, 1, 256, 4608, 4608, 256, 256] + - [38, 20511.0] + - - [4864, 8976, 1, 256, 4864, 4864, 256, 256] + - [14, 20607.0] + - - [5120, 8976, 1, 256, 5120, 5120, 256, 256] + - [38, 20462.0] + - - [5376, 8976, 1, 256, 5376, 5376, 256, 256] + - [24, 20578.0] + - - [5632, 8976, 1, 256, 5632, 5632, 256, 256] + - [38, 20541.0] + - - [5888, 8976, 1, 256, 5888, 5888, 256, 256] + - [24, 20582.0] + - - [6144, 8976, 1, 256, 6144, 6144, 256, 256] + - [22, 20562.0] + - - [6400, 8976, 1, 256, 6400, 6400, 256, 256] + - [40, 20612.0] + - - [7168, 8976, 1, 256, 7168, 7168, 256, 256] + - [14, 20570.0] + - - [7936, 8976, 1, 256, 7936, 7936, 256, 256] + - [24, 20814.0] + - - [8192, 8976, 1, 256, 8192, 8192, 256, 256] + - [14, 20642.0] + - - [8448, 8976, 1, 256, 8448, 8448, 256, 256] + - [24, 20711.0] + - - [8960, 8976, 1, 256, 8960, 8960, 256, 256] + - [40, 20735.0] + - - [9472, 8976, 1, 256, 9472, 9472, 256, 256] + - [14, 20778.0] + - - [9728, 8976, 1, 256, 9728, 9728, 256, 256] + - [24, 20928.0] + - - [9984, 8976, 1, 256, 9984, 9984, 256, 256] + - [14, 20929.0] + - - [10240, 8976, 1, 256, 10240, 10240, 256, 256] + - [12, 20861.0] + - - [10496, 8976, 1, 256, 10496, 10496, 256, 256] + - [38, 20899.0] + - - [11264, 8976, 1, 256, 11264, 11264, 256, 256] + - [22, 20900.0] + - - [11776, 8976, 1, 256, 11776, 11776, 256, 256] + - [12, 20882.0] + - - [12544, 8976, 1, 256, 12544, 12544, 256, 256] + - [14, 20953.0] + - - [13312, 8976, 1, 256, 13312, 13312, 256, 256] + - [22, 20937.0] + - - [13568, 8976, 1, 256, 13568, 13568, 256, 256] + - [24, 20950.0] + - - [13824, 8976, 1, 256, 13824, 13824, 256, 256] + - [38, 20932.0] + - - [15104, 8976, 1, 256, 15104, 15104, 256, 256] + - [24, 21001.0] + - - [15360, 8976, 1, 256, 15360, 15360, 256, 256] + - [22, 20951.0] + - - [15872, 8976, 1, 256, 15872, 15872, 256, 256] + - [22, 20973.0] + - - [16128, 8976, 1, 256, 16128, 16128, 256, 256] + - [14, 20990.0] + - - [17152, 8976, 1, 256, 17152, 17152, 256, 256] + - [24, 20998.0] + - - [18176, 8976, 1, 256, 18176, 18176, 256, 256] + - [24, 21016.0] + - - [18688, 8976, 1, 256, 18688, 18688, 256, 256] + - [7, 21009.0] + - - [18944, 8976, 1, 256, 18944, 18944, 256, 256] + - [38, 20962.0] + - - [19712, 8976, 1, 256, 19712, 19712, 256, 256] + - [7, 21028.0] + - - [19968, 8976, 1, 256, 19968, 19968, 256, 256] + - [22, 20971.0] + - - [20480, 8976, 1, 256, 20480, 20480, 256, 256] + - [30, 20971.0] + - - [20992, 8976, 1, 256, 20992, 20992, 256, 256] + - [30, 20974.0] + - - [21248, 8976, 1, 256, 21248, 21248, 256, 256] + - [14, 21024.0] + - - [23552, 8976, 1, 256, 23552, 23552, 256, 256] + - [5, 20980.0] + - - [28672, 8976, 1, 256, 28672, 28672, 256, 256] + - [5, 20984.0] + - - [31488, 8976, 1, 256, 31488, 31488, 256, 256] + - [7, 21064.0] + - - [33536, 8976, 1, 256, 33536, 33536, 256, 256] + - [14, 21113.0] + - - [44505, 8976, 1, 256, 44505, 44505, 256, 256] + - [7, 21039.0] + - - [1024, 3840, 1, 1024, 1024, 1024, 1024, 1024] + - [14, 20396.0] + - - [1024, 3840, 1, 4096, 1024, 1024, 4096, 4096] + - [14, 20486.0] + - - [1024, 3968, 1, 1024, 1024, 1024, 1024, 1024] + - [12, 18842.0] + - - [1024, 3968, 1, 4096, 1024, 1024, 4096, 4096] + - [14, 19195.0] + - - [1024, 6528, 1, 1024, 1024, 1024, 1024, 1024] + - [12, 19522.0] + - - [1024, 6528, 1, 4096, 1024, 1024, 4096, 4096] + - [32, 17944.0] + - - [1024, 7104, 1, 1024, 1024, 1024, 1024, 1024] + - [14, 18790.0] + - - [1024, 7104, 1, 4096, 1024, 1024, 4096, 4096] + - [32, 16610.0] + - - [1024, 7200, 1, 1024, 1024, 1024, 1024, 1024] + - [12, 19542.0] + - - [1024, 7200, 1, 4096, 1024, 1024, 4096, 4096] + - [33, 16119.0] + - - [1024, 8064, 1, 1024, 1024, 1024, 1024, 1024] + - [14, 19602.0] + - - [1024, 8064, 1, 4096, 1024, 1024, 4096, 4096] + - [51, 15677.0] + - - [1024, 8160, 1, 1024, 1024, 1024, 1024, 1024] + - [14, 19683.0] + - - [1024, 8160, 1, 4096, 1024, 1024, 4096, 4096] + - [33, 15517.0] + - - [1024, 9216, 1, 1024, 1024, 1024, 1024, 1024] + - [14, 19964.0] + - - [1024, 9216, 1, 4096, 1024, 1024, 4096, 4096] + - [49, 13778.0] + - - [1024, 9520, 1, 1024, 1024, 1024, 1024, 1024] + - [32, 19699.0] + - - [1024, 9520, 1, 4096, 1024, 1024, 4096, 4096] + - [33, 16529.0] + - - [1024, 10064, 1, 1024, 1024, 1024, 1024, 1024] + - [14, 20686.0] + - - [1024, 10064, 1, 4096, 1024, 1024, 4096, 4096] + - [33, 17509.0] + - - [1024, 10080, 1, 1024, 1024, 1024, 1024, 1024] + - [14, 20725.0] + - - [1024, 10080, 1, 4096, 1024, 1024, 4096, 4096] + - [33, 17517.0] + - - [1024, 10200, 1, 1024, 1024, 1024, 1024, 1024] + - [14, 20935.0] + - - [1024, 10200, 1, 4096, 1024, 1024, 4096, 4096] + - [32, 19699.0] + - - [4096, 3840, 1, 1024, 4096, 4096, 1024, 1024] + - [14, 21513.0] + - - [4096, 3968, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 20677.0] + - - [4096, 6528, 1, 1024, 4096, 4096, 1024, 1024] + - [14, 21023.0] + - - [4096, 7104, 1, 1024, 4096, 4096, 1024, 1024] + - [14, 21198.0] + - - [4096, 7200, 1, 1024, 4096, 4096, 1024, 1024] + - [14, 20822.0] + - - [4096, 8064, 1, 1024, 4096, 4096, 1024, 1024] + - [14, 21198.0] + - - [4096, 8160, 1, 1024, 4096, 4096, 1024, 1024] + - [14, 21379.0] + - - [4096, 9216, 1, 1024, 4096, 4096, 1024, 1024] + - [14, 21501.0] + - - [4096, 9520, 1, 1024, 4096, 4096, 1024, 1024] + - [14, 20931.0] + - - [4096, 10064, 1, 1024, 4096, 4096, 1024, 1024] + - [14, 21059.0] + - - [4096, 10080, 1, 1024, 4096, 4096, 1024, 1024] + - [14, 21082.0] + - - [4096, 10200, 1, 1024, 4096, 4096, 1024, 1024] + - [14, 21263.0] + - - [42720, 3968, 1, 1024, 42720, 42720, 1024, 1024] + - [32, 21151.0] + - - [42720, 6528, 1, 1024, 42720, 42720, 1024, 1024] + - [48, 21296.0] + - - [42720, 7104, 1, 1024, 42720, 42720, 1024, 1024] + - [14, 21351.0] + - - [42720, 7200, 1, 1024, 42720, 42720, 1024, 1024] + - [48, 21229.0] + - - [42720, 9520, 1, 1024, 42720, 42720, 1024, 1024] + - [48, 21103.0] + - - [42720, 10080, 1, 1024, 42720, 42720, 1024, 1024] + - [48, 21228.0] + - - [1024, 3240, 1, 1024, 1024, 1024, 1024, 1024] + - [14, 16455.0] + - - [1024, 3240, 1, 4096, 1024, 1024, 4096, 4096] + - [14, 17520.0] + - - [1024, 3960, 1, 1024, 1024, 1024, 1024, 1024] + - [12, 18600.0] + - - [1024, 3960, 1, 4096, 1024, 1024, 4096, 4096] + - [14, 19022.0] + - - [4096, 3240, 1, 1024, 4096, 4096, 1024, 1024] + - [14, 20693.0] + - - [4096, 3960, 1, 1024, 4096, 4096, 1024, 1024] + - [14, 20725.0] + - - [42720, 3960, 1, 1024, 42720, 42720, 1024, 1024] + - [32, 21314.0] + - - [7680, 8192, 1, 8192, 7680, 7680, 8192, 8192] + - [14, 18198.0] + - - [3840, 4096, 1, 4096, 3840, 3840, 4096, 4096] + - [51, 17046.0] + - - [1920, 2048, 1, 2048, 1920, 1920, 2048, 2048] + - [14, 19176.0] + - - [8192, 7680, 1, 8192, 8192, 8192, 8192, 8192] + - [14, 18254.0] + - - [4096, 3840, 1, 4096, 4096, 4096, 4096, 4096] + - [16, 16678.0] + - - [2048, 1920, 1, 2048, 2048, 2048, 2048, 2048] + - [13, 17834.0] + - - [512, 512, 16, 64, 512, 512, 64, 64] + - [21, 12567.0] + - - [512, 512, 128, 64, 512, 512, 64, 64] + - [46, 18742.0] + - - [4096, 512, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 18308.0] + - - [30522, 616, 1, 1024, 30522, 30522, 1024, 1024] + - [12, 19858.0] + - - [128, 128, 128, 64, 128, 128, 64, 64] + - [45, 9656.0] + - - [128, 128, 160, 64, 128, 128, 64, 64] + - [21, 10966.0] + - - [1024, 1280, 1, 1024, 1024, 1024, 1024, 1024] + - [14, 19503.0] + - - [1024, 1280, 1, 4096, 1024, 1024, 4096, 4096] + - [32, 19376.0] + - - [4096, 1280, 1, 1024, 4096, 4096, 1024, 1024] + - [14, 20147.0] + - - [30522, 160, 1, 1024, 30522, 30522, 1024, 1024] + - [46, 12966.0] + - - [30522, 200, 1, 1024, 30522, 30522, 1024, 1024] + - [46, 16178.0] + - - [128, 128, 624, 64, 128, 128, 64, 64] + - [4, 13107.0] + - - [1024, 4992, 1, 1024, 1024, 1024, 1024, 1024] + - [14, 20059.0] + - - [1024, 4992, 1, 4096, 1024, 1024, 4096, 4096] + - [14, 20341.0] + - - [4096, 4992, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 21029.0] + - - [30522, 780, 1, 1024, 30522, 30522, 1024, 1024] + - [12, 18320.0] + - - [30522, 308, 1, 1024, 30522, 30522, 1024, 1024] + - [30, 16577.0] + - - [128, 128, 640, 64, 128, 128, 64, 64] + - [39, 14128.0] + - - [1024, 5120, 1, 1024, 1024, 1024, 1024, 1024] + - [14, 20262.0] + - - [1024, 5120, 1, 4096, 1024, 1024, 4096, 4096] + - [32, 18601.0] + - - [4096, 5120, 1, 1024, 4096, 4096, 1024, 1024] + - [14, 21683.0] + - - [30522, 800, 1, 1024, 30522, 30522, 1024, 1024] + - [12, 18651.0] + - - [128, 128, 656, 64, 128, 128, 64, 64] + - [31, 15025.0] + - - [1024, 5248, 1, 1024, 1024, 1024, 1024, 1024] + - [12, 19826.0] + - - [1024, 5248, 1, 4096, 1024, 1024, 4096, 4096] + - [48, 18204.0] + - - [4096, 5248, 1, 1024, 4096, 4096, 1024, 1024] + - [14, 21037.0] + - - [30522, 820, 1, 1024, 30522, 30522, 1024, 1024] + - [12, 19168.0] + - - [512, 512, 80, 64, 512, 512, 64, 64] + - [46, 18123.0] + - - [1024, 2560, 1, 1024, 1024, 1024, 1024, 1024] + - [14, 20671.0] + - - [1024, 2560, 1, 4096, 1024, 1024, 4096, 4096] + - [14, 20139.0] + - - [4096, 2560, 1, 1024, 4096, 4096, 1024, 1024] + - [14, 21287.0] + - - [30522, 385, 1, 1024, 30522, 30522, 1024, 1024] + - [14, 15960.0] + - - [512, 512, 96, 64, 512, 512, 64, 64] + - [22, 18641.0] + - - [1024, 3072, 1, 1024, 1024, 1024, 1024, 1024] + - [14, 20113.0] + - - [1024, 3072, 1, 4096, 1024, 1024, 4096, 4096] + - [14, 20179.0] + - - [4096, 3072, 1, 1024, 4096, 4096, 1024, 1024] + - [14, 20675.0] + - - [30522, 462, 1, 1024, 30522, 30522, 1024, 1024] + - [14, 18790.0] + - - [4096, 1024, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 19512.0] + - - [128, 128, 144, 64, 128, 128, 64, 64] + - [11, 8977.0] + - - [1024, 1152, 1, 1024, 1024, 1024, 1024, 1024] + - [48, 16703.0] + - - [1024, 1152, 1, 4096, 1024, 1024, 4096, 4096] + - [48, 17745.0] + - - [4096, 1152, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 18945.0] + - - [30522, 180, 1, 1024, 30522, 30522, 1024, 1024] + - [46, 14575.0] + - - [1024, 32768, 1, 479, 1024, 1024, 479, 479] + - [7, 21165.0] + - - [1024, 8192, 1, 1024, 1024, 1024, 1024, 1024] + - [14, 20867.0] + - - [1024, 8192, 1, 4096, 1024, 1024, 4096, 4096] + - [33, 15668.0] + - - [1024, 9600, 1, 1024, 1024, 1024, 1024, 1024] + - [12, 20442.0] + - - [1024, 9600, 1, 4096, 1024, 1024, 4096, 4096] + - [51, 16643.0] + - - [4096, 8192, 1, 1024, 4096, 4096, 1024, 1024] + - [14, 21702.0] + - - [4096, 9600, 1, 1024, 4096, 4096, 1024, 1024] + - [14, 21313.0] + - - [33712, 8192, 1, 1024, 33712, 33712, 1024, 1024] + - [14, 20679.0] + - - [33712, 9600, 1, 1024, 33712, 33712, 1024, 1024] + - [14, 20779.0] + - - [1024, 1024, 128, 96, 1024, 1024, 96, 96] + - [5, 19928.0] + - - [30592, 4096, 1, 1024, 30592, 30592, 1024, 1024] + - [14, 21645.0] + - - [1536, 8192, 1, 1536, 1536, 1536, 1536, 1536] + - [14, 21301.0] + - - [3072, 8192, 1, 1024, 3072, 3072, 1024, 1024] + - [14, 21651.0] + - - [3072, 2048, 1, 1024, 3072, 3072, 1024, 1024] + - [14, 20145.0] + - - [50304, 8192, 1, 1024, 50304, 50304, 1024, 1024] + - [48, 20983.0] + - - [2048, 1024, 1, 8192, 2048, 2048, 8192, 8192] + - [12, 16725.0] + - - [50304, 2048, 1, 1024, 50304, 50304, 1024, 1024] + - [32, 21829.0] + - - [1536, 8192, 1, 6144, 1536, 1536, 6144, 6144] + - [32, 18604.0] + - - [50304, 4096, 1, 1536, 50304, 50304, 1536, 1536] + - [40, 21397.0] + - - [8192, 1024, 1, 2048, 8192, 8192, 2048, 2048] + - [14, 20495.0] + - - [2560, 2048, 1, 640, 2560, 2560, 640, 640] + - [24, 20575.0] + - - [1024, 1024, 128, 64, 1024, 1024, 64, 64] + - [5, 17692.0] + - - [2048, 1024, 1, 2048, 2048, 2048, 2048, 2048] + - [12, 17133.0] + - - [1536, 4096, 1, 1536, 1536, 1536, 1536, 1536] + - [14, 20523.0] + - - [1024, 1024, 64, 64, 1024, 1024, 64, 64] + - [5, 17559.0] + - - [30592, 8192, 1, 1024, 30592, 30592, 1024, 1024] + - [14, 20764.0] + - - [50304, 16384, 1, 1024, 50304, 50304, 1024, 1024] + - [48, 21712.0] + - - [4608, 4096, 1, 1536, 4608, 4608, 1536, 1536] + - [14, 21992.0] + - - [2560, 2048, 1, 2560, 2560, 2560, 2560, 2560] + - [14, 21138.0] + - - [7680, 2048, 1, 2560, 7680, 7680, 2560, 2560] + - [14, 20034.0] + - - [50304, 4096, 1, 1024, 50304, 50304, 1024, 1024] + - [32, 21241.0] + - - [1920, 2048, 1, 2560, 1920, 1920, 2560, 2560] + - [14, 19977.0] + - - [1024, 1024, 64, 96, 1024, 1024, 96, 96] + - [12, 19582.0] + - - [6144, 4096, 1, 1536, 6144, 6144, 1536, 1536] + - [14, 21625.0] + - - [1536, 4096, 1, 6144, 1536, 1536, 6144, 6144] + - [32, 18975.0] + - - [512, 512, 256, 64, 512, 512, 64, 64] + - [5, 15936.0] + - - [50304, 8192, 1, 1536, 50304, 50304, 1536, 1536] + - [48, 21242.0] + - - [6144, 8192, 1, 1536, 6144, 6144, 1536, 1536] + - [14, 21681.0] + - - [4096, 16384, 1, 1024, 4096, 4096, 1024, 1024] + - [14, 21358.0] + - - [30592, 1024, 1, 2048, 30592, 30592, 2048, 2048] + - [32, 20540.0] + - - [1024, 16384, 1, 4096, 1024, 1024, 4096, 4096] + - [32, 18748.0] + - - [512, 512, 40, 64, 512, 512, 64, 64] + - [6, 14841.0] + - - [6144, 1024, 1, 2048, 6144, 6144, 2048, 2048] + - [14, 20108.0] + - - [4608, 8192, 1, 1536, 4608, 4608, 1536, 1536] + - [14, 21642.0] + - - [30592, 2048, 1, 1024, 30592, 30592, 1024, 1024] + - [14, 21556.0] + - - [3072, 16384, 1, 1024, 3072, 3072, 1024, 1024] + - [14, 21337.0] + - - [1024, 1024, 256, 64, 1024, 1024, 64, 64] + - [5, 17756.0] + - - [1024, 16384, 1, 1024, 1024, 1024, 1024, 1024] + - [14, 21355.0] + - - [1024, 1024, 32, 64, 1024, 1024, 64, 64] + - [22, 18697.0] + - - [3072, 4096, 1, 1024, 3072, 3072, 1024, 1024] + - [14, 21086.0] + - - [30528, 8192, 1, 1024, 30528, 30528, 1024, 1024] + - [14, 20828.0] + - - [128, 128, 1024, 64, 128, 128, 64, 64] + - [6, 15851.0] + - - [1024, 3456, 1, 1024, 1024, 1024, 1024, 1024] + - [12, 18961.0] + - - [1024, 3456, 1, 480, 1024, 1024, 480, 480] + - [12, 19575.0] + - - [1024, 4096, 1, 480, 1024, 1024, 480, 480] + - [12, 19317.0] + - - [1024, 6912, 1, 1024, 1024, 1024, 1024, 1024] + - [14, 20261.0] + - - [1024, 6912, 1, 480, 1024, 1024, 480, 480] + - [7, 20741.0] + - - [128, 55296, 1, 256, 128, 128, 256, 256] + - [30, 19734.0] + - - [256, 55296, 1, 512, 256, 256, 512, 512] + - [12, 20476.0] + - - [256, 6912, 1, 512, 256, 256, 512, 512] + - [12, 17443.0] + - - [512, 3456, 1, 1024, 512, 512, 1024, 1024] + - [30, 17250.0] + - - [512, 3456, 1, 13, 512, 512, 13, 13] + - [23, 1973.0] + - - [512, 4096, 1, 1024, 512, 512, 1024, 1024] + - [12, 17704.0] + - - [512, 4096, 1, 13, 512, 512, 13, 13] + - [26, 3141.0] + - - [512, 55296, 1, 13, 512, 512, 13, 13] + - [47, 7719.0] + - - [512, 6912, 1, 1024, 512, 512, 1024, 1024] + - [30, 19129.0] + - - [512, 6912, 1, 13, 512, 512, 13, 13] + - [21, 3753.0] + - - [30528, 640, 1, 1024, 30528, 30528, 1024, 1024] + - [12, 20583.0] + - - [30528, 1280, 1, 1024, 30528, 30528, 1024, 1024] + - [48, 21396.0] + - - [30528, 1600, 1, 1024, 30528, 30528, 1024, 1024] + - [12, 20200.0] + - - [1024, 10240, 1, 1024, 1024, 1024, 1024, 1024] + - [14, 21169.0] + - - [4096, 10240, 1, 1024, 4096, 4096, 1024, 1024] + - [14, 21546.0] + - - [1024, 10240, 1, 4096, 1024, 1024, 4096, 4096] + - [32, 19138.0] + - - [128, 128, 1280, 64, 128, 128, 64, 64] + - [6, 13615.0] + - - [1024, 10496, 1, 4096, 1024, 1024, 4096, 4096] + - [32, 19480.0] + - - [30528, 1640, 1, 1024, 30528, 30528, 1024, 1024] + - [12, 20752.0] + - - [4096, 10496, 1, 1024, 4096, 4096, 1024, 1024] + - [14, 21499.0] + - - [1024, 10496, 1, 1024, 1024, 1024, 1024, 1024] + - [14, 20773.0] + - - [128, 128, 1312, 64, 128, 128, 64, 64] + - [22, 17574.0] + - - [30528, 160, 1, 1024, 30528, 30528, 1024, 1024] + - [14, 12957.0] + - - [30528, 240, 1, 1024, 30528, 30528, 1024, 1024] + - [12, 18884.0] + - - [1024, 6144, 1, 1024, 1024, 1024, 1024, 1024] + - [14, 19892.0] + - - [4096, 6144, 1, 1024, 4096, 4096, 1024, 1024] + - [14, 21547.0] + - - [1024, 6144, 1, 4096, 1024, 1024, 4096, 4096] + - [14, 18865.0] + - - [512, 512, 192, 64, 512, 512, 64, 64] + - [12, 17583.0] + - - [1024, 10224, 1, 1024, 1024, 1024, 1024, 1024] + - [14, 21003.0] + - - [1024, 10192, 1, 1024, 1024, 1024, 1024, 1024] + - [14, 20990.0] + - - [1024, 10208, 1, 1024, 1024, 1024, 1024, 1024] + - [14, 21008.0] + - - [1024, 10224, 1, 4096, 1024, 1024, 4096, 4096] + - [32, 19170.0] + - - [4096, 10224, 1, 1024, 4096, 4096, 1024, 1024] + - [14, 21315.0] + - - [3072, 10224, 1, 1024, 3072, 3072, 1024, 1024] + - [14, 21549.0] + - - [3072, 10240, 1, 1024, 3072, 3072, 1024, 1024] + - [14, 21674.0] + - - [1024, 10192, 1, 4096, 1024, 1024, 4096, 4096] + - [32, 19410.0] + - - [4096, 10192, 1, 1024, 4096, 4096, 1024, 1024] + - [14, 21426.0] + - - [3072, 10192, 1, 1024, 3072, 3072, 1024, 1024] + - [14, 21538.0] + - - [3072, 10200, 1, 1024, 3072, 3072, 1024, 1024] + - [14, 21533.0] + - - [1024, 10184, 1, 1024, 1024, 1024, 1024, 1024] + - [14, 21014.0] + - - [3072, 10208, 1, 1024, 3072, 3072, 1024, 1024] + - [14, 21502.0] + - - [1024, 10208, 1, 4096, 1024, 1024, 4096, 4096] + - [32, 18734.0] + - - [4096, 10208, 1, 1024, 4096, 4096, 1024, 1024] + - [14, 21409.0] + - - [2048, 10224, 1, 1024, 2048, 2048, 1024, 1024] + - [14, 21405.0] + - - [2048, 10240, 1, 1024, 2048, 2048, 1024, 1024] + - [14, 21610.0] + - - [1024, 10120, 1, 1024, 1024, 1024, 1024, 1024] + - [14, 20897.0] + - - [2048, 10192, 1, 1024, 2048, 2048, 1024, 1024] + - [14, 21508.0] + - - [1024, 10152, 1, 1024, 1024, 1024, 1024, 1024] + - [14, 20940.0] + - - [3072, 10080, 1, 1024, 3072, 3072, 1024, 1024] + - [14, 21379.0] + - - [1024, 2048, 1, 49, 1024, 1024, 49, 49] + - [26, 9938.0] + - - [4608, 512, 1, 49, 4608, 4608, 49, 49] + - [17, 11137.0] + - - [256, 256, 25, 12544, 256, 256, 12544, 12544] + - [49, 15440.0] + - - [256, 256, 49, 3200, 256, 256, 3200, 3200] + - [24, 20960.0] + - - [256, 256, 25, 6272, 256, 256, 6272, 6272] + - [38, 19949.0] + - - [256, 256, 49, 6400, 256, 256, 6400, 6400] + - [32, 18601.0] + - - [512, 512, 49, 1152, 512, 512, 1152, 1152] + - [40, 21572.0] + - - [512, 512, 25, 2048, 512, 512, 2048, 2048] + - [14, 18831.0] + - - [512, 512, 49, 2304, 512, 512, 2304, 2304] + - [14, 20831.0] + - - [512, 512, 25, 4096, 512, 512, 4096, 4096] + - [51, 17908.0] + - - [128, 128, 2048, 64, 128, 128, 64, 64] + - [29, 12101.0] + - - [30528, 2560, 1, 1024, 30528, 30528, 1024, 1024] + - [14, 21455.0] + - - [128, 128, 1536, 64, 128, 128, 64, 64] + - [5, 17000.0] + - - [1024, 12288, 1, 1024, 1024, 1024, 1024, 1024] + - [14, 20931.0] + - - [1024, 12288, 1, 4096, 1024, 1024, 4096, 4096] + - [32, 19400.0] + - - [30528, 1920, 1, 1024, 30528, 30528, 1024, 1024] + - [12, 21118.0] + - - [4096, 12288, 1, 1024, 4096, 4096, 1024, 1024] + - [14, 21453.0] + - - [128, 128, 81, 12544, 128, 128, 12544, 12544] + - [15, 12598.0] + - - [128, 128, 121, 9216, 128, 128, 9216, 9216] + - [49, 12608.0] + - - [128, 128, 169, 6400, 128, 128, 6400, 6400] + - [49, 14826.0] + - - [256, 256, 36, 4096, 256, 256, 4096, 4096] + - [16, 15752.0] + - - [256, 256, 49, 2304, 256, 256, 2304, 2304] + - [14, 18906.0] + - - [256, 256, 64, 2304, 256, 256, 2304, 2304] + - [14, 17764.0] + - - [256, 256, 81, 4096, 256, 256, 4096, 4096] + - [33, 15831.0] + - - [256, 256, 121, 2304, 256, 256, 2304, 2304] + - [48, 18523.0] + - - [256, 256, 169, 2304, 256, 256, 2304, 2304] + - [32, 19708.0] + - - [512, 512, 81, 1024, 512, 512, 1024, 1024] + - [14, 20241.0] + - - [512, 512, 121, 1024, 512, 512, 1024, 1024] + - [14, 20526.0] + - - [512, 512, 169, 1024, 512, 512, 1024, 1024] + - [14, 20627.0] + - - [512, 512, 36, 1024, 512, 512, 1024, 1024] + - [14, 20365.0] + - - [512, 512, 49, 1024, 512, 512, 1024, 1024] + - [14, 20196.0] + - - [512, 512, 64, 1024, 512, 512, 1024, 1024] + - [14, 20086.0] + - - [128, 128, 192, 64, 128, 128, 64, 64] + - [4, 12157.0] + - - [768, 2048, 1, 768, 768, 768, 768, 768] + - [46, 18636.0] + - - [3072, 2048, 1, 768, 3072, 3072, 768, 768] + - [30, 20203.0] + - - [768, 2048, 1, 3072, 768, 768, 3072, 3072] + - [47, 16902.0] + - - [384, 384, 144, 64, 384, 384, 64, 64] + - [46, 17736.0] + - - [768, 4608, 1, 768, 768, 768, 768, 768] + - [30, 19347.0] + - - [3072, 4608, 1, 768, 3072, 3072, 768, 768] + - [14, 21072.0] + - - [768, 4608, 1, 3072, 768, 768, 3072, 3072] + - [14, 19489.0] + - - [512, 512, 48, 64, 512, 512, 64, 64] + - [6, 16004.0] + - - [128, 128, 256, 64, 128, 128, 64, 64] + - [4, 11056.0] + - - [384, 384, 192, 64, 384, 384, 64, 64] + - [46, 18033.0] + - - [1024, 4608, 1, 1024, 1024, 1024, 1024, 1024] + - [12, 19653.0] + - - [4096, 4608, 1, 1024, 4096, 4096, 1024, 1024] + - [14, 21563.0] + - - [1024, 4608, 1, 4096, 1024, 1024, 4096, 4096] + - [14, 19114.0] + - - [2880, 3072, 1, 3072, 2880, 2880, 3072, 3072] + - [14, 21012.0] + - - [3072, 3072, 1, 3072, 3072, 3072, 3072, 3072] + - [14, 21097.0] + - - [3072, 512, 1, 3072, 3072, 3072, 3072, 3072] + - [13, 16870.0] + - - [4096, 512, 1, 4096, 4096, 4096, 4096, 4096] + - [12, 16903.0] + - - [512, 3072, 1, 3072, 512, 512, 3072, 3072] + - [13, 16655.0] + - - [512, 4096, 1, 4096, 512, 512, 4096, 4096] + - [46, 17413.0] + - - [512, 8192, 1, 8192, 512, 512, 8192, 8192] + - [48, 16540.0] + - - [8192, 512, 1, 8192, 8192, 8192, 8192, 8192] + - [14, 17805.0] + - - [256, 256, 36, 432, 256, 256, 432, 432] + - [14, 17621.0] + - - [256, 256, 36, 456, 256, 256, 456, 456] + - [44, 17788.0] + - - [256, 256, 36, 504, 256, 256, 504, 504] + - [5, 17748.0] + - - [256, 256, 49, 1120, 256, 256, 1120, 1120] + - [7, 20569.0] + - - [256, 256, 36, 442, 256, 256, 442, 442] + - [14, 17056.0] + - - [256, 256, 49, 950, 256, 256, 950, 950] + - [24, 19511.0] + - - [256, 256, 64, 616, 256, 256, 616, 616] + - [22, 18758.0] + - - [256, 256, 64, 660, 256, 256, 660, 660] + - [22, 18842.0] + - - [256, 256, 36, 408, 256, 256, 408, 408] + - [48, 17001.0] + - - [256, 256, 49, 1008, 256, 256, 1008, 1008] + - [48, 17452.0] + - - [256, 256, 36, 462, 256, 256, 462, 462] + - [32, 17069.0] + - - [256, 256, 36, 468, 256, 256, 468, 468] + - [24, 16940.0] + - - [256, 256, 36, 494, 256, 256, 494, 494] + - [14, 17627.0] + - - [512, 512, 64, 48, 512, 512, 48, 48] + - [39, 15451.0] + - - [256, 256, 64, 140, 256, 256, 140, 140] + - [39, 14477.0] + - - [512, 512, 64, 56, 512, 512, 56, 56] + - [20, 15844.0] + - - [512, 512, 49, 90, 512, 512, 90, 90] + - [12, 16896.0] + - - [512, 512, 49, 60, 512, 512, 60, 60] + - [31, 15304.0] + - - [256, 256, 49, 864, 256, 256, 864, 864] + - [24, 19514.0] + - - [256, 256, 64, 224, 256, 256, 224, 224] + - [5, 18568.0] + - - [256, 256, 64, 176, 256, 256, 176, 176] + - [22, 16986.0] + - - [256, 256, 64, 154, 256, 256, 154, 154] + - [22, 14815.0] + - - [512, 512, 49, 80, 512, 512, 80, 80] + - [32, 17364.0] + - - [256, 256, 49, 1200, 256, 256, 1200, 1200] + - [7, 20687.0] + - - [256, 256, 64, 704, 256, 256, 704, 704] + - [22, 19483.0] + - - [256, 256, 64, 768, 256, 256, 768, 768] + - [5, 18453.0] + - - [256, 256, 49, 1160, 256, 256, 1160, 1160] + - [7, 15351.0] + - - [256, 256, 49, 320, 256, 256, 320, 320] + - [12, 18079.0] + - - [512, 512, 49, 70, 512, 512, 70, 70] + - [47, 15086.0] + - - [256, 256, 49, 1240, 256, 256, 1240, 1240] + - [14, 16150.0] + - - [256, 256, 36, 384, 256, 256, 384, 384] + - [32, 17303.0] + - - [1024, 2048, 1, 888, 1024, 1024, 888, 888] + - [1, 18218.0] + - - [1024, 2048, 1, 713, 1024, 1024, 713, 713] + - [5, 17448.0] + - - [1024, 2048, 1, 660, 1024, 1024, 660, 660] + - [1, 17686.0] + - - [1024, 2048, 1, 726, 1024, 1024, 726, 726] + - [1, 17704.0] + - - [1024, 2048, 1, 672, 1024, 1024, 672, 672] + - [1, 18495.0] + - - [1024, 2048, 1, 850, 1024, 1024, 850, 850] + - [5, 17854.0] + - - [1024, 2048, 1, 805, 1024, 1024, 805, 805] + - [1, 17767.0] + - - [1024, 2048, 1, 864, 1024, 1024, 864, 864] + - [12, 17993.0] + - - [1024, 2048, 1, 768, 1024, 1024, 768, 768] + - [30, 17480.0] + - - [1024, 2048, 1, 950, 1024, 1024, 950, 950] + - [22, 18211.0] + - - [256, 128, 49, 1152, 256, 256, 1152, 1152] + - [22, 18575.0] + - - [256, 128, 121, 120, 256, 256, 120, 120] + - [47, 12993.0] + - - [256, 128, 169, 120, 256, 256, 120, 120] + - [39, 15383.0] + - - [256, 128, 36, 120, 256, 256, 120, 120] + - [1, 12776.0] + - - [256, 128, 49, 120, 256, 256, 120, 120] + - [6, 12966.0] + - - [256, 128, 64, 120, 256, 256, 120, 120] + - [47, 11481.0] + - - [256, 128, 36, 12000, 256, 256, 12000, 12000] + - [30, 18308.0] + - - [256, 128, 49, 1216, 256, 256, 1216, 1216] + - [5, 19067.0] + - - [256, 128, 121, 18, 256, 256, 18, 18] + - [11, 5047.0] + - - [256, 128, 169, 18, 256, 256, 18, 18] + - [4, 5313.0] + - - [256, 128, 36, 18, 256, 256, 18, 18] + - [0, 4537.0] + - - [256, 128, 49, 18, 256, 256, 18, 18] + - [0, 4722.0] + - - [256, 128, 64, 18, 256, 256, 18, 18] + - [11, 3798.0] + - - [256, 128, 36, 1800, 256, 256, 1800, 1800] + - [38, 17230.0] + - - [256, 128, 121, 19, 256, 256, 19, 19] + - [4, 5451.0] + - - [256, 128, 169, 19, 256, 256, 19, 19] + - [11, 6316.0] + - - [256, 128, 36, 19, 256, 256, 19, 19] + - [8, 3096.0] + - - [256, 128, 49, 19, 256, 256, 19, 19] + - [8, 3405.0] + - - [256, 128, 64, 19, 256, 256, 19, 19] + - [8, 3817.0] + - - [256, 128, 36, 1900, 256, 256, 1900, 1900] + - [38, 17262.0] + - - [256, 128, 49, 480, 256, 256, 480, 480] + - [5, 16581.0] + - - [256, 128, 81, 480, 256, 256, 480, 480] + - [5, 16477.0] + - - [256, 128, 64, 5880, 256, 256, 5880, 5880] + - [38, 17717.0] + - - [256, 128, 49, 72, 256, 256, 72, 72] + - [27, 10141.0] + - - [256, 128, 81, 72, 256, 256, 72, 72] + - [4, 12442.0] + - - [256, 128, 49, 76, 256, 256, 76, 76] + - [11, 10857.0] + - - [256, 128, 81, 76, 256, 256, 76, 76] + - [4, 11487.0] + - - [256, 128, 49, 7680, 256, 256, 7680, 7680] + - [15, 14356.0] + - - [256, 128, 64, 882, 256, 256, 882, 882] + - [22, 18102.0] + - - [256, 128, 64, 931, 256, 256, 931, 931] + - [22, 18082.0] + - - [256, 256, 49, 1152, 256, 256, 1152, 1152] + - [7, 20591.0] + - - [256, 256, 36, 12000, 256, 256, 12000, 12000] + - [7, 19816.0] + - - [256, 256, 49, 1216, 256, 256, 1216, 1216] + - [20, 20817.0] + - - [256, 256, 36, 1800, 256, 256, 1800, 1800] + - [7, 19150.0] + - - [256, 256, 36, 1900, 256, 256, 1900, 1900] + - [24, 15360.0] + - - [256, 256, 64, 5880, 256, 256, 5880, 5880] + - [22, 20675.0] + - - [256, 256, 49, 7680, 256, 256, 7680, 7680] + - [16, 17458.0] + - - [256, 256, 64, 882, 256, 256, 882, 882] + - [22, 20094.0] + - - [256, 256, 64, 931, 256, 256, 931, 931] + - [38, 20060.0] + - - [512, 256, 81, 1080, 512, 512, 1080, 1080] + - [40, 21050.0] + - - [512, 256, 25, 12000, 512, 512, 12000, 12000] + - [40, 21855.0] + - - [512, 256, 81, 162, 512, 512, 162, 162] + - [22, 18442.0] + - - [512, 256, 81, 171, 512, 512, 171, 171] + - [5, 19042.0] + - - [512, 256, 25, 1800, 512, 512, 1800, 1800] + - [7, 21264.0] + - - [512, 256, 25, 1900, 512, 512, 1900, 1900] + - [24, 21266.0] + - - [512, 256, 121, 1920, 512, 512, 1920, 1920] + - [40, 21713.0] + - - [512, 256, 169, 1920, 512, 512, 1920, 1920] + - [40, 22079.0] + - - [512, 256, 49, 1920, 512, 512, 1920, 1920] + - [24, 21303.0] + - - [512, 256, 121, 288, 512, 512, 288, 288] + - [24, 20623.0] + - - [512, 256, 169, 288, 512, 512, 288, 288] + - [5, 20707.0] + - - [512, 256, 49, 288, 512, 512, 288, 288] + - [7, 19774.0] + - - [512, 256, 25, 3000, 512, 512, 3000, 3000] + - [40, 21554.0] + - - [512, 256, 81, 3000, 512, 512, 3000, 3000] + - [7, 21445.0] + - - [512, 256, 121, 304, 512, 512, 304, 304] + - [22, 20616.0] + - - [512, 256, 169, 304, 512, 512, 304, 304] + - [40, 20783.0] + - - [512, 256, 49, 304, 512, 512, 304, 304] + - [5, 19798.0] + - - [512, 256, 25, 450, 512, 512, 450, 450] + - [38, 19433.0] + - - [512, 256, 81, 450, 512, 512, 450, 450] + - [38, 20223.0] + - - [512, 256, 25, 475, 512, 512, 475, 475] + - [40, 19559.0] + - - [512, 256, 81, 475, 512, 512, 475, 475] + - [5, 19996.0] + - - [512, 256, 121, 480, 512, 512, 480, 480] + - [24, 20988.0] + - - [512, 256, 169, 480, 512, 512, 480, 480] + - [24, 21351.0] + - - [512, 256, 49, 5880, 512, 512, 5880, 5880] + - [40, 21693.0] + - - [512, 256, 121, 72, 512, 512, 72, 72] + - [23, 16156.0] + - - [512, 256, 169, 72, 512, 512, 72, 72] + - [5, 16795.0] + - - [512, 256, 121, 76, 512, 512, 76, 76] + - [47, 16681.0] + - - [512, 256, 169, 76, 512, 512, 76, 76] + - [22, 17002.0] + - - [512, 256, 49, 882, 512, 512, 882, 882] + - [7, 20867.0] + - - [512, 256, 49, 931, 512, 512, 931, 931] + - [24, 20775.0] + - - [2304, 512, 1, 100, 2304, 2304, 100, 100] + - [18, 13781.0] + - - [2304, 512, 1, 361, 2304, 2304, 361, 361] + - [1, 17144.0] + - - [4608, 510, 1, 100, 4608, 4608, 100, 100] + - [45, 9891.0] + - - [4608, 510, 1, 361, 4608, 4608, 361, 361] + - [30, 16001.0] + - - [340, 256, 49, 1152, 340, 340, 1152, 1152] + - [22, 18034.0] + - - [340, 256, 36, 120, 340, 340, 120, 120] + - [23, 13287.0] + - - [340, 256, 49, 120, 340, 340, 120, 120] + - [6, 13877.0] + - - [340, 256, 64, 120, 340, 340, 120, 120] + - [38, 14205.0] + - - [340, 256, 36, 12000, 340, 340, 12000, 12000] + - [38, 18374.0] + - - [340, 256, 49, 1216, 340, 340, 1216, 1216] + - [12, 18010.0] + - - [340, 256, 36, 18, 340, 340, 18, 18] + - [47, 3989.0] + - - [340, 256, 49, 18, 340, 340, 18, 18] + - [4, 4377.0] + - - [340, 256, 64, 18, 340, 340, 18, 18] + - [21, 4500.0] + - - [340, 256, 36, 1800, 340, 340, 1800, 1800] + - [5, 17897.0] + - - [340, 256, 36, 19, 340, 340, 19, 19] + - [17, 4140.0] + - - [340, 256, 49, 19, 340, 340, 19, 19] + - [11, 4502.0] + - - [340, 256, 64, 19, 340, 340, 19, 19] + - [4, 4794.0] + - - [340, 256, 36, 1900, 340, 340, 1900, 1900] + - [22, 17963.0] + - - [340, 256, 64, 5880, 340, 340, 5880, 5880] + - [24, 18847.0] + - - [340, 256, 49, 7680, 340, 340, 7680, 7680] + - [51, 14535.0] + - - [340, 256, 64, 882, 340, 340, 882, 882] + - [7, 18006.0] + - - [340, 256, 64, 931, 340, 340, 931, 931] + - [24, 18014.0] + - - [510, 256, 49, 120, 510, 510, 120, 120] + - [39, 14780.0] + - - [510, 256, 64, 120, 510, 510, 120, 120] + - [31, 15248.0] + - - [510, 256, 49, 18, 510, 510, 18, 18] + - [23, 3763.0] + - - [510, 256, 64, 18, 510, 510, 18, 18] + - [36, 3931.0] + - - [510, 256, 49, 19, 510, 510, 19, 19] + - [39, 3980.0] + - - [510, 256, 64, 19, 510, 510, 19, 19] + - [19, 4094.0] + - - [510, 256, 36, 480, 510, 510, 480, 480] + - [38, 19237.0] + - - [510, 256, 36, 72, 510, 510, 72, 72] + - [31, 10840.0] + - - [510, 256, 36, 76, 510, 510, 76, 76] + - [39, 11121.0] + - - [510, 512, 36, 1080, 510, 510, 1080, 1080] + - [38, 20658.0] + - - [510, 512, 36, 162, 510, 510, 162, 162] + - [40, 17169.0] + - - [510, 512, 36, 171, 510, 510, 171, 171] + - [38, 17833.0] + - - [510, 512, 49, 1920, 510, 510, 1920, 1920] + - [40, 21473.0] + - - [510, 512, 64, 1920, 510, 510, 1920, 1920] + - [24, 21662.0] + - - [510, 512, 49, 288, 510, 510, 288, 288] + - [40, 19957.0] + - - [510, 512, 64, 288, 510, 510, 288, 288] + - [40, 20318.0] + - - [510, 512, 36, 3000, 510, 510, 3000, 3000] + - [24, 21216.0] + - - [510, 512, 49, 304, 510, 510, 304, 304] + - [40, 19929.0] + - - [510, 512, 64, 304, 510, 510, 304, 304] + - [48, 20369.0] + - - [510, 512, 36, 450, 510, 510, 450, 450] + - [22, 20101.0] + - - [510, 512, 36, 475, 510, 510, 475, 475] + - [30, 20137.0] + - - [510, 512, 49, 480, 510, 510, 480, 480] + - [40, 20723.0] + - - [510, 512, 64, 480, 510, 510, 480, 480] + - [40, 20881.0] + - - [510, 512, 49, 72, 510, 510, 72, 72] + - [38, 13142.0] + - - [510, 512, 64, 72, 510, 510, 72, 72] + - [22, 13676.0] + - - [510, 512, 49, 76, 510, 510, 76, 76] + - [22, 13777.0] + - - [510, 512, 64, 76, 510, 510, 76, 76] + - [30, 14068.0] + - - [1024, 1024, 160, 96, 1024, 1024, 96, 96] + - [5, 19969.0] + - - [2880, 16384, 1, 1920, 2880, 2880, 1920, 1920] + - [24, 21711.0] + - - [1920, 16384, 1, 960, 1920, 1920, 960, 960] + - [20, 22011.0] + - - [3840, 16384, 1, 1920, 3840, 3840, 1920, 1920] + - [24, 22271.0] + - - [1920, 16384, 1, 3840, 1920, 1920, 3840, 3840] + - [14, 21847.0] + - - [25216, 16384, 1, 1920, 25216, 25216, 1920, 1920] + - [7, 21915.0] + - - [1024, 1024, 40, 96, 1024, 1024, 96, 96] + - [30, 19491.0] + - - [2880, 4096, 1, 1920, 2880, 2880, 1920, 1920] + - [3, 21281.0] + - - [1920, 4096, 1, 960, 1920, 1920, 960, 960] + - [20, 21623.0] + - - [3840, 4096, 1, 1920, 3840, 3840, 1920, 1920] + - [3, 22187.0] + - - [1920, 4096, 1, 3840, 1920, 1920, 3840, 3840] + - [14, 21043.0] + - - [25216, 4096, 1, 1920, 25216, 25216, 1920, 1920] + - [24, 22087.0] + - - [1024, 1024, 80, 96, 1024, 1024, 96, 96] + - [12, 19627.0] + - - [2880, 8192, 1, 1920, 2880, 2880, 1920, 1920] + - [24, 21383.0] + - - [1920, 8192, 1, 960, 1920, 1920, 960, 960] + - [20, 21976.0] + - - [3840, 8192, 1, 1920, 3840, 3840, 1920, 1920] + - [40, 22059.0] + - - [1920, 8192, 1, 3840, 1920, 1920, 3840, 3840] + - [32, 21438.0] + - - [25216, 8192, 1, 1920, 25216, 25216, 1920, 1920] + - [7, 21813.0] + - - [1024, 1024, 96, 96, 1024, 1024, 96, 96] + - [5, 19714.0] + - - [1728, 16384, 1, 2304, 1728, 1728, 2304, 2304] + - [7, 21031.0] + - - [2304, 16384, 1, 576, 2304, 2304, 576, 576] + - [20, 22215.0] + - - [2304, 16384, 1, 2304, 2304, 2304, 2304, 2304] + - [7, 21752.0] + - - [12672, 16384, 1, 2304, 12672, 12672, 2304, 2304] + - [14, 21684.0] + - - [1024, 1024, 24, 96, 1024, 1024, 96, 96] + - [5, 18795.0] + - - [1728, 4096, 1, 2304, 1728, 1728, 2304, 2304] + - [7, 19908.0] + - - [2304, 4096, 1, 576, 2304, 2304, 576, 576] + - [18, 21943.0] + - - [2304, 4096, 1, 2304, 2304, 2304, 2304, 2304] + - [32, 21240.0] + - - [12672, 4096, 1, 2304, 12672, 12672, 2304, 2304] + - [7, 21927.0] + - - [1024, 1024, 48, 96, 1024, 1024, 96, 96] + - [5, 19595.0] + - - [1728, 8192, 1, 2304, 1728, 1728, 2304, 2304] + - [14, 20821.0] + - - [2304, 8192, 1, 576, 2304, 2304, 576, 576] + - [18, 22127.0] + - - [2304, 8192, 1, 2304, 2304, 2304, 2304, 2304] + - [48, 21781.0] + - - [12672, 8192, 1, 2304, 12672, 12672, 2304, 2304] + - [14, 21421.0] + - - [1024, 1024, 16, 96, 1024, 1024, 96, 96] + - [12, 18162.0] + - - [1152, 4096, 1, 3072, 1152, 1152, 3072, 3072] + - [14, 19510.0] + - - [3072, 4096, 1, 384, 3072, 3072, 384, 384] + - [7, 20549.0] + - - [1536, 4096, 1, 3072, 1536, 1536, 3072, 3072] + - [14, 20623.0] + - - [3072, 4096, 1, 1536, 3072, 3072, 1536, 1536] + - [14, 21137.0] + - - [6400, 4096, 1, 3072, 6400, 6400, 3072, 3072] + - [48, 21246.0] + - - [1024, 1024, 32, 96, 1024, 1024, 96, 96] + - [12, 18891.0] + - - [1152, 8192, 1, 3072, 1152, 1152, 3072, 3072] + - [14, 18577.0] + - - [3072, 8192, 1, 384, 3072, 3072, 384, 384] + - [7, 21234.0] + - - [1536, 8192, 1, 3072, 1536, 1536, 3072, 3072] + - [40, 19223.0] + - - [3072, 8192, 1, 1536, 3072, 3072, 1536, 1536] + - [14, 21719.0] + - - [6400, 8192, 1, 3072, 6400, 6400, 3072, 3072] + - [48, 21446.0] + - - [2048, 4096, 1, 2048, 2048, 2048, 2048, 2048] + - [14, 20733.0] + - - [2048, 4096, 1, 4096, 2048, 2048, 4096, 4096] + - [14, 20515.0] + - - [29000, 199, 1, 2048, 29000, 29000, 2048, 2048] + - [12, 15157.0] + - - [29000, 221, 1, 2048, 29000, 29000, 2048, 2048] + - [12, 16701.0] + - - [29000, 224, 1, 2048, 29000, 29000, 2048, 2048] + - [12, 16824.0] + - - [29000, 229, 1, 2048, 29000, 29000, 2048, 2048] + - [32, 16981.0] + - - [29000, 234, 1, 2048, 29000, 29000, 2048, 2048] + - [12, 17472.0] + - - [29000, 242, 1, 2048, 29000, 29000, 2048, 2048] + - [12, 17907.0] + - - [29000, 246, 1, 2048, 29000, 29000, 2048, 2048] + - [48, 18142.0] + - - [29000, 247, 1, 2048, 29000, 29000, 2048, 2048] + - [48, 18261.0] + - - [29000, 256, 1, 2048, 29000, 29000, 2048, 2048] + - [48, 19492.0] + - - [29000, 262, 1, 2048, 29000, 29000, 2048, 2048] + - [12, 13399.0] + - - [29000, 264, 1, 2048, 29000, 29000, 2048, 2048] + - [12, 13491.0] + - - [29000, 265, 1, 2048, 29000, 29000, 2048, 2048] + - [12, 13523.0] + - - [29000, 274, 1, 2048, 29000, 29000, 2048, 2048] + - [12, 13830.0] + - - [29000, 277, 1, 2048, 29000, 29000, 2048, 2048] + - [12, 14148.0] + - - [29000, 279, 1, 2048, 29000, 29000, 2048, 2048] + - [12, 14241.0] + - - [29000, 288, 1, 2048, 29000, 29000, 2048, 2048] + - [12, 14576.0] + - - [29000, 296, 1, 2048, 29000, 29000, 2048, 2048] + - [12, 14863.0] + - - [29000, 315, 1, 2048, 29000, 29000, 2048, 2048] + - [12, 15671.0] + - - [29000, 335, 1, 2048, 29000, 29000, 2048, 2048] + - [12, 16673.0] + - - [4096, 4096, 1, 2048, 4096, 4096, 2048, 2048] + - [14, 21120.0] + - - [29000, 2283, 1, 1024, 29000, 29000, 1024, 1024] + - [48, 21348.0] + - - [29000, 2296, 1, 1024, 29000, 29000, 1024, 1024] + - [48, 21434.0] + - - [29000, 2306, 1, 1024, 29000, 29000, 1024, 1024] + - [12, 20063.0] + - - [29000, 2309, 1, 1024, 29000, 29000, 1024, 1024] + - [12, 20142.0] + - - [29000, 2318, 1, 1024, 29000, 29000, 1024, 1024] + - [12, 20187.0] + - - [29000, 2320, 1, 1024, 29000, 29000, 1024, 1024] + - [12, 20211.0] + - - [29000, 2324, 1, 1024, 29000, 29000, 1024, 1024] + - [12, 20263.0] + - - [29000, 2325, 1, 1024, 29000, 29000, 1024, 1024] + - [12, 20265.0] + - - [29000, 2329, 1, 1024, 29000, 29000, 1024, 1024] + - [12, 20312.0] + - - [29000, 2338, 1, 1024, 29000, 29000, 1024, 1024] + - [12, 20344.0] + - - [29000, 2345, 1, 1024, 29000, 29000, 1024, 1024] + - [12, 20470.0] + - - [29000, 2350, 1, 1024, 29000, 29000, 1024, 1024] + - [12, 20468.0] + - - [29000, 2362, 1, 1024, 29000, 29000, 1024, 1024] + - [46, 20706.0] + - - [29000, 2366, 1, 1024, 29000, 29000, 1024, 1024] + - [12, 20570.0] + - - [29000, 2368, 1, 1024, 29000, 29000, 1024, 1024] + - [12, 20626.0] + - - [29000, 2374, 1, 1024, 29000, 29000, 1024, 1024] + - [12, 20593.0] + - - [29000, 2390, 1, 1024, 29000, 29000, 1024, 1024] + - [12, 20740.0] + - - [512, 512, 320, 64, 512, 512, 64, 64] + - [5, 15869.0] + - - [29000, 561, 1, 1024, 29000, 29000, 1024, 1024] + - [46, 18436.0] + - - [29000, 574, 1, 1024, 29000, 29000, 1024, 1024] + - [46, 18835.0] + - - [29000, 600, 1, 1024, 29000, 29000, 1024, 1024] + - [12, 19563.0] + - - [29000, 608, 1, 1024, 29000, 29000, 1024, 1024] + - [30, 19807.0] + - - [29000, 615, 1, 1024, 29000, 29000, 1024, 1024] + - [12, 20019.0] + - - [29000, 622, 1, 1024, 29000, 29000, 1024, 1024] + - [30, 20137.0] + - - [29000, 625, 1, 1024, 29000, 29000, 1024, 1024] + - [46, 20159.0] + - - [29000, 626, 1, 1024, 29000, 29000, 1024, 1024] + - [46, 20178.0] + - - [29000, 628, 1, 1024, 29000, 29000, 1024, 1024] + - [12, 20324.0] + - - [29000, 636, 1, 1024, 29000, 29000, 1024, 1024] + - [46, 20537.0] + - - [29000, 651, 1, 1024, 29000, 29000, 1024, 1024] + - [32, 18066.0] + - - [29000, 658, 1, 1024, 29000, 29000, 1024, 1024] + - [32, 18307.0] + - - [29000, 669, 1, 1024, 29000, 29000, 1024, 1024] + - [32, 18459.0] + - - [29000, 670, 1, 1024, 29000, 29000, 1024, 1024] + - [32, 18581.0] + - - [29000, 672, 1, 1024, 29000, 29000, 1024, 1024] + - [32, 18531.0] + - - [29000, 684, 1, 1024, 29000, 29000, 1024, 1024] + - [32, 18911.0] + - - [29000, 716, 1, 1024, 29000, 29000, 1024, 1024] + - [32, 19733.0] + - - [29000, 730, 1, 1024, 29000, 29000, 1024, 1024] + - [32, 20050.0] + - - [2560, 1024, 1, 2560, 2560, 2560, 2560, 2560] + - [14, 20324.0] + - - [2560, 1024, 1, 4096, 2560, 2560, 4096, 4096] + - [14, 20925.0] + - - [4096, 1024, 1, 2560, 4096, 4096, 2560, 2560] + - [14, 19204.0] + - - [1024, 1024, 512, 64, 1024, 1024, 64, 64] + - [12, 17661.0] + - - [1024, 32768, 1, 4096, 1024, 1024, 4096, 4096] + - [48, 18872.0] + - - [3072, 32768, 1, 1024, 3072, 3072, 1024, 1024] + - [14, 21391.0] + - - [4096, 32768, 1, 1024, 4096, 4096, 1024, 1024] + - [14, 21315.0] + - - [50304, 32768, 1, 1024, 50304, 50304, 1024, 1024] + - [48, 21688.0] + - - [1024, 1024, 24, 128, 1024, 1024, 128, 128] + - [5, 19520.0] + - - [128, 1024, 24, 1024, 128, 128, 1024, 1024] + - [48, 18466.0] + - - [128, 128, 49, 12800, 128, 128, 12800, 12800] + - [56, 13731.0] + - - [128, 128, 25, 25088, 128, 128, 25088, 25088] + - [54, 13363.0] + - - [128, 128, 49, 25600, 128, 128, 25600, 25600] + - [57, 12513.0] + - - [128, 128, 25, 50176, 128, 128, 50176, 50176] + - [54, 12548.0] + - - [128, 128, 36, 12544, 128, 128, 12544, 12544] + - [57, 13527.0] + - - [128, 128, 49, 9216, 128, 128, 9216, 9216] + - [57, 12430.0] + - - [1024, 1024, 1, 12544, 1024, 1024, 12544, 12544] + - [52, 18711.0] + - - [1024, 1000, 1, 12544, 1024, 1024, 12544, 12544] + - [53, 17637.0] + - - [128, 128, 36, 12000, 128, 128, 12000, 12000] + - [55, 16608.0] + - - [5888, 128, 1, 1, 5888, 5888, 1, 1] + - [67, 209.0] + - - [1856, 256, 1, 1, 1856, 1856, 1, 1] + - [67, 208.0] + - - [256, 1856, 1, 32, 256, 256, 32, 32] + - [73, 3263.0] + - - [128, 3584, 1, 1280, 128, 128, 1280, 1280] + - [68, 13219.0] + - - [2944, 128, 1, 32, 2944, 2944, 32, 32] + - [58, 3745.0] + - - [64, 6784, 1, 1, 64, 64, 1, 1] + - [67, 158.0] + - - [64, 5056, 1, 3328, 64, 64, 3328, 3328] + - [97, 11936.0] + - - [704, 1024, 1, 1, 704, 704, 1, 1] + - [58, 282.0] + - - [256, 1856, 1, 1280, 256, 256, 1280, 1280] + - [68, 13885.0] + - - [256, 1408, 1, 1, 256, 256, 1, 1] + - [73, 81.0] + - - [1024, 1024, 1, 1280, 1024, 1024, 1280, 1280] + - [63, 13718.0] + - - [704, 1408, 1, 3328, 704, 704, 3328, 3328] + - [77, 13397.0] + - - [1408, 704, 1, 256, 1408, 1408, 256, 256] + - [90, 11278.0] + - - [6784, 128, 1, 3328, 6784, 6784, 3328, 3328] + - [63, 15570.0] + - - [2944, 256, 1, 1, 2944, 2944, 1, 1] + - [86, 154.0] + - - [2944, 256, 1, 32, 2944, 2944, 32, 32] + - [88, 4074.0] + - - [128, 4288, 1, 3328, 128, 128, 3328, 3328] + - [83, 13878.0] + - - [5056, 128, 1, 256, 5056, 5056, 256, 256] + - [99, 10152.0] + - - [1856, 704, 1, 1280, 1856, 1856, 1280, 1280] + - [99, 13937.0] + - - [2368, 256, 1, 32, 2368, 2368, 32, 32] + - [94, 3288.0] + - - [5056, 64, 1, 32, 5056, 5056, 32, 32] + - [61, 1817.0] + - - [64, 6784, 1, 3328, 64, 64, 3328, 3328] + - [61, 11135.0] + - - [2944, 256, 1, 1280, 2944, 2944, 1280, 1280] + - [76, 13339.0] + - - [1024, 1024, 1, 3328, 1024, 1024, 3328, 3328] + - [77, 14132.0] + - - [5888, 64, 1, 256, 5888, 5888, 256, 256] + - [98, 7364.0] + - - [2944, 448, 1, 256, 2944, 2944, 256, 256] + - [90, 11732.0] + - - [5056, 64, 1, 3328, 5056, 5056, 3328, 3328] + - [82, 12946.0] + - - [1024, 448, 1, 32, 1024, 1024, 32, 32] + - [65, 2594.0] + - - [128, 2944, 1, 3328, 128, 128, 3328, 3328] + - [75, 11980.0] + - - [256, 1856, 1, 1, 256, 256, 1, 1] + - [90, 123.0] + - - [256, 3584, 1, 3328, 256, 256, 3328, 3328] + - [63, 16093.0] + - - [256, 4288, 1, 1280, 256, 256, 1280, 1280] + - [98, 14515.0] + - - [4288, 256, 1, 256, 4288, 4288, 256, 256] + - [63, 11527.0] + - - [128, 5888, 1, 32, 128, 128, 32, 32] + - [72, 3733.0] + - - [128, 5888, 1, 1280, 128, 128, 1280, 1280] + - [68, 13325.0] + - - [3584, 256, 1, 256, 3584, 3584, 256, 256] + - [91, 12388.0] + - - [1856, 256, 1, 256, 1856, 1856, 256, 256] + - [89, 10343.0] + - - [1024, 704, 1, 1, 1024, 1024, 1, 1] + - [97, 146.0] + - - [448, 1408, 1, 3328, 448, 448, 3328, 3328] + - [97, 11659.0] + - - [1024, 704, 1, 32, 1024, 1024, 32, 32] + - [84, 4453.0] + - - [448, 2944, 1, 256, 448, 448, 256, 256] + - [83, 11732.0] + - - [5888, 128, 1, 3328, 5888, 5888, 3328, 3328] + - [90, 13661.0] + - - [2944, 448, 1, 1, 2944, 2944, 1, 1] + - [58, 217.0] + - - [5056, 64, 1, 1280, 5056, 5056, 1280, 1280] + - [97, 10814.0] + - - [704, 704, 1, 32, 704, 704, 32, 32] + - [72, 2773.0] + - - [256, 4288, 1, 256, 256, 256, 256, 256] + - [76, 12009.0] + - - [5056, 128, 1, 1, 5056, 5056, 1, 1] + - [79, 127.0] + - - [704, 448, 1, 1280, 704, 704, 1280, 1280] + - [75, 10748.0] + - - [1024, 704, 1, 1280, 1024, 1024, 1280, 1280] + - [62, 12805.0] + - - [2368, 448, 1, 256, 2368, 2368, 256, 256] + - [91, 11085.0] + - - [4288, 256, 1, 3328, 4288, 4288, 3328, 3328] + - [77, 14759.0] + - - [128, 6784, 1, 32, 128, 128, 32, 32] + - [58, 4383.0] + - - [128, 6784, 1, 3328, 128, 128, 3328, 3328] + - [91, 15176.0] + - - [4288, 128, 1, 1, 4288, 4288, 1, 1] + - [94, 111.0] + - - [256, 2368, 1, 32, 256, 256, 32, 32] + - [87, 3299.0] + - - [448, 1024, 1, 256, 448, 448, 256, 256] + - [75, 8317.0] + - - [256, 1408, 1, 32, 256, 256, 32, 32] + - [59, 2031.0] + - - [256, 3584, 1, 32, 256, 256, 32, 32] + - [58, 4720.0] + - - [128, 4288, 1, 32, 128, 128, 32, 32] + - [86, 2744.0] + - - [448, 1856, 1, 1, 448, 448, 1, 1] + - [58, 160.0] + - - [448, 1856, 1, 32, 448, 448, 32, 32] + - [58, 4264.0] + - - [448, 1856, 1, 3328, 448, 448, 3328, 3328] + - [77, 14547.0] + - - [1024, 448, 1, 256, 1024, 1024, 256, 256] + - [82, 9020.0] + - - [704, 1856, 1, 32, 704, 704, 32, 32] + - [97, 5333.0] + - - [704, 1408, 1, 32, 704, 704, 32, 32] + - [94, 4748.0] + - - [5888, 128, 1, 32, 5888, 5888, 32, 32] + - [66, 4047.0] + - - [128, 4288, 1, 1280, 128, 128, 1280, 1280] + - [69, 13049.0] + - - [1856, 704, 1, 3328, 1856, 1856, 3328, 3328] + - [83, 14044.0] + - - [4288, 128, 1, 256, 4288, 4288, 256, 256] + - [83, 8620.0] + - - [704, 1856, 1, 3328, 704, 704, 3328, 3328] + - [99, 14092.0] + - - [2944, 128, 1, 1280, 2944, 2944, 1280, 1280] + - [76, 10454.0] + - - [1408, 448, 1, 1280, 1408, 1408, 1280, 1280] + - [98, 13461.0] + - - [128, 2368, 1, 1, 128, 128, 1, 1] + - [58, 66.0] + - - [128, 2944, 1, 1280, 128, 128, 1280, 1280] + - [98, 10541.0] + - - [1024, 448, 1, 1, 1024, 1024, 1, 1] + - [94, 93.0] + - - [256, 2944, 1, 256, 256, 256, 256, 256] + - [90, 10925.0] + - - [704, 448, 1, 32, 704, 704, 32, 32] + - [72, 1815.0] + - - [704, 1024, 1, 256, 704, 704, 256, 256] + - [62, 9954.0] + - - [1408, 448, 1, 3328, 1408, 1408, 3328, 3328] + - [98, 14163.0] + - - [256, 2368, 1, 1, 256, 256, 1, 1] + - [73, 223.0] + - - [5888, 64, 1, 3328, 5888, 5888, 3328, 3328] + - [82, 11161.0] + - - [704, 448, 1, 3328, 704, 704, 3328, 3328] + - [61, 11722.0] + - - [4288, 256, 1, 1, 4288, 4288, 1, 1] + - [72, 181.0] + - - [1856, 448, 1, 3328, 1856, 1856, 3328, 3328] + - [91, 14524.0] + - - [4288, 256, 1, 1280, 4288, 4288, 1280, 1280] + - [83, 14361.0] + - - [448, 2368, 1, 1280, 448, 448, 1280, 1280] + - [63, 13985.0] + - - [3584, 256, 1, 1, 3584, 3584, 1, 1] + - [88, 310.0] + - - [2368, 448, 1, 32, 2368, 2368, 32, 32] + - [79, 4992.0] + - - [1408, 704, 1, 1, 1408, 1408, 1, 1] + - [58, 180.0] + - - [2368, 256, 1, 256, 2368, 2368, 256, 256] + - [69, 10130.0] + - - [1856, 256, 1, 1280, 1856, 1856, 1280, 1280] + - [68, 12777.0] + - - [256, 2944, 1, 1, 256, 256, 1, 1] + - [100, 188.0] + - - [6784, 64, 1, 1, 6784, 6784, 1, 1] + - [82, 163.0] + - - [6784, 64, 1, 256, 6784, 6784, 256, 256] + - [98, 8459.0] + - - [448, 2368, 1, 256, 448, 448, 256, 256] + - [69, 11094.0] + - - [128, 2368, 1, 3328, 128, 128, 3328, 3328] + - [77, 13238.0] + - - [64, 5056, 1, 256, 64, 64, 256, 256] + - [67, 7104.0] + - - [2368, 448, 1, 3328, 2368, 2368, 3328, 3328] + - [77, 14195.0] + - - [256, 2368, 1, 3328, 256, 256, 3328, 3328] + - [77, 15417.0] + - - [5888, 64, 1, 1, 5888, 5888, 1, 1] + - [71, 134.0] + - - [256, 3584, 1, 1, 256, 256, 1, 1] + - [95, 172.0] + - - [704, 1856, 1, 1280, 704, 704, 1280, 1280] + - [77, 13700.0] + - - [448, 1024, 1, 3328, 448, 448, 3328, 3328] + - [61, 12255.0] + - - [128, 5056, 1, 32, 128, 128, 32, 32] + - [90, 4125.0] + - - [128, 5056, 1, 1280, 128, 128, 1280, 1280] + - [99, 15155.0] + - - [5888, 64, 1, 32, 5888, 5888, 32, 32] + - [79, 2169.0] + - - [2368, 256, 1, 1, 2368, 2368, 1, 1] + - [91, 167.0] + - - [5888, 64, 1, 1280, 5888, 5888, 1280, 1280] + - [68, 10472.0] + - - [256, 1408, 1, 256, 256, 256, 256, 256] + - [67, 7780.0] + - - [5056, 64, 1, 1, 5056, 5056, 1, 1] + - [71, 87.0] + - - [1408, 448, 1, 32, 1408, 1408, 32, 32] + - [86, 5734.0] + - - [5056, 128, 1, 1280, 5056, 5056, 1280, 1280] + - [69, 15992.0] + - - [1856, 704, 1, 256, 1856, 1856, 256, 256] + - [63, 11518.0] + - - [128, 6784, 1, 256, 128, 128, 256, 256] + - [76, 12174.0] + - - [256, 3584, 1, 256, 256, 256, 256, 256] + - [69, 12494.0] + - - [448, 704, 1, 1, 448, 448, 1, 1] + - [79, 69.0] + - - [448, 704, 1, 32, 448, 448, 32, 32] + - [58, 1862.0] + - - [448, 704, 1, 3328, 448, 448, 3328, 3328] + - [61, 11783.0] + - - [64, 5888, 1, 1, 64, 64, 1, 1] + - [72, 76.0] + - - [2368, 128, 1, 32, 2368, 2368, 32, 32] + - [58, 1738.0] + - - [2368, 256, 1, 1280, 2368, 2368, 1280, 1280] + - [69, 14248.0] + - - [2368, 128, 1, 3328, 2368, 2368, 3328, 3328] + - [83, 13410.0] + - - [4288, 256, 1, 32, 4288, 4288, 32, 32] + - [86, 5004.0] + - - [448, 1408, 1, 1, 448, 448, 1, 1] + - [79, 127.0] + - - [1408, 256, 1, 256, 1408, 1408, 256, 256] + - [97, 7690.0] + - - [256, 4288, 1, 32, 256, 256, 32, 32] + - [86, 5322.0] + - - [1408, 256, 1, 1280, 1408, 1408, 1280, 1280] + - [89, 11242.0] + - - [448, 1408, 1, 256, 448, 448, 256, 256] + - [89, 9443.0] + - - [128, 2944, 1, 32, 128, 128, 32, 32] + - [94, 3224.0] + - - [1856, 448, 1, 1, 1856, 1856, 1, 1] + - [82, 208.0] + - - [704, 704, 1, 1, 704, 704, 1, 1] + - [96, 112.0] + - - [1856, 448, 1, 1280, 1856, 1856, 1280, 1280] + - [77, 13891.0] + - - [128, 5888, 1, 256, 128, 128, 256, 256] + - [68, 10864.0] + - - [3584, 256, 1, 3328, 3584, 3584, 3328, 3328] + - [91, 16120.0] + - - [448, 2368, 1, 1, 448, 448, 1, 1] + - [94, 185.0] + - - [128, 6784, 1, 1, 128, 128, 1, 1] + - [61, 172.0] + - - [256, 2944, 1, 3328, 256, 256, 3328, 3328] + - [62, 13888.0] + - - [64, 5888, 1, 256, 64, 64, 256, 256] + - [67, 8660.0] + - - [704, 704, 1, 256, 704, 704, 256, 256] + - [89, 8860.0] + - - [448, 1024, 1, 32, 448, 448, 32, 32] + - [86, 2594.0] + - - [256, 2368, 1, 256, 256, 256, 256, 256] + - [62, 10183.0] + - - [448, 704, 1, 1280, 448, 448, 1280, 1280] + - [89, 10742.0] + - - [704, 1856, 1, 1, 704, 704, 1, 1] + - [58, 192.0] + - - [704, 448, 1, 256, 704, 704, 256, 256] + - [67, 7070.0] + - - [2368, 448, 1, 1280, 2368, 2368, 1280, 1280] + - [91, 13678.0] + - - [128, 5056, 1, 1, 128, 128, 1, 1] + - [61, 121.0] + - - [256, 2368, 1, 1280, 256, 256, 1280, 1280] + - [91, 14859.0] + - - [64, 6784, 1, 256, 64, 64, 256, 256] + - [67, 7794.0] + - - [128, 3584, 1, 256, 128, 128, 256, 256] + - [90, 9034.0] + - - [704, 1408, 1, 1, 704, 704, 1, 1] + - [58, 182.0] + - - [4288, 128, 1, 3328, 4288, 4288, 3328, 3328] + - [63, 14101.0] + - - [128, 6784, 1, 1280, 128, 128, 1280, 1280] + - [69, 14594.0] + - - [3584, 256, 1, 32, 3584, 3584, 32, 32] + - [72, 4661.0] + - - [1408, 256, 1, 32, 1408, 1408, 32, 32] + - [72, 2082.0] + - - [5888, 128, 1, 256, 5888, 5888, 256, 256] + - [76, 10876.0] + - - [128, 5056, 1, 3328, 128, 128, 3328, 3328] + - [69, 16324.0] + - - [1024, 448, 1, 3328, 1024, 1024, 3328, 3328] + - [68, 13399.0] + - - [3584, 128, 1, 1, 3584, 3584, 1, 1] + - [67, 93.0] + - - [128, 2368, 1, 256, 128, 128, 256, 256] + - [68, 7211.0] + - - [448, 1856, 1, 256, 448, 448, 256, 256] + - [69, 10455.0] + - - [3584, 128, 1, 256, 3584, 3584, 256, 256] + - [68, 8857.0] + - - [1024, 448, 1, 1280, 1024, 1024, 1280, 1280] + - [82, 12552.0] + - - [128, 5888, 1, 1, 128, 128, 1, 1] + - [102, 209.0] + - - [64, 5056, 1, 1, 64, 64, 1, 1] + - [58, 63.0] + - - [1856, 256, 1, 32, 1856, 1856, 32, 32] + - [94, 2677.0] + - - [64, 5056, 1, 32, 64, 64, 32, 32] + - [94, 1692.0] + - - [1408, 704, 1, 32, 1408, 1408, 32, 32] + - [66, 5003.0] + - - [1408, 704, 1, 1280, 1408, 1408, 1280, 1280] + - [91, 13280.0] + - - [1024, 1024, 1, 32, 1024, 1024, 32, 32] + - [98, 6848.0] + - - [5056, 128, 1, 3328, 5056, 5056, 3328, 3328] + - [91, 16647.0] + - - [128, 4288, 1, 1, 128, 128, 1, 1] + - [80, 178.0] + - - [2944, 128, 1, 3328, 2944, 2944, 3328, 3328] + - [82, 11150.0] + - - [5888, 128, 1, 1280, 5888, 5888, 1280, 1280] + - [82, 13365.0] + - - [2944, 128, 1, 256, 2944, 2944, 256, 256] + - [89, 7596.0] + - - [6784, 128, 1, 1, 6784, 6784, 1, 1] + - [74, 275.0] + - - [1408, 256, 1, 3328, 1408, 1408, 3328, 3328] + - [76, 10915.0] + - - [2944, 256, 1, 256, 2944, 2944, 256, 256] + - [82, 10743.0] + - - [6784, 128, 1, 256, 6784, 6784, 256, 256] + - [83, 11952.0] + - - [6784, 64, 1, 1280, 6784, 6784, 1280, 1280] + - [98, 11988.0] + - - [2944, 448, 1, 1280, 2944, 2944, 1280, 1280] + - [77, 13838.0] + - - [704, 448, 1, 1, 704, 704, 1, 1] + - [58, 67.0] + - - [256, 1408, 1, 3328, 256, 256, 3328, 3328] + - [61, 11977.0] + - - [2944, 128, 1, 1, 2944, 2944, 1, 1] + - [66, 83.0] + - - [704, 1024, 1, 32, 704, 704, 32, 32] + - [86, 3832.0] + - - [3584, 256, 1, 1280, 3584, 3584, 1280, 1280] + - [91, 15449.0] + - - [3584, 128, 1, 1280, 3584, 3584, 1280, 1280] + - [98, 12563.0] + - - [256, 1856, 1, 256, 256, 256, 256, 256] + - [62, 9400.0] + - - [256, 2944, 1, 1280, 256, 256, 1280, 1280] + - [62, 13813.0] + - - [2944, 256, 1, 3328, 2944, 2944, 3328, 3328] + - [90, 13682.0] + - - [704, 1024, 1, 3328, 704, 704, 3328, 3328] + - [82, 13163.0] + - - [448, 2944, 1, 1, 448, 448, 1, 1] + - [94, 220.0] + - - [448, 1856, 1, 1280, 448, 448, 1280, 1280] + - [63, 13808.0] + - - [2368, 448, 1, 1, 2368, 2368, 1, 1] + - [75, 308.0] + - - [448, 2944, 1, 32, 448, 448, 32, 32] + - [94, 8179.0] + - - [448, 2944, 1, 1280, 448, 448, 1280, 1280] + - [91, 13813.0] + - - [128, 2368, 1, 1280, 128, 128, 1280, 1280] + - [99, 11581.0] + - - [448, 2944, 1, 3328, 448, 448, 3328, 3328] + - [77, 14247.0] + - - [2368, 128, 1, 1280, 2368, 2368, 1280, 1280] + - [69, 11547.0] + - - [128, 3584, 1, 3328, 128, 128, 3328, 3328] + - [90, 13516.0] + - - [256, 4288, 1, 3328, 256, 256, 3328, 3328] + - [77, 14835.0] + - - [1856, 704, 1, 32, 1856, 1856, 32, 32] + - [58, 5333.0] + - - [2944, 448, 1, 32, 2944, 2944, 32, 32] + - [66, 5658.0] + - - [5056, 128, 1, 32, 5056, 5056, 32, 32] + - [86, 3522.0] + - - [6784, 128, 1, 1280, 6784, 6784, 1280, 1280] + - [77, 14972.0] + - - [1408, 704, 1, 3328, 1408, 1408, 3328, 3328] + - [91, 13323.0] + - - [1856, 704, 1, 1, 1856, 1856, 1, 1] + - [94, 216.0] + - - [256, 1856, 1, 3328, 256, 256, 3328, 3328] + - [62, 14001.0] + - - [4288, 128, 1, 1280, 4288, 4288, 1280, 1280] + - [83, 13020.0] + - - [128, 4288, 1, 256, 128, 128, 256, 256] + - [68, 9418.0] + - - [6784, 128, 1, 32, 6784, 6784, 32, 32] + - [87, 4369.0] + - - [1408, 448, 1, 1, 1408, 1408, 1, 1] + - [88, 132.0] + - - [64, 5056, 1, 1280, 64, 64, 1280, 1280] + - [67, 10697.0] + - - [448, 1408, 1, 32, 448, 448, 32, 32] + - [100, 5097.0] + - - [128, 5056, 1, 256, 128, 128, 256, 256] + - [68, 10772.0] + - - [1024, 1024, 1, 1, 1024, 1024, 1, 1] + - [79, 186.0] + - - [256, 1408, 1, 1280, 256, 256, 1280, 1280] + - [75, 10996.0] + - - [64, 5888, 1, 3328, 64, 64, 3328, 3328] + - [75, 10191.0] + - - [6784, 64, 1, 3328, 6784, 6784, 3328, 3328] + - [82, 12631.0] + - - [2944, 448, 1, 3328, 2944, 2944, 3328, 3328] + - [91, 14304.0] + - - [448, 1408, 1, 1280, 448, 448, 1280, 1280] + - [61, 11654.0] + - - [2368, 128, 1, 1, 2368, 2368, 1, 1] + - [61, 55.0] + - - [5056, 64, 1, 256, 5056, 5056, 256, 256] + - [67, 7370.0] + - - [2368, 128, 1, 256, 2368, 2368, 256, 256] + - [67, 6724.0] + - - [64, 6784, 1, 32, 64, 64, 32, 32] + - [72, 2157.0] + - - [256, 4288, 1, 1, 256, 256, 1, 1] + - [66, 195.0] + - - [128, 2944, 1, 256, 128, 128, 256, 256] + - [81, 7780.0] + - - [3584, 128, 1, 32, 3584, 3584, 32, 32] + - [73, 2488.0] + - - [3584, 128, 1, 3328, 3584, 3584, 3328, 3328] + - [68, 13449.0] + - - [704, 704, 1, 3328, 704, 704, 3328, 3328] + - [99, 12668.0] + - - [128, 2944, 1, 1, 128, 128, 1, 1] + - [77, 76.0] + - - [704, 1408, 1, 1280, 704, 704, 1280, 1280] + - [99, 13170.0] + - - [6784, 64, 1, 32, 6784, 6784, 32, 32] + - [94, 2363.0] + - - [64, 6784, 1, 1280, 64, 64, 1280, 1280] + - [67, 10349.0] + - - [704, 1408, 1, 256, 704, 704, 256, 256] + - [63, 10789.0] + - - [4288, 128, 1, 32, 4288, 4288, 32, 32] + - [58, 3114.0] + - - [448, 704, 1, 256, 448, 448, 256, 256] + - [67, 6984.0] + - - [1856, 256, 1, 3328, 1856, 1856, 3328, 3328] + - [98, 13788.0] + - - [448, 1024, 1, 1280, 448, 448, 1280, 1280] + - [61, 11660.0] + - - [1024, 1024, 1, 256, 1024, 1024, 256, 256] + - [63, 11712.0] + - - [256, 2944, 1, 32, 256, 256, 32, 32] + - [58, 3980.0] + - - [704, 1024, 1, 1280, 704, 704, 1280, 1280] + - [68, 12637.0] + - - [256, 3584, 1, 1280, 256, 256, 1280, 1280] + - [63, 15449.0] + - - [128, 2368, 1, 32, 128, 128, 32, 32] + - [87, 1690.0] + - - [704, 1856, 1, 256, 704, 704, 256, 256] + - [77, 11574.0] + - - [1856, 448, 1, 32, 1856, 1856, 32, 32] + - [67, 6554.0] + - - [1408, 448, 1, 256, 1408, 1408, 256, 256] + - [90, 10568.0] + - - [448, 1024, 1, 1, 448, 448, 1, 1] + - [79, 96.0] + - - [1024, 704, 1, 256, 1024, 1024, 256, 256] + - [82, 10522.0] + - - [64, 5888, 1, 32, 64, 64, 32, 32] + - [65, 1926.0] + - - [1856, 448, 1, 256, 1856, 1856, 256, 256] + - [91, 10643.0] + - - [128, 5888, 1, 3328, 128, 128, 3328, 3328] + - [90, 13870.0] + - - [2368, 256, 1, 3328, 2368, 2368, 3328, 3328] + - [63, 15707.0] + - - [64, 5888, 1, 1280, 64, 64, 1280, 1280] + - [97, 9925.0] + - - [1408, 256, 1, 1, 1408, 1408, 1, 1] + - [61, 77.0] + - - [704, 704, 1, 1280, 704, 704, 1280, 1280] + - [89, 11862.0] + - - [128, 3584, 1, 1, 128, 128, 1, 1] + - [66, 88.0] + - - [128, 3584, 1, 32, 128, 128, 32, 32] + - [58, 2353.0] + - - [448, 2368, 1, 32, 448, 448, 32, 32] + - [58, 4906.0] + - - [448, 2368, 1, 3328, 448, 448, 3328, 3328] + - [77, 14192.0] + - - [1024, 704, 1, 3328, 1024, 1024, 3328, 3328] + - [76, 13208.0] + - - [2048, 400, 1, 512, 2048, 2048, 512, 512] + - [82, 11038.0] + - - [2560, 128, 1, 2560, 2560, 2560, 2560, 2560] + - [69, 13689.0] + - - [1024, 700, 1, 512, 1024, 1024, 512, 512] + - [98, 11135.0] + - - [4096, 128, 1, 4096, 4096, 4096, 4096, 4096] + - [69, 13201.0] + - - [3072, 128, 1, 1024, 3072, 3072, 1024, 1024] + - [82, 10298.0] + - - [7680, 64, 1, 2560, 7680, 7680, 2560, 2560] + - [68, 13423.0] + - - [7680, 128, 1, 2560, 7680, 7680, 2560, 2560] + - [83, 16730.0] + - - [1024, 1024, 1, 1024, 1024, 1024, 1024, 1024] + - [83, 13469.0] + - - [2049, 512, 1, 2048, 2049, 2049, 2048, 2048] + - [69, 13725.0] + - - [1023, 512, 1, 1024, 1023, 1023, 1024, 1024] + - [99, 11829.0] + - - [1024, 512, 1, 1025, 1024, 1024, 1025, 1025] + - [99, 12918.0] + - - [1024, 1024, 1, 1023, 1024, 1024, 1023, 1023] + - [83, 13888.0] + - - [1024, 1025, 1, 1024, 1024, 1024, 1024, 1024] + - [69, 13375.0] + - - [1024, 1023, 1, 1024, 1024, 1024, 1024, 1024] + - [69, 13110.0] + - - [2048, 511, 1, 2048, 2048, 2048, 2048, 2048] + - [69, 13660.0] + - - [2047, 512, 1, 2048, 2047, 2047, 2048, 2048] + - [69, 13712.0] + - - [1025, 1024, 1, 1024, 1025, 1025, 1024, 1024] + - [99, 13207.0] + - - [1024, 1024, 1, 1025, 1024, 1024, 1025, 1025] + - [63, 13496.0] + - - [1025, 512, 1, 1024, 1025, 1025, 1024, 1024] + - [99, 12735.0] + - - [1024, 512, 1, 1023, 1024, 1024, 1023, 1023] + - [77, 13217.0] + - - [2048, 513, 1, 2048, 2048, 2048, 2048, 2048] + - [69, 13641.0] + - - [1024, 511, 1, 1024, 1024, 1024, 1024, 1024] + - [83, 11608.0] + - - [2048, 512, 1, 2047, 2048, 2048, 2047, 2047] + - [77, 13972.0] + - - [1024, 513, 1, 1024, 1024, 1024, 1024, 1024] + - [99, 12083.0] + - - [2048, 512, 1, 2049, 2048, 2048, 2049, 2049] + - [91, 13935.0] + - - [1023, 1024, 1, 1024, 1023, 1023, 1024, 1024] + - [68, 13220.0] + - - [64, 128, 512, 128, 64, 64, 128, 128] + - [100, 8918.0] + - - [64, 512, 64, 512, 64, 64, 512, 512] + - [70, 9020.0] + - - [256, 1280, 1, 1024, 256, 256, 1024, 1024] + - [83, 11865.0] + - - [256, 1536, 1, 1024, 256, 256, 1024, 1024] + - [68, 10663.0] + - - [256, 2304, 1, 1024, 256, 256, 1024, 1024] + - [69, 13827.0] + - - [256, 2560, 1, 1024, 256, 256, 1024, 1024] + - [69, 15128.0] + - - [256, 2816, 1, 1024, 256, 256, 1024, 1024] + - [68, 12470.0] + - - [256, 3328, 1, 1024, 256, 256, 1024, 1024] + - [69, 14062.0] + - - [256, 3584, 1, 1024, 256, 256, 1024, 1024] + - [69, 15105.0] + - - [512, 1600, 1, 512, 512, 512, 512, 512] + - [82, 13010.0] + - - [1024, 512, 1, 1024, 1024, 1024, 1024, 1024] + - [69, 12319.0] + - - [1024, 512, 1, 1600, 1024, 1024, 1600, 1600] + - [77, 13111.0] + - - [1024, 960, 1, 1024, 1024, 1024, 1024, 1024] + - [68, 14122.0] + - - [1024, 960, 1, 1600, 1024, 1024, 1600, 1600] + - [90, 14903.0] + - - [2048, 215, 1, 512, 2048, 2048, 512, 512] + - [97, 9187.0] + - - [2048, 215, 1, 768, 2048, 2048, 768, 768] + - [89, 9813.0] + - - [2048, 256, 1, 512, 2048, 2048, 512, 512] + - [69, 11355.0] + - - [2048, 256, 1, 768, 2048, 2048, 768, 768] + - [99, 12056.0] + - - [2048, 512, 1, 67, 2048, 2048, 67, 67] + - [72, 7364.0] + - - [2048, 512, 1, 74, 2048, 2048, 74, 74] + - [86, 8117.0] + - - [2048, 512, 1, 100, 2048, 2048, 100, 100] + - [86, 8887.0] + - - [2048, 512, 1, 2048, 2048, 2048, 2048, 2048] + - [83, 13661.0] + - - [1024, 512, 1, 4096, 1024, 1024, 4096, 4096] + - [83, 13115.0] + - - [30522, 77, 1, 1024, 30522, 30522, 1024, 1024] + - [83, 10247.0] + - - [1024, 780, 1, 1024, 1024, 1024, 1024, 1024] + - [83, 12991.0] + - - [1024, 800, 1, 1024, 1024, 1024, 1024, 1024] + - [69, 13307.0] + - - [1024, 820, 1, 1024, 1024, 1024, 1024, 1024] + - [83, 13486.0] + - - [1024, 385, 1, 1024, 1024, 1024, 1024, 1024] + - [82, 10272.0] + - - [1024, 462, 1, 1024, 1024, 1024, 1024, 1024] + - [99, 11005.0] + - - [1024, 1024, 1, 4096, 1024, 1024, 4096, 4096] + - [69, 14009.0] + - - [480, 1024, 1, 1024, 480, 480, 1024, 1024] + - [69, 11455.0] + - - [480, 2048, 1, 2048, 480, 480, 2048, 2048] + - [98, 12912.0] + - - [1024, 480, 1, 1024, 1024, 1024, 1024, 1024] + - [99, 11225.0] + - - [2048, 480, 1, 2048, 2048, 2048, 2048, 2048] + - [99, 12884.0] + - - [64, 1024, 256, 1024, 64, 64, 1024, 1024] + - [84, 9934.0] + - - [64, 512, 40, 512, 64, 64, 512, 512] + - [84, 8879.0] + - - [96, 1024, 64, 1024, 96, 96, 1024, 1024] + - [69, 13061.0] + - - [64, 1024, 128, 1024, 64, 64, 1024, 1024] + - [70, 9859.0] + - - [64, 1024, 32, 1024, 64, 64, 1024, 1024] + - [84, 9088.0] + - - [64, 512, 128, 512, 64, 64, 512, 512] + - [100, 9521.0] + - - [96, 1024, 128, 1024, 96, 96, 1024, 1024] + - [69, 13389.0] + - - [64, 512, 256, 512, 64, 64, 512, 512] + - [100, 9700.0] + - - [64, 1024, 64, 1024, 64, 64, 1024, 1024] + - [84, 9666.0] + - - [960, 1024, 1, 1024, 960, 960, 1024, 1024] + - [69, 13228.0] + - - [64, 128, 1024, 128, 64, 64, 128, 128] + - [89, 9788.0] + - - [1024, 864, 1, 1024, 1024, 1024, 1024, 1024] + - [83, 14985.0] + - - [1024, 864, 1, 480, 1024, 1024, 480, 480] + - [91, 14514.0] + - - [128, 3456, 1, 256, 128, 128, 256, 256] + - [68, 10889.0] + - - [128, 4096, 1, 256, 128, 128, 256, 256] + - [69, 11355.0] + - - [128, 6912, 1, 256, 128, 128, 256, 256] + - [69, 11833.0] + - - [256, 3456, 1, 512, 256, 256, 512, 512] + - [99, 13743.0] + - - [256, 4096, 1, 512, 256, 256, 512, 512] + - [69, 12887.0] + - - [512, 864, 1, 1024, 512, 512, 1024, 1024] + - [98, 11651.0] + - - [512, 864, 1, 13, 512, 512, 13, 13] + - [82, 786.0] + - - [64, 128, 1280, 128, 64, 64, 128, 128] + - [78, 8643.0] + - - [64, 128, 1312, 128, 64, 64, 128, 128] + - [78, 8739.0] + - - [64, 512, 192, 512, 64, 64, 512, 512] + - [100, 9712.0] + - - [1024, 512, 1, 196, 1024, 1024, 196, 196] + - [86, 10317.0] + - - [2048, 512, 1, 49, 2048, 2048, 49, 49] + - [72, 9768.0] + - - [2304, 256, 1, 196, 2304, 2304, 196, 196] + - [88, 12621.0] + - - [512, 1024, 1, 196, 512, 512, 196, 196] + - [59, 10772.0] + - - [512, 2048, 1, 49, 512, 512, 49, 49] + - [79, 9410.0] + - - [64, 128, 2048, 128, 64, 64, 128, 128] + - [64, 9449.0] + - - [64, 128, 1536, 128, 64, 64, 128, 128] + - [75, 10168.0] + - - [128, 128, 64, 6400, 128, 128, 6400, 6400] + - [102, 11282.0] + - - [64, 128, 192, 128, 64, 64, 128, 128] + - [92, 7580.0] + - - [64, 384, 144, 384, 64, 64, 384, 384] + - [75, 13410.0] + - - [64, 512, 48, 512, 64, 64, 512, 512] + - [84, 8589.0] + - - [64, 128, 256, 128, 64, 64, 128, 128] + - [92, 7709.0] + - - [64, 384, 192, 384, 64, 64, 384, 384] + - [75, 10789.0] + - - [512, 1024, 1, 1024, 512, 512, 1024, 1024] + - [69, 12981.0] + - - [512, 2048, 1, 2048, 512, 512, 2048, 2048] + - [98, 13690.0] + - - [128, 128, 49, 1120, 128, 128, 1120, 1120] + - [90, 14304.0] + - - [128, 128, 49, 1064, 128, 128, 1064, 1064] + - [87, 14347.0] + - - [128, 128, 49, 1040, 128, 128, 1040, 1040] + - [90, 14297.0] + - - [128, 128, 64, 600, 128, 128, 600, 600] + - [90, 13113.0] + - - [128, 128, 64, 616, 128, 128, 616, 616] + - [90, 13150.0] + - - [128, 128, 49, 950, 128, 128, 950, 950] + - [76, 14588.0] + - - [128, 128, 49, 972, 128, 128, 972, 972] + - [90, 14070.0] + - - [128, 128, 64, 560, 128, 128, 560, 560] + - [68, 13612.0] + - - [128, 128, 49, 1008, 128, 128, 1008, 1008] + - [63, 14232.0] + - - [128, 128, 64, 532, 128, 128, 532, 532] + - [82, 13282.0] + - - [128, 128, 49, 1080, 128, 128, 1080, 1080] + - [62, 14275.0] + - - [128, 128, 64, 588, 128, 128, 588, 588] + - [91, 12958.0] + - - [128, 128, 49, 1160, 128, 128, 1160, 1160] + - [76, 14305.0] + - - [128, 128, 49, 988, 128, 128, 988, 988] + - [98, 14276.0] + - - [128, 128, 49, 936, 128, 128, 936, 936] + - [82, 14390.0] + - - [512, 1024, 1, 3800, 512, 512, 3800, 3800] + - [63, 13763.0] + - - [512, 1024, 1, 3400, 512, 512, 3400, 3400] + - [77, 13935.0] + - - [512, 1024, 1, 3456, 512, 512, 3456, 3456] + - [63, 13754.0] + - - [512, 1024, 1, 3072, 512, 512, 3072, 3072] + - [69, 13337.0] + - - [2048, 512, 1, 950, 2048, 2048, 950, 950] + - [73, 14166.0] + - - [512, 1024, 1, 3552, 512, 512, 3552, 3552] + - [63, 13811.0] + - - [512, 1024, 1, 3220, 512, 512, 3220, 3220] + - [77, 13674.0] + - - [2048, 512, 1, 850, 2048, 2048, 850, 850] + - [73, 13866.0] + - - [512, 2048, 1, 864, 512, 512, 864, 864] + - [91, 13748.0] + - - [512, 2048, 1, 768, 512, 512, 768, 768] + - [77, 13462.0] + - - [2048, 512, 1, 805, 2048, 2048, 805, 805] + - [73, 13716.0] + - - [512, 1024, 1, 2852, 512, 512, 2852, 2852] + - [77, 13603.0] + - - [512, 2048, 1, 888, 512, 512, 888, 888] + - [87, 14761.0] + - - [2048, 512, 1, 864, 2048, 2048, 864, 864] + - [63, 13698.0] + - - [2048, 512, 1, 768, 2048, 2048, 768, 768] + - [63, 13467.0] + - - [2048, 512, 1, 888, 2048, 2048, 888, 888] + - [59, 14338.0] + - - [2048, 256, 1, 950, 2048, 2048, 950, 950] + - [58, 13052.0] + - - [2048, 512, 1, 713, 2048, 2048, 713, 713] + - [73, 13713.0] + - - [512, 1024, 1, 2688, 512, 512, 2688, 2688] + - [77, 13611.0] + - - [512, 1024, 1, 2640, 512, 512, 2640, 2640] + - [77, 13605.0] + - - [512, 1024, 1, 2904, 512, 512, 2904, 2904] + - [77, 13609.0] + - - [1024, 512, 1, 950, 1024, 1024, 950, 950] + - [77, 12496.0] + - - [512, 2048, 1, 672, 512, 512, 672, 672] + - [87, 14595.0] + - - [512, 2048, 1, 660, 512, 512, 660, 660] + - [87, 14044.0] + - - [512, 2048, 1, 1008, 512, 512, 1008, 1008] + - [59, 14849.0] + - - [2048, 256, 1, 850, 2048, 2048, 850, 850] + - [91, 12290.0] + - - [2048, 512, 1, 726, 2048, 2048, 726, 726] + - [87, 13766.0] + - - [1024, 512, 1, 850, 1024, 1024, 850, 850] + - [91, 12345.0] + - - [2048, 512, 1, 660, 2048, 2048, 660, 660] + - [73, 13914.0] + - - [2048, 512, 1, 672, 2048, 2048, 672, 672] + - [87, 14121.0] + - - [512, 2048, 1, 840, 512, 512, 840, 840] + - [87, 14612.0] + - - [2048, 512, 1, 1008, 2048, 2048, 1008, 1008] + - [87, 14527.0] + - - [512, 2048, 1, 792, 512, 512, 792, 792] + - [69, 13451.0] + - - [1024, 512, 1, 805, 1024, 1024, 805, 805] + - [91, 12226.0] + - - [512, 2048, 1, 1050, 512, 512, 1050, 1050] + - [73, 14112.0] + - - [2048, 512, 1, 748, 2048, 2048, 748, 748] + - [77, 13454.0] + - - [2048, 256, 1, 864, 2048, 2048, 864, 864] + - [72, 12847.0] + - - [1024, 512, 1, 768, 1024, 1024, 768, 768] + - [63, 12194.0] + - - [1024, 512, 1, 864, 1024, 1024, 864, 864] + - [77, 12548.0] + - - [2048, 512, 1, 875, 2048, 2048, 875, 875] + - [87, 13927.0] + - - [2048, 512, 1, 840, 2048, 2048, 840, 840] + - [59, 14327.0] + - - [2048, 512, 1, 792, 2048, 2048, 792, 792] + - [91, 13469.0] + - - [512, 2048, 1, 736, 512, 512, 736, 736] + - [63, 13544.0] + - - [2048, 256, 1, 888, 2048, 2048, 888, 888] + - [60, 12569.0] + - - [512, 2048, 1, 704, 512, 512, 704, 704] + - [59, 14629.0] + - - [512, 2048, 1, 588, 512, 512, 588, 588] + - [87, 14045.0] + - - [1024, 512, 1, 888, 1024, 1024, 888, 888] + - [77, 12442.0] + - - [512, 2048, 1, 816, 512, 512, 816, 816] + - [59, 14444.0] + - - [1024, 512, 1, 713, 1024, 1024, 713, 713] + - [73, 12074.0] + - - [2048, 512, 1, 736, 2048, 2048, 736, 736] + - [63, 13587.0] + - - [2048, 512, 1, 588, 2048, 2048, 588, 588] + - [59, 13605.0] + - - [2048, 512, 1, 704, 2048, 2048, 704, 704] + - [73, 14497.0] + - - [1024, 512, 1, 660, 1024, 1024, 660, 660] + - [91, 11891.0] + - - [2048, 256, 1, 660, 2048, 2048, 660, 660] + - [58, 12501.0] + - - [2048, 256, 1, 672, 2048, 2048, 672, 672] + - [86, 12719.0] + - - [1024, 512, 1, 672, 1024, 1024, 672, 672] + - [91, 12225.0] + - - [1024, 512, 1, 726, 1024, 1024, 726, 726] + - [72, 12799.0] + - - [512, 2048, 1, 630, 512, 512, 630, 630] + - [59, 13814.0] + - - [512, 2048, 1, 600, 512, 512, 600, 600] + - [59, 13542.0] + - - [2048, 256, 1, 805, 2048, 2048, 805, 805] + - [60, 12326.0] + - - [2048, 256, 1, 713, 2048, 2048, 713, 713] + - [58, 12419.0] + - - [2048, 256, 1, 726, 2048, 2048, 726, 726] + - [60, 12169.0] + - - [320, 1024, 1, 1024, 320, 320, 1024, 1024] + - [97, 10389.0] + - - [1024, 1000, 1, 1024, 1024, 1024, 1024, 1024] + - [83, 13026.0] + - - [320, 1000, 1, 1024, 320, 320, 1024, 1024] + - [81, 9918.0] + - - [128, 128, 49, 1280, 128, 128, 1280, 1280] + - [83, 13560.0] + - - [128, 128, 49, 1360, 128, 128, 1360, 1360] + - [90, 14169.0] + - - [128, 128, 49, 1200, 128, 128, 1200, 1200] + - [90, 14163.0] + - - [128, 128, 49, 1240, 128, 128, 1240, 1240] + - [76, 14009.0] + - - [2304, 256, 1, 704, 2304, 2304, 704, 704] + - [74, 13614.0] + - - [2304, 256, 1, 736, 2304, 2304, 736, 736] + - [73, 13755.0] + - - [2304, 256, 1, 792, 2304, 2304, 792, 792] + - [60, 13870.0] + - - [2304, 256, 1, 748, 2304, 2304, 748, 748] + - [77, 13575.0] + - - [2304, 256, 1, 726, 2304, 2304, 726, 726] + - [60, 13585.0] + - - [2304, 256, 1, 713, 2304, 2304, 713, 713] + - [60, 13479.0] + - - [2304, 256, 1, 768, 2304, 2304, 768, 768] + - [83, 13522.0] + - - [512, 2048, 1, 759, 512, 512, 759, 759] + - [73, 14116.0] + - - [512, 2048, 1, 925, 512, 512, 925, 925] + - [87, 14314.0] + - - [2304, 256, 1, 805, 2304, 2304, 805, 805] + - [60, 13644.0] + - - [512, 2048, 1, 900, 512, 512, 900, 900] + - [86, 13809.0] + - - [512, 2048, 1, 875, 512, 512, 875, 875] + - [73, 14435.0] + - - [512, 2048, 1, 748, 512, 512, 748, 748] + - [77, 13417.0] + - - [512, 2048, 1, 726, 512, 512, 726, 726] + - [73, 14391.0] + - - [512, 2048, 1, 713, 512, 512, 713, 713] + - [87, 13938.0] + - - [512, 2048, 1, 805, 512, 512, 805, 805] + - [87, 14139.0] + - - [512, 2048, 1, 850, 512, 512, 850, 850] + - [87, 14329.0] + - - [512, 2048, 1, 950, 512, 512, 950, 950] + - [97, 15842.0] + - - [128, 128, 49, 1152, 128, 128, 1152, 1152] + - [90, 14193.0] + - - [128, 128, 49, 1216, 128, 128, 1216, 1216] + - [62, 14214.0] + - - [128, 128, 36, 1800, 128, 128, 1800, 1800] + - [63, 14941.0] + - - [128, 128, 36, 1900, 128, 128, 1900, 1900] + - [91, 14615.0] + - - [128, 128, 64, 5880, 128, 128, 5880, 5880] + - [91, 14033.0] + - - [128, 128, 49, 7680, 128, 128, 7680, 7680] + - [85, 11112.0] + - - [128, 128, 64, 882, 128, 128, 882, 882] + - [63, 14009.0] + - - [128, 128, 64, 931, 128, 128, 931, 931] + - [63, 14006.0] + - - [128, 64, 121, 1152, 128, 128, 1152, 1152] + - [98, 11631.0] + - - [128, 64, 81, 12000, 128, 128, 12000, 12000] + - [68, 11096.0] + - - [128, 64, 121, 1216, 128, 128, 1216, 1216] + - [62, 13363.0] + - - [128, 64, 81, 1800, 128, 128, 1800, 1800] + - [61, 12594.0] + - - [128, 64, 81, 1900, 128, 128, 1900, 1900] + - [76, 12358.0] + - - [128, 64, 49, 20280, 128, 128, 20280, 20280] + - [62, 10963.0] + - - [128, 64, 49, 3042, 128, 128, 3042, 3042] + - [62, 12138.0] + - - [128, 64, 49, 3211, 128, 128, 3211, 3211] + - [90, 12183.0] + - - [128, 64, 169, 5880, 128, 128, 5880, 5880] + - [101, 10652.0] + - - [128, 64, 121, 7680, 128, 128, 7680, 7680] + - [101, 9047.0] + - - [128, 64, 169, 882, 128, 128, 882, 882] + - [71, 10196.0] + - - [128, 64, 169, 931, 128, 128, 931, 931] + - [93, 10168.0] + - - [256, 128, 25, 1080, 256, 256, 1080, 1080] + - [62, 15113.0] + - - [256, 128, 25, 162, 256, 256, 162, 162] + - [98, 9203.0] + - - [256, 128, 25, 171, 256, 256, 171, 171] + - [74, 10548.0] + - - [1152, 256, 1, 1, 1152, 1152, 1, 1] + - [62, 134.0] + - - [1152, 256, 1, 1444, 1152, 1152, 1444, 1444] + - [63, 11882.0] + - - [1152, 256, 1, 25, 1152, 1152, 25, 25] + - [101, 2333.0] + - - [1152, 256, 1, 9, 1152, 1152, 9, 9] + - [63, 1097.0] + - - [2304, 256, 1, 1444, 2304, 2304, 1444, 1444] + - [77, 14624.0] + - - [2304, 340, 1, 1, 2304, 2304, 1, 1] + - [77, 98.0] + - - [2304, 340, 1, 1444, 2304, 2304, 1444, 1444] + - [77, 13368.0] + - - [2304, 340, 1, 9, 2304, 2304, 9, 9] + - [93, 1330.0] + - - [2304, 510, 1, 25, 2304, 2304, 25, 25] + - [79, 4663.0] + - - [96, 1024, 160, 1024, 96, 96, 1024, 1024] + - [69, 13430.0] + - - [96, 1024, 40, 1024, 96, 96, 1024, 1024] + - [69, 13306.0] + - - [96, 1024, 80, 1024, 96, 96, 1024, 1024] + - [83, 13091.0] + - - [96, 1024, 96, 1024, 96, 96, 1024, 1024] + - [69, 13298.0] + - - [96, 1024, 24, 1024, 96, 96, 1024, 1024] + - [83, 12819.0] + - - [96, 1024, 48, 1024, 96, 96, 1024, 1024] + - [69, 12997.0] + - - [96, 1024, 16, 1024, 96, 96, 1024, 1024] + - [69, 12196.0] + - - [96, 1024, 32, 1024, 96, 96, 1024, 1024] + - [99, 13033.0] + - - [64, 512, 320, 512, 64, 64, 512, 512] + - [100, 9912.0] + - - [64, 512, 80, 512, 64, 64, 512, 512] + - [84, 9287.0] + - - [29000, 109, 1, 2560, 29000, 29000, 2560, 2560] + - [69, 14477.0] + - - [29000, 121, 1, 2560, 29000, 29000, 2560, 2560] + - [69, 15960.0] + - - [29000, 65, 1, 2560, 29000, 29000, 2560, 2560] + - [69, 8780.0] + - - [29000, 66, 1, 2560, 29000, 29000, 2560, 2560] + - [99, 8922.0] + - - [29000, 67, 1, 2560, 29000, 29000, 2560, 2560] + - [69, 9056.0] + - - [29000, 69, 1, 2560, 29000, 29000, 2560, 2560] + - [83, 9317.0] + - - [29000, 70, 1, 2560, 29000, 29000, 2560, 2560] + - [69, 9455.0] + - - [29000, 71, 1, 2560, 29000, 29000, 2560, 2560] + - [99, 9602.0] + - - [29000, 73, 1, 2560, 29000, 29000, 2560, 2560] + - [69, 9860.0] + - - [29000, 74, 1, 2560, 29000, 29000, 2560, 2560] + - [69, 9986.0] + - - [29000, 75, 1, 2560, 29000, 29000, 2560, 2560] + - [83, 10110.0] + - - [29000, 77, 1, 2560, 29000, 29000, 2560, 2560] + - [99, 10393.0] + - - [29000, 78, 1, 2560, 29000, 29000, 2560, 2560] + - [99, 10517.0] + - - [29000, 80, 1, 2560, 29000, 29000, 2560, 2560] + - [69, 10805.0] + - - [29000, 81, 1, 2560, 29000, 29000, 2560, 2560] + - [99, 10911.0] + - - [29000, 82, 1, 2560, 29000, 29000, 2560, 2560] + - [69, 11037.0] + - - [29000, 83, 1, 2560, 29000, 29000, 2560, 2560] + - [69, 11158.0] + - - [29000, 84, 1, 2560, 29000, 29000, 2560, 2560] + - [83, 11296.0] + - - [29000, 88, 1, 2560, 29000, 29000, 2560, 2560] + - [69, 11816.0] + - - [29000, 89, 1, 2560, 29000, 29000, 2560, 2560] + - [83, 11933.0] + - - [29000, 90, 1, 2560, 29000, 29000, 2560, 2560] + - [69, 12067.0] + - - [29000, 92, 1, 2560, 29000, 29000, 2560, 2560] + - [99, 12302.0] + - - [29000, 95, 1, 2560, 29000, 29000, 2560, 2560] + - [69, 12695.0] + - - [29000, 98, 1, 2560, 29000, 29000, 2560, 2560] + - [69, 13091.0] + - - [64, 1024, 512, 1024, 64, 64, 1024, 1024] + - [70, 9951.0] + - - [64, 64, 36, 50176, 64, 64, 50176, 50176] + - [84, 7183.0] + - - [64, 64, 49, 36864, 64, 64, 36864, 36864] + - [112, 6154.0] + - - [64, 64, 64, 25600, 64, 64, 25600, 25600] + - [84, 7715.0] + - - [256, 256, 1, 60800, 256, 256, 60800, 60800] + - [103, 12472.0] + - - [256, 256, 1, 54400, 256, 256, 54400, 54400] + - [107, 12357.0] + - - [256, 256, 1, 51520, 256, 256, 51520, 51520] + - [107, 12350.0] + - - [256, 256, 1, 55296, 256, 256, 55296, 55296] + - [113, 9756.0] + - - [256, 256, 1, 56832, 256, 256, 56832, 56832] + - [106, 11006.0] + - - [256, 256, 1, 45632, 256, 256, 45632, 45632] + - [110, 12244.0] + - - [256, 256, 1, 49152, 256, 256, 49152, 49152] + - [113, 9700.0] + - - [256, 512, 1, 13600, 256, 256, 13600, 13600] + - [107, 12644.0] + - - [256, 256, 1, 43008, 256, 256, 43008, 43008] + - [113, 9634.0] + - - [256, 512, 1, 15200, 256, 256, 15200, 15200] + - [103, 13052.0] + - - [256, 512, 1, 12880, 256, 256, 12880, 12880] + - [107, 12585.0] + - - [256, 512, 1, 13824, 256, 256, 13824, 13824] + - [111, 12104.0] + - - [512, 256, 1, 13824, 512, 512, 13824, 13824] + - [105, 12060.0] + - - [256, 512, 1, 14208, 256, 256, 14208, 14208] + - [108, 12783.0] + - - [512, 256, 1, 14208, 512, 512, 14208, 14208] + - [103, 12722.0] + - - [512, 256, 1, 15200, 512, 512, 15200, 15200] + - [110, 12816.0] + - - [256, 512, 1, 12288, 256, 256, 12288, 12288] + - [108, 11197.0] + - - [512, 256, 1, 12288, 512, 512, 12288, 12288] + - [108, 10626.0] + - - [128, 64, 25, 43320, 128, 128, 43320, 43320] + - [106, 12458.0] + - - [64, 64, 64, 20280, 64, 64, 20280, 20280] + - [104, 9332.0] + - - [64, 64, 49, 27000, 64, 64, 27000, 27000] + - [104, 9253.0] + - - [64, 64, 36, 43320, 64, 64, 43320, 43320] + - [109, 9274.0] + - - [32, 5056, 1, 1280, 32, 32, 1280, 1280] + - [134, 4170.0] + - - [4288, 64, 1, 3328, 4288, 4288, 3328, 3328] + - [119, 5797.0] + - - [2368, 64, 1, 1, 2368, 2368, 1, 1] + - [144, 52.0] + - - [1408, 128, 1, 32, 1408, 1408, 32, 32] + - [143, 1254.0] + - - [32, 2944, 1, 3328, 32, 32, 3328, 3328] + - [118, 3832.0] + - - [2368, 32, 1, 256, 2368, 2368, 256, 256] + - [155, 2395.0] + - - [1024, 128, 1, 32, 1024, 1024, 32, 32] + - [167, 958.0] + - - [32, 4288, 1, 1280, 32, 32, 1280, 1280] + - [118, 3858.0] + - - [32, 5056, 1, 32, 32, 32, 32, 32] + - [180, 1171.0] + - - [5888, 32, 1, 32, 5888, 5888, 32, 32] + - [170, 1256.0] + - - [64, 2368, 1, 1280, 64, 64, 1280, 1280] + - [169, 5416.0] + - - [128, 704, 1, 32, 128, 128, 32, 32] + - [118, 680.0] + - - [32, 4288, 1, 3328, 32, 32, 3328, 3328] + - [144, 4129.0] + - - [1408, 64, 1, 1, 1408, 1408, 1, 1] + - [118, 24.0] + - - [1856, 64, 1, 256, 1856, 1856, 256, 256] + - [179, 4154.0] + - - [1024, 256, 1, 256, 1024, 1024, 256, 256] + - [119, 5115.0] + - - [1856, 128, 1, 32, 1856, 1856, 32, 32] + - [145, 1590.0] + - - [1856, 128, 1, 1280, 1856, 1856, 1280, 1280] + - [145, 6354.0] + - - [4288, 32, 1, 3328, 4288, 4288, 3328, 3328] + - [118, 4224.0] + - - [3584, 64, 1, 1280, 3584, 3584, 1280, 1280] + - [181, 6024.0] + - - [64, 1856, 1, 256, 64, 64, 256, 256] + - [143, 3386.0] + - - [3584, 64, 1, 32, 3584, 3584, 32, 32] + - [151, 1739.0] + - - [1408, 128, 1, 3328, 1408, 1408, 3328, 3328] + - [143, 5193.0] + - - [32, 6784, 1, 3328, 32, 32, 3328, 3328] + - [144, 4324.0] + - - [32, 3584, 1, 256, 32, 32, 256, 256] + - [128, 2984.0] + - - [704, 256, 1, 32, 704, 704, 32, 32] + - [117, 1232.0] + - - [64, 2944, 1, 3328, 64, 64, 3328, 3328] + - [167, 5476.0] + - - [64, 4288, 1, 3328, 64, 64, 3328, 3328] + - [181, 5699.0] + - - [256, 704, 1, 256, 256, 256, 256, 256] + - [117, 3800.0] + - - [5056, 32, 1, 3328, 5056, 5056, 3328, 3328] + - [143, 4553.0] + - - [2944, 32, 1, 1280, 2944, 2944, 1280, 1280] + - [180, 3516.0] + - - [64, 1408, 1, 3328, 64, 64, 3328, 3328] + - [167, 4053.0] + - - [256, 448, 1, 1280, 256, 256, 1280, 1280] + - [143, 4913.0] + - - [448, 448, 1, 256, 448, 448, 256, 256] + - [155, 4150.0] + - - [1024, 256, 1, 1, 1024, 1024, 1, 1] + - [167, 61.0] + - - [1856, 64, 1, 32, 1856, 1856, 32, 32] + - [140, 841.0] + - - [4288, 64, 1, 256, 4288, 4288, 256, 256] + - [119, 4375.0] + - - [1408, 64, 1, 256, 1408, 1408, 256, 256] + - [179, 2518.0] + - - [6784, 32, 1, 32, 6784, 6784, 32, 32] + - [128, 1384.0] + - - [448, 448, 1, 1280, 448, 448, 1280, 1280] + - [157, 5269.0] + - - [32, 5888, 1, 256, 32, 32, 256, 256] + - [134, 3203.0] + - - [1024, 128, 1, 256, 1024, 1024, 256, 256] + - [153, 3562.0] + - - [1856, 64, 1, 1280, 1856, 1856, 1280, 1280] + - [143, 4737.0] + - - [128, 1408, 1, 1, 128, 128, 1, 1] + - [138, 49.0] + - - [32, 2368, 1, 1280, 32, 32, 1280, 1280] + - [159, 4079.0] + - - [448, 256, 1, 256, 448, 448, 256, 256] + - [177, 3234.0] + - - [2944, 32, 1, 32, 2944, 2944, 32, 32] + - [170, 1370.0] + - - [448, 448, 1, 32, 448, 448, 32, 32] + - [121, 2261.0] + - - [704, 256, 1, 3328, 704, 704, 3328, 3328] + - [143, 5156.0] + - - [64, 2944, 1, 1, 64, 64, 1, 1] + - [179, 46.0] + - - [64, 2944, 1, 32, 64, 64, 32, 32] + - [130, 1299.0] + - - [64, 2944, 1, 1280, 64, 64, 1280, 1280] + - [117, 5200.0] + - - [32, 3584, 1, 1280, 32, 32, 1280, 1280] + - [134, 3700.0] + - - [32, 2944, 1, 32, 32, 32, 32, 32] + - [114, 704.0] + - - [32, 6784, 1, 256, 32, 32, 256, 256] + - [131, 3224.0] + - - [448, 448, 1, 3328, 448, 448, 3328, 3328] + - [145, 5552.0] + - - [704, 128, 1, 1280, 704, 704, 1280, 1280] + - [165, 3840.0] + - - [32, 3584, 1, 3328, 32, 32, 3328, 3328] + - [144, 3996.0] + - - [128, 704, 1, 1280, 128, 128, 1280, 1280] + - [167, 3972.0] + - - [64, 4288, 1, 1, 64, 64, 1, 1] + - [119, 67.0] + - - [3584, 32, 1, 32, 3584, 3584, 32, 32] + - [116, 842.0] + - - [3584, 64, 1, 1, 3584, 3584, 1, 1] + - [145, 54.0] + - - [32, 4288, 1, 32, 32, 32, 32, 32] + - [115, 1012.0] + - - [64, 1408, 1, 1, 64, 64, 1, 1] + - [114, 23.0] + - - [256, 1024, 1, 256, 256, 256, 256, 256] + - [177, 4444.0] + - - [1408, 128, 1, 1280, 1408, 1408, 1280, 1280] + - [143, 4894.0] + - - [64, 4288, 1, 1280, 64, 64, 1280, 1280] + - [167, 5626.0] + - - [64, 3584, 1, 1, 64, 64, 1, 1] + - [155, 56.0] + - - [1024, 128, 1, 1280, 1024, 1024, 1280, 1280] + - [157, 5124.0] + - - [2368, 32, 1, 32, 2368, 2368, 32, 32] + - [168, 1276.0] + - - [128, 1408, 1, 256, 128, 128, 256, 256] + - [167, 3776.0] + - - [256, 448, 1, 3328, 256, 256, 3328, 3328] + - [167, 5511.0] + - - [2368, 64, 1, 256, 2368, 2368, 256, 256] + - [145, 3781.0] + - - [32, 2368, 1, 3328, 32, 32, 3328, 3328] + - [159, 3823.0] + - - [128, 1856, 1, 1, 128, 128, 1, 1] + - [145, 59.0] + - - [128, 1856, 1, 32, 128, 128, 32, 32] + - [157, 1571.0] + - - [3584, 32, 1, 256, 3584, 3584, 256, 256] + - [177, 2890.0] + - - [64, 3584, 1, 256, 64, 64, 256, 256] + - [181, 4415.0] + - - [32, 2944, 1, 1280, 32, 32, 1280, 1280] + - [171, 4261.0] + - - [4288, 32, 1, 32, 4288, 4288, 32, 32] + - [117, 967.0] + - - [1856, 64, 1, 1, 1856, 1856, 1, 1] + - [166, 30.0] + - - [128, 1024, 1, 3328, 128, 128, 3328, 3328] + - [157, 5202.0] + - - [1408, 128, 1, 1, 1408, 1408, 1, 1] + - [179, 46.0] + - - [5056, 32, 1, 256, 5056, 5056, 256, 256] + - [130, 3406.0] + - - [64, 1408, 1, 1280, 64, 64, 1280, 1280] + - [143, 3752.0] + - - [3584, 32, 1, 1280, 3584, 3584, 1280, 1280] + - [130, 3886.0] + - - [1856, 128, 1, 3328, 1856, 1856, 3328, 3328] + - [145, 6386.0] + - - [704, 256, 1, 1280, 704, 704, 1280, 1280] + - [167, 5015.0] + - - [1856, 128, 1, 1, 1856, 1856, 1, 1] + - [119, 61.0] + - - [256, 704, 1, 1, 256, 256, 1, 1] + - [118, 43.0] + - - [1024, 128, 1, 1, 1024, 1024, 1, 1] + - [116, 32.0] + - - [1856, 128, 1, 256, 1856, 1856, 256, 256] + - [181, 4643.0] + - - [1024, 256, 1, 1280, 1024, 1024, 1280, 1280] + - [179, 5410.0] + - - [64, 2368, 1, 32, 64, 64, 32, 32] + - [118, 1107.0] + - - [32, 2368, 1, 256, 32, 32, 256, 256] + - [147, 2895.0] + - - [32, 6784, 1, 1280, 32, 32, 1280, 1280] + - [147, 3914.0] + - - [32, 6784, 1, 32, 32, 32, 32, 32] + - [141, 1352.0] + - - [64, 3584, 1, 3328, 64, 64, 3328, 3328] + - [181, 6113.0] + - - [32, 5888, 1, 1280, 32, 32, 1280, 1280] + - [159, 4047.0] + - - [448, 256, 1, 1, 448, 448, 1, 1] + - [173, 58.0] + - - [448, 256, 1, 3328, 448, 448, 3328, 3328] + - [143, 5326.0] + - - [128, 704, 1, 3328, 128, 128, 3328, 3328] + - [115, 4410.0] + - - [2368, 32, 1, 3328, 2368, 2368, 3328, 3328] + - [117, 3986.0] + - - [2944, 64, 1, 3328, 2944, 2944, 3328, 3328] + - [167, 5253.0] + - - [128, 1024, 1, 32, 128, 128, 32, 32] + - [131, 975.0] + - - [32, 2368, 1, 32, 32, 32, 32, 32] + - [176, 1144.0] + - - [64, 1856, 1, 1280, 64, 64, 1280, 1280] + - [167, 5354.0] + - - [32, 3584, 1, 32, 32, 32, 32, 32] + - [154, 823.0] + - - [704, 256, 1, 1, 704, 704, 1, 1] + - [174, 54.0] + - - [1024, 256, 1, 3328, 1024, 1024, 3328, 3328] + - [157, 5575.0] + - - [128, 1856, 1, 1280, 128, 128, 1280, 1280] + - [132, 6126.0] + - - [448, 256, 1, 32, 448, 448, 32, 32] + - [168, 878.0] + - - [64, 4288, 1, 32, 64, 64, 32, 32] + - [181, 2987.0] + - - [128, 704, 1, 1, 128, 128, 1, 1] + - [172, 54.0] + - - [4288, 64, 1, 1280, 4288, 4288, 1280, 1280] + - [119, 5738.0] + - - [448, 448, 1, 1, 448, 448, 1, 1] + - [119, 115.0] + - - [32, 5888, 1, 32, 32, 32, 32, 32] + - [141, 1261.0] + - - [1024, 128, 1, 3328, 1024, 1024, 3328, 3328] + - [145, 5219.0] + - - [4288, 64, 1, 32, 4288, 4288, 32, 32] + - [145, 1756.0] + - - [2368, 64, 1, 32, 2368, 2368, 32, 32] + - [117, 1128.0] + - - [64, 1408, 1, 32, 64, 64, 32, 32] + - [118, 693.0] + - - [32, 2944, 1, 256, 32, 32, 256, 256] + - [153, 2633.0] + - - [2944, 64, 1, 1, 2944, 2944, 1, 1] + - [181, 94.0] + - - [2944, 64, 1, 32, 2944, 2944, 32, 32] + - [181, 2301.0] + - - [64, 2944, 1, 256, 64, 64, 256, 256] + - [117, 4963.0] + - - [64, 2368, 1, 256, 64, 64, 256, 256] + - [145, 4911.0] + - - [1408, 64, 1, 3328, 1408, 1408, 3328, 3328] + - [141, 4157.0] + - - [6784, 32, 1, 1280, 6784, 6784, 1280, 1280] + - [130, 4118.0] + - - [2944, 64, 1, 1280, 2944, 2944, 1280, 1280] + - [143, 4878.0] + - - [2944, 32, 1, 256, 2944, 2944, 256, 256] + - [128, 2560.0] + - - [256, 1024, 1, 3328, 256, 256, 3328, 3328] + - [132, 5522.0] + - - [1856, 64, 1, 3328, 1856, 1856, 3328, 3328] + - [171, 6696.0] + - - [5888, 32, 1, 256, 5888, 5888, 256, 256] + - [155, 3480.0] + - - [128, 704, 1, 256, 128, 128, 256, 256] + - [128, 2701.0] + - - [3584, 64, 1, 256, 3584, 3584, 256, 256] + - [132, 4382.0] + - - [64, 1856, 1, 32, 64, 64, 32, 32] + - [114, 1744.0] + - - [64, 1856, 1, 3328, 64, 64, 3328, 3328] + - [167, 5378.0] + - - [5888, 32, 1, 1280, 5888, 5888, 1280, 1280] + - [130, 4212.0] + - - [256, 704, 1, 32, 256, 256, 32, 32] + - [136, 1897.0] + - - [256, 704, 1, 1280, 256, 256, 1280, 1280] + - [117, 5081.0] + - - [1408, 64, 1, 32, 1408, 1408, 32, 32] + - [176, 690.0] + - - [128, 1408, 1, 1280, 128, 128, 1280, 1280] + - [143, 5054.0] + - - [128, 1856, 1, 3328, 128, 128, 3328, 3328] + - [132, 6363.0] + - - [2368, 64, 1, 3328, 2368, 2368, 3328, 3328] + - [119, 5827.0] + - - [32, 5056, 1, 3328, 32, 32, 3328, 3328] + - [118, 4437.0] + - - [64, 1856, 1, 1, 64, 64, 1, 1] + - [119, 29.0] + - - [704, 128, 1, 32, 704, 704, 32, 32] + - [114, 717.0] + - - [4288, 64, 1, 1, 4288, 4288, 1, 1] + - [179, 66.0] + - - [5056, 32, 1, 1280, 5056, 5056, 1280, 1280] + - [179, 4233.0] + - - [128, 1024, 1, 1, 128, 128, 1, 1] + - [180, 34.0] + - - [256, 1024, 1, 1, 256, 256, 1, 1] + - [167, 62.0] + - - [1408, 64, 1, 1280, 1408, 1408, 1280, 1280] + - [147, 4073.0] + - - [1024, 256, 1, 32, 1024, 1024, 32, 32] + - [165, 2954.0] + - - [2368, 32, 1, 1280, 2368, 2368, 1280, 1280] + - [167, 3870.0] + - - [704, 128, 1, 1, 704, 704, 1, 1] + - [180, 52.0] + - - [256, 448, 1, 256, 256, 256, 256, 256] + - [143, 4330.0] + - - [32, 4288, 1, 256, 32, 32, 256, 256] + - [177, 2922.0] + - - [128, 1408, 1, 32, 128, 128, 32, 32] + - [167, 1232.0] + - - [704, 128, 1, 3328, 704, 704, 3328, 3328] + - [141, 4393.0] + - - [64, 4288, 1, 256, 64, 64, 256, 256] + - [155, 4475.0] + - - [4288, 32, 1, 1280, 4288, 4288, 1280, 1280] + - [168, 3855.0] + - - [32, 5056, 1, 256, 32, 32, 256, 256] + - [159, 3114.0] + - - [704, 128, 1, 256, 704, 704, 256, 256] + - [144, 3258.0] + - - [256, 1024, 1, 32, 256, 256, 32, 32] + - [143, 2975.0] + - - [256, 1024, 1, 1280, 256, 256, 1280, 1280] + - [119, 5619.0] + - - [6784, 32, 1, 256, 6784, 6784, 256, 256] + - [130, 3517.0] + - - [64, 2368, 1, 1, 64, 64, 1, 1] + - [118, 90.0] + - - [1408, 128, 1, 256, 1408, 1408, 256, 256] + - [181, 4369.0] + - - [5888, 32, 1, 3328, 5888, 5888, 3328, 3328] + - [117, 4446.0] + - - [64, 2368, 1, 3328, 64, 64, 3328, 3328] + - [169, 5972.0] + - - [256, 704, 1, 3328, 256, 256, 3328, 3328] + - [117, 5284.0] + - - [128, 1408, 1, 3328, 128, 128, 3328, 3328] + - [167, 5339.0] + - - [2944, 32, 1, 3328, 2944, 2944, 3328, 3328] + - [171, 4648.0] + - - [2368, 64, 1, 1280, 2368, 2368, 1280, 1280] + - [132, 5431.0] + - - [128, 1024, 1, 1280, 128, 128, 1280, 1280] + - [169, 4869.0] + - - [128, 1024, 1, 256, 128, 128, 256, 256] + - [181, 3342.0] + - - [3584, 64, 1, 3328, 3584, 3584, 3328, 3328] + - [157, 6098.0] + - - [256, 448, 1, 1, 256, 256, 1, 1] + - [119, 29.0] + - - [256, 448, 1, 32, 256, 256, 32, 32] + - [168, 838.0] + - - [64, 3584, 1, 32, 64, 64, 32, 32] + - [119, 1562.0] + - - [64, 3584, 1, 1280, 64, 64, 1280, 1280] + - [132, 5793.0] + - - [4288, 32, 1, 256, 4288, 4288, 256, 256] + - [130, 3044.0] + - - [448, 256, 1, 1280, 448, 448, 1280, 1280] + - [155, 4845.0] + - - [128, 1856, 1, 256, 128, 128, 256, 256] + - [157, 4700.0] + - - [3584, 32, 1, 3328, 3584, 3584, 3328, 3328] + - [117, 4347.0] + - - [6784, 32, 1, 3328, 6784, 6784, 3328, 3328] + - [143, 4418.0] + - - [2944, 64, 1, 256, 2944, 2944, 256, 256] + - [155, 3822.0] + - - [64, 1408, 1, 256, 64, 64, 256, 256] + - [177, 2827.0] + - - [5056, 32, 1, 32, 5056, 5056, 32, 32] + - [117, 1092.0] + - - [32, 5888, 1, 3328, 32, 32, 3328, 3328] + - [147, 4199.0] + - - [704, 256, 1, 256, 704, 704, 256, 256] + - [177, 3813.0] + - - [1024, 256, 1, 196, 1024, 1024, 196, 196] + - [165, 4110.0] + - - [256, 1024, 1, 196, 256, 256, 196, 196] + - [165, 4137.0] + - - [1760, 64, 1, 1760, 1760, 1760, 1760, 1760] + - [141, 5123.0] + - - [2560, 32, 1, 2560, 2560, 2560, 2560, 2560] + - [183, 4019.0] + - - [4608, 32, 1, 1536, 4608, 4608, 1536, 1536] + - [130, 3969.0] + - - [3072, 64, 1, 1024, 3072, 3072, 1024, 1024] + - [181, 4898.0] + - - [2048, 128, 1, 2048, 2048, 2048, 2048, 2048] + - [157, 5341.0] + - - [4096, 64, 1, 4096, 4096, 4096, 4096, 4096] + - [132, 5432.0] + - - [7680, 32, 1, 2560, 7680, 7680, 2560, 2560] + - [159, 4271.0] + - - [2560, 64, 1, 2560, 2560, 2560, 2560, 2560] + - [181, 5944.0] + - - [1760, 128, 1, 1760, 1760, 1760, 1760, 1760] + - [145, 6024.0] + - - [3072, 32, 1, 1024, 3072, 3072, 1024, 1024] + - [183, 4295.0] + - - [6144, 32, 1, 2560, 6144, 6144, 2560, 2560] + - [134, 4073.0] + - - [4096, 32, 1, 4096, 4096, 4096, 4096, 4096] + - [183, 3715.0] + - - [2048, 64, 1, 2048, 2048, 2048, 2048, 2048] + - [132, 4644.0] + - - [8448, 32, 1, 2816, 8448, 8448, 2816, 2816] + - [143, 4562.0] + - - [512, 512, 1, 512, 512, 512, 512, 512] + - [130, 4863.0] + - - [511, 512, 1, 512, 511, 511, 512, 512] + - [132, 4836.0] + - - [512, 512, 1, 511, 512, 512, 511, 511] + - [143, 5002.0] + - - [512, 513, 1, 512, 512, 512, 512, 512] + - [130, 4776.0] + - - [512, 511, 1, 512, 512, 512, 512, 512] + - [132, 4819.0] + - - [513, 512, 1, 512, 513, 513, 512, 512] + - [132, 4275.0] + - - [512, 512, 1, 513, 512, 512, 513, 513] + - [169, 4919.0] + - - [512, 512, 1, 64, 512, 512, 64, 64] + - [167, 2605.0] + - - [33, 33, 1600, 32, 33, 33, 32, 32] + - [164, 1917.0] + - - [256, 684, 1, 1024, 256, 256, 1024, 1024] + - [181, 4458.0] + - - [1024, 200, 1, 560, 1024, 1024, 560, 560] + - [115, 4625.0] + - - [2048, 114, 1, 512, 2048, 2048, 512, 512] + - [132, 4363.0] + - - [2048, 114, 1, 768, 2048, 2048, 768, 768] + - [132, 4558.0] + - - [32, 32, 4608, 64, 32, 32, 64, 64] + - [144, 4669.0] + - - [32, 35, 4608, 64, 32, 32, 64, 64] + - [177, 3478.0] + - - [34, 34, 4736, 64, 34, 34, 64, 64] + - [177, 2249.0] + - - [35, 35, 4608, 64, 35, 35, 64, 64] + - [177, 2292.0] + - - [33, 33, 1920, 64, 33, 33, 64, 64] + - [115, 2164.0] + - - [480, 512, 1, 512, 480, 480, 512, 512] + - [132, 4556.0] + - - [512, 480, 1, 512, 512, 512, 512, 512] + - [130, 5070.0] + - - [1024, 200, 1, 1024, 1024, 1024, 1024, 1024] + - [179, 4603.0] + - - [1024, 308, 1, 1024, 1024, 1024, 1024, 1024] + - [132, 6254.0] + - - [1024, 160, 1, 1024, 1024, 1024, 1024, 1024] + - [155, 5075.0] + - - [1024, 180, 1, 1024, 1024, 1024, 1024, 1024] + - [132, 4805.0] + - - [128, 864, 1, 256, 128, 128, 256, 256] + - [177, 3104.0] + - - [256, 864, 1, 512, 256, 256, 512, 512] + - [132, 5111.0] + - - [1152, 128, 1, 784, 1152, 1152, 784, 784] + - [119, 5120.0] + - - [256, 512, 1, 784, 256, 256, 784, 784] + - [145, 4658.0] + - - [512, 256, 1, 784, 512, 512, 784, 784] + - [141, 4679.0] + - - [1024, 128, 1, 1024, 1024, 1024, 1024, 1024] + - [132, 4587.0] + - - [1024, 96, 1, 1024, 1024, 1024, 1024, 1024] + - [177, 4004.0] + - - [1024, 256, 1, 3800, 1024, 1024, 3800, 3800] + - [165, 5598.0] + - - [1024, 256, 1, 3400, 1024, 1024, 3400, 3400] + - [115, 5742.0] + - - [256, 1024, 1, 3400, 256, 256, 3400, 3400] + - [165, 5809.0] + - - [1024, 256, 1, 3220, 1024, 1024, 3220, 3220] + - [147, 7073.0] + - - [256, 1024, 1, 3220, 256, 256, 3220, 3220] + - [141, 5765.0] + - - [1024, 256, 1, 3456, 1024, 1024, 3456, 3456] + - [165, 5801.0] + - - [256, 1024, 1, 3456, 256, 256, 3456, 3456] + - [141, 5843.0] + - - [1024, 256, 1, 3072, 1024, 1024, 3072, 3072] + - [132, 5374.0] + - - [256, 1024, 1, 3072, 256, 256, 3072, 3072] + - [132, 5449.0] + - - [1024, 256, 1, 3552, 1024, 1024, 3552, 3552] + - [169, 5569.0] + - - [256, 1024, 1, 3552, 256, 256, 3552, 3552] + - [117, 5722.0] + - - [256, 1024, 1, 2852, 256, 256, 2852, 2852] + - [143, 5622.0] + - - [1024, 256, 1, 2852, 1024, 1024, 2852, 2852] + - [115, 5541.0] + - - [256, 512, 1, 10752, 256, 256, 10752, 10752] + - [132, 4965.0] + - - [256, 1024, 1, 3800, 256, 256, 3800, 3800] + - [141, 5845.0] + - - [256, 512, 1, 10560, 256, 256, 10560, 10560] + - [141, 6661.0] + - - [256, 1024, 1, 2992, 256, 256, 2992, 2992] + - [143, 5619.0] + - - [256, 1024, 1, 2688, 256, 256, 2688, 2688] + - [141, 5897.0] + - - [1024, 256, 1, 2688, 1024, 1024, 2688, 2688] + - [165, 5897.0] + - - [256, 1024, 1, 2904, 256, 256, 2904, 2904] + - [141, 5830.0] + - - [1024, 256, 1, 2904, 1024, 1024, 2904, 2904] + - [115, 5655.0] + - - [256, 1024, 1, 2640, 256, 256, 2640, 2640] + - [165, 5814.0] + - - [1024, 256, 1, 2640, 1024, 1024, 2640, 2640] + - [115, 5791.0] + - - [1024, 256, 1, 4032, 1024, 1024, 4032, 4032] + - [153, 5939.0] + - - [1024, 256, 1, 2992, 1024, 1024, 2992, 2992] + - [115, 5832.0] + - - [256, 1024, 1, 3360, 256, 256, 3360, 3360] + - [117, 5719.0] + - - [1024, 256, 1, 3360, 1024, 1024, 3360, 3360] + - [115, 5737.0] + - - [1024, 256, 1, 3500, 1024, 1024, 3500, 3500] + - [177, 5630.0] + - - [256, 1024, 1, 3500, 256, 256, 3500, 3500] + - [165, 5786.0] + - - [1024, 256, 1, 3168, 1024, 1024, 3168, 3168] + - [167, 5676.0] + - - [256, 1024, 1, 3168, 256, 256, 3168, 3168] + - [165, 5768.0] + - - [256, 1024, 1, 3036, 256, 256, 3036, 3036] + - [141, 5843.0] + - - [1024, 256, 1, 4200, 1024, 1024, 4200, 4200] + - [177, 5684.0] + - - [1024, 256, 1, 3600, 1024, 1024, 3600, 3600] + - [167, 5612.0] + - - [256, 1024, 1, 3600, 256, 256, 3600, 3600] + - [143, 5663.0] + - - [256, 1024, 1, 2944, 256, 256, 2944, 2944] + - [134, 7198.0] + - - [1024, 256, 1, 2944, 1024, 1024, 2944, 2944] + - [165, 5821.0] + - - [1024, 256, 1, 3700, 1024, 1024, 3700, 3700] + - [177, 5771.0] + - - [256, 1024, 1, 2352, 256, 256, 2352, 2352] + - [165, 5707.0] + - - [1024, 256, 1, 2352, 1024, 1024, 2352, 2352] + - [165, 5550.0] + - - [1024, 256, 1, 2816, 1024, 1024, 2816, 2816] + - [169, 5525.0] + - - [256, 1024, 1, 3700, 256, 256, 3700, 3700] + - [141, 5877.0] + - - [256, 1024, 1, 2816, 256, 256, 2816, 2816] + - [119, 5511.0] + - - [256, 512, 1, 11408, 256, 256, 11408, 11408] + - [119, 5543.0] + - - [1024, 256, 1, 3036, 1024, 1024, 3036, 3036] + - [177, 5686.0] + - - [1024, 256, 1, 3264, 1024, 1024, 3264, 3264] + - [167, 5731.0] + - - [256, 1024, 1, 3264, 256, 256, 3264, 3264] + - [117, 5823.0] + - - [1024, 256, 1, 3864, 1024, 1024, 3864, 3864] + - [115, 5741.0] + - - [256, 1024, 1, 4032, 256, 256, 4032, 4032] + - [165, 5927.0] + - - [1024, 256, 1, 3128, 1024, 1024, 3128, 3128] + - [115, 5696.0] + - - [256, 1024, 1, 3128, 256, 256, 3128, 3128] + - [141, 5637.0] + - - [256, 1024, 1, 3200, 256, 256, 3200, 3200] + - [141, 5825.0] + - - [256, 512, 1, 11616, 256, 256, 11616, 11616] + - [128, 5493.0] + - - [1024, 256, 1, 3200, 1024, 1024, 3200, 3200] + - [167, 5725.0] + - - [1024, 256, 1, 4000, 1024, 1024, 4000, 4000] + - [141, 5920.0] + - - [256, 1024, 1, 2520, 256, 256, 2520, 2520] + - [141, 5824.0] + - - [1024, 256, 1, 2520, 1024, 1024, 2520, 2520] + - [115, 5565.0] + - - [256, 1024, 1, 2976, 256, 256, 2976, 2976] + - [143, 5693.0] + - - [256, 1024, 1, 2400, 256, 256, 2400, 2400] + - [141, 5841.0] + - - [1024, 256, 1, 2400, 1024, 1024, 2400, 2400] + - [143, 5630.0] + - - [1024, 256, 1, 3696, 1024, 1024, 3696, 3696] + - [115, 5881.0] + - - [1024, 256, 1, 3900, 1024, 1024, 3900, 3900] + - [141, 5672.0] + - - [1024, 256, 1, 3772, 1024, 1024, 3772, 3772] + - [153, 5823.0] + - - [256, 1024, 1, 3696, 256, 256, 3696, 3696] + - [143, 5669.0] + - - [256, 1024, 1, 2728, 256, 256, 2728, 2728] + - [141, 5859.0] + - - [1024, 256, 1, 2728, 1024, 1024, 2728, 2728] + - [177, 5649.0] + - - [1024, 256, 1, 2480, 1024, 1024, 2480, 2480] + - [115, 5806.0] + - - [256, 1024, 1, 2480, 256, 256, 2480, 2480] + - [165, 5812.0] + - - [1024, 256, 1, 2880, 1024, 1024, 2880, 2880] + - [167, 5719.0] + - - [512, 256, 1, 3220, 512, 512, 3220, 3220] + - [165, 6479.0] + - - [256, 1024, 1, 2880, 256, 256, 2880, 2880] + - [117, 5721.0] + - - [256, 1024, 1, 4200, 256, 256, 4200, 4200] + - [128, 5785.0] + - - [1024, 256, 1, 3648, 1024, 1024, 3648, 3648] + - [167, 5740.0] + - - [1024, 256, 1, 3312, 1024, 1024, 3312, 3312] + - [167, 5587.0] + - - [256, 1024, 1, 3648, 256, 256, 3648, 3648] + - [117, 5749.0] + - - [1024, 256, 1, 3300, 1024, 1024, 3300, 3300] + - [165, 5564.0] + - - [1024, 256, 1, 3528, 1024, 1024, 3528, 3528] + - [165, 5653.0] + - - [256, 1024, 1, 2604, 256, 256, 2604, 2604] + - [141, 5815.0] + - - [1024, 256, 1, 2604, 1024, 1024, 2604, 2604] + - [177, 5837.0] + - - [512, 256, 1, 11408, 512, 512, 11408, 11408] + - [145, 5416.0] + - - [256, 1024, 1, 3312, 256, 256, 3312, 3312] + - [141, 5735.0] + - - [256, 1024, 1, 3300, 256, 256, 3300, 3300] + - [165, 5632.0] + - - [512, 256, 1, 3072, 512, 512, 3072, 3072] + - [181, 4921.0] + - - [256, 1024, 1, 3528, 256, 256, 3528, 3528] + - [115, 5594.0] + - - [1024, 256, 1, 2976, 1024, 1024, 2976, 2976] + - [143, 5751.0] + - - [1024, 256, 1, 2760, 1024, 1024, 2760, 2760] + - [165, 5561.0] + - - [512, 256, 1, 3800, 512, 512, 3800, 3800] + - [119, 5353.0] + - - [256, 1024, 1, 2760, 256, 256, 2760, 2760] + - [141, 6289.0] + - - [1024, 256, 1, 2160, 1024, 1024, 2160, 2160] + - [115, 5780.0] + - - [256, 1024, 1, 2160, 256, 256, 2160, 2160] + - [141, 5848.0] + - - [512, 256, 1, 11616, 512, 512, 11616, 11616] + - [141, 5499.0] + - - [512, 256, 1, 2852, 512, 512, 2852, 2852] + - [119, 5274.0] + - - [256, 1024, 1, 3864, 256, 256, 3864, 3864] + - [141, 5855.0] + - - [512, 256, 1, 2640, 512, 512, 2640, 2640] + - [119, 5291.0] + - - [256, 1024, 1, 4000, 256, 256, 4000, 4000] + - [165, 5891.0] + - - [512, 256, 1, 2904, 512, 512, 2904, 2904] + - [119, 5266.0] + - - [256, 1024, 1, 3900, 256, 256, 3900, 3900] + - [141, 5720.0] + - - [512, 256, 1, 2688, 512, 512, 2688, 2688] + - [141, 5335.0] + - - [256, 1024, 1, 3772, 256, 256, 3772, 3772] + - [128, 5871.0] + - - [512, 256, 1, 3400, 512, 512, 3400, 3400] + - [153, 5280.0] + - - [512, 256, 1, 3456, 512, 512, 3456, 3456] + - [141, 5557.0] + - - [512, 256, 1, 3552, 512, 512, 3552, 3552] + - [115, 5439.0] + - - [128, 64, 25, 6498, 128, 128, 6498, 6498] + - [167, 5877.0] + - - [128, 64, 25, 6859, 128, 128, 6859, 6859] + - [143, 4396.0] + - - [64, 64, 64, 3042, 64, 64, 3042, 3042] + - [141, 5872.0] + - - [64, 64, 64, 3211, 64, 64, 3211, 3211] + - [155, 5427.0] + - - [64, 64, 49, 4050, 64, 64, 4050, 4050] + - [179, 5871.0] + - - [64, 64, 49, 4275, 64, 64, 4275, 4275] + - [143, 5856.0] + - - [64, 64, 36, 6498, 64, 64, 6498, 6498] + - [145, 5736.0] + - - [64, 64, 36, 6859, 64, 64, 6859, 6859] + - [132, 5950.0] + - - [1152, 128, 1, 1444, 1152, 1152, 1444, 1444] + - [169, 5342.0] + - - [512, 256, 1, 361, 512, 512, 361, 361] + - [119, 3834.0] + - - [576, 128, 1, 1444, 576, 576, 1444, 1444] + - [171, 3746.0] + - - [29000, 35, 1, 2560, 29000, 29000, 2560, 2560] + - [132, 3916.0] + - - [29000, 36, 1, 2560, 29000, 29000, 2560, 2560] + - [181, 4020.0] + - - [29000, 39, 1, 2560, 29000, 29000, 2560, 2560] + - [157, 4345.0] + - - [29000, 40, 1, 2560, 29000, 29000, 2560, 2560] + - [132, 4420.0] + - - [29000, 42, 1, 2560, 29000, 29000, 2560, 2560] + - [132, 4570.0] + - - [29000, 43, 1, 2560, 29000, 29000, 2560, 2560] + - [181, 4713.0] + - - [29000, 44, 1, 2560, 29000, 29000, 2560, 2560] + - [181, 4755.0] + - - [29000, 46, 1, 2560, 29000, 29000, 2560, 2560] + - [132, 4911.0] + - - [29000, 48, 1, 2560, 29000, 29000, 2560, 2560] + - [132, 5118.0] + - - [29000, 49, 1, 2560, 29000, 29000, 2560, 2560] + - [132, 5084.0] + - - [29000, 50, 1, 2560, 29000, 29000, 2560, 2560] + - [132, 5176.0] + - - [29000, 51, 1, 2560, 29000, 29000, 2560, 2560] + - [132, 5322.0] + - - [29000, 53, 1, 2560, 29000, 29000, 2560, 2560] + - [132, 5459.0] + - - [29000, 54, 1, 2560, 29000, 29000, 2560, 2560] + - [132, 5568.0] + - - [29000, 55, 1, 2560, 29000, 29000, 2560, 2560] + - [132, 5545.0] + - - [29000, 56, 1, 2560, 29000, 29000, 2560, 2560] + - [132, 5728.0] + - - [29000, 57, 1, 2560, 29000, 29000, 2560, 2560] + - [181, 5686.0] + - - [29000, 58, 1, 2560, 29000, 29000, 2560, 2560] + - [132, 5851.0] + - - [29000, 59, 1, 2560, 29000, 29000, 2560, 2560] + - [157, 5971.0] + - - [29000, 61, 1, 2560, 29000, 29000, 2560, 2560] + - [157, 5990.0] + - - [29000, 63, 1, 2560, 29000, 29000, 2560, 2560] + - [181, 6140.0] + - - [256, 128, 1, 13600, 256, 256, 13600, 13600] + - [191, 5733.0] + - - [256, 128, 1, 12880, 256, 256, 12880, 12880] + - [187, 5761.0] + - - [128, 512, 1, 15200, 128, 128, 15200, 15200] + - [141, 5487.0] + - - [512, 128, 1, 15200, 512, 512, 15200, 15200] + - [196, 5201.0] + - - [128, 512, 1, 11408, 128, 128, 11408, 11408] + - [187, 5256.0] + - - [256, 128, 1, 13824, 256, 256, 13824, 13824] + - [207, 4888.0] + - - [128, 512, 1, 11616, 128, 128, 11616, 11616] + - [192, 5269.0] + - - [256, 128, 1, 14208, 256, 256, 14208, 14208] + - [197, 4983.0] + - - [128, 512, 1, 14208, 128, 128, 14208, 14208] + - [196, 5287.0] + - - [256, 128, 1, 15200, 256, 256, 15200, 15200] + - [205, 5077.0] + - - [512, 128, 1, 11408, 512, 512, 11408, 11408] + - [203, 5205.0] + - - [512, 128, 1, 16800, 512, 512, 16800, 16800] + - [203, 5813.0] + - - [128, 512, 1, 11264, 128, 128, 11264, 11264] + - [205, 5031.0] + - - [512, 128, 1, 11616, 512, 512, 11616, 11616] + - [195, 5234.0] + - - [512, 128, 1, 16128, 512, 512, 16128, 16128] + - [193, 5164.0] + - - [512, 128, 1, 11968, 512, 512, 11968, 11968] + - [128, 5464.0] + - - [128, 512, 1, 11968, 128, 128, 11968, 11968] + - [204, 5277.0] + - - [512, 128, 1, 12288, 512, 512, 12288, 12288] + - [207, 4698.0] + - - [128, 512, 1, 12288, 128, 128, 12288, 12288] + - [201, 4977.0] + - - [128, 512, 1, 12672, 128, 128, 12672, 12672] + - [177, 5478.0] + - - [512, 128, 1, 11776, 512, 512, 11776, 11776] + - [197, 4841.0] + - - [512, 128, 1, 12144, 512, 512, 12144, 12144] + - [203, 5237.0] + - - [512, 128, 1, 11264, 512, 512, 11264, 11264] + - [197, 4663.0] + - - [128, 512, 1, 12144, 128, 128, 12144, 12144] + - [187, 5390.0] + - - [512, 128, 1, 12672, 512, 512, 12672, 12672] + - [141, 5243.0] + - - [128, 512, 1, 12512, 128, 128, 12512, 12512] + - [165, 5601.0] + - - [128, 512, 1, 11776, 128, 128, 11776, 11776] + - [197, 5067.0] + - - [256, 128, 1, 12288, 256, 256, 12288, 12288] + - [201, 4854.0] + - - [40, 40, 1, 1909283, 40, 40, 1909283, 1909283] + - [209, 778.0] + - - [40, 40, 1, 3818566, 40, 40, 3818566, 3818566] + - [202, 751.0] + - - [5888, 1, 1, 3328, 5888, 5888, 3328, 3328] + - [214, 251.0] + - - [5056, 1, 1, 3328, 5056, 5056, 3328, 3328] + - [214, 245.0] + - - [6784, 1, 1, 1280, 6784, 6784, 1280, 1280] + - [221, 247.0] + - - [2944, 1, 1, 3328, 2944, 2944, 3328, 3328] + - [217, 254.0] + - - [3584, 1, 1, 1280, 3584, 3584, 1280, 1280] + - [221, 223.0] + - - [6784, 1, 1, 256, 6784, 6784, 256, 256] + - [219, 198.0] + - - [4288, 1, 1, 1280, 4288, 4288, 1280, 1280] + - [214, 226.0] + - - [5056, 1, 1, 1280, 5056, 5056, 1280, 1280] + - [219, 225.0] + - - [3584, 1, 1, 256, 3584, 3584, 256, 256] + - [219, 177.0] + - - [6784, 1, 1, 3328, 6784, 6784, 3328, 3328] + - [217, 236.0] + - - [1408, 1, 1, 1280, 1408, 1408, 1280, 1280] + - [219, 151.0] + - - [1408, 32, 1, 3328, 1408, 1408, 3328, 3328] + - [178, 3030.0] + - - [4288, 1, 1, 256, 4288, 4288, 256, 256] + - [213, 182.0] + - - [2368, 1, 1, 256, 2368, 2368, 256, 256] + - [214, 142.0] + - - [1856, 32, 1, 32, 1856, 1856, 32, 32] + - [166, 1056.0] + - - [5056, 1, 1, 256, 5056, 5056, 256, 256] + - [221, 198.0] + - - [5056, 1, 1, 1, 5056, 5056, 1, 1] + - [116, 3.0] + - - [1408, 1, 1, 256, 1408, 1408, 256, 256] + - [184, 86.0] + - - [1408, 1, 1, 1, 1408, 1408, 1, 1] + - [116, 1.0] + - - [4288, 1, 1, 3328, 4288, 4288, 3328, 3328] + - [219, 331.0] + - - [2368, 1, 1, 1280, 2368, 2368, 1280, 1280] + - [214, 210.0] + - - [1856, 1, 1, 1, 1856, 1856, 1, 1] + - [212, 1.0] + - - [1856, 32, 1, 256, 1856, 1856, 256, 256] + - [218, 2501.0] + - - [1408, 32, 1, 32, 1408, 1408, 32, 32] + - [184, 775.0] + - - [1856, 32, 1, 1280, 1856, 1856, 1280, 1280] + - [154, 3260.0] + - - [1408, 1, 1, 3328, 1408, 1408, 3328, 3328] + - [211, 167.0] + - - [5888, 1, 1, 256, 5888, 5888, 256, 256] + - [220, 193.0] + - - [5888, 1, 1, 1, 5888, 5888, 1, 1] + - [116, 4.0] + - - [1856, 32, 1, 3328, 1856, 1856, 3328, 3328] + - [142, 3332.0] + - - [2368, 1, 1, 3328, 2368, 2368, 3328, 3328] + - [219, 212.0] + - - [6784, 1, 1, 1, 6784, 6784, 1, 1] + - [210, 2.0] + - - [5888, 1, 1, 1280, 5888, 5888, 1280, 1280] + - [220, 221.0] + - - [2944, 1, 1, 256, 2944, 2944, 256, 256] + - [213, 112.0] + - - [2944, 1, 1, 1, 2944, 2944, 1, 1] + - [210, 1.0] + - - [1408, 32, 1, 1280, 1408, 1408, 1280, 1280] + - [178, 2715.0] + - - [1856, 1, 1, 1280, 1856, 1856, 1280, 1280] + - [214, 155.0] + - - [3584, 1, 1, 1, 3584, 3584, 1, 1] + - [215, 2.0] + - - [2944, 1, 1, 1280, 2944, 2944, 1280, 1280] + - [221, 221.0] + - - [3584, 1, 1, 3328, 3584, 3584, 3328, 3328] + - [219, 230.0] + - - [1856, 1, 1, 3328, 1856, 1856, 3328, 3328] + - [211, 183.0] + - - [4288, 1, 1, 1, 4288, 4288, 1, 1] + - [210, 1.0] + - - [1856, 1, 1, 256, 1856, 1856, 256, 256] + - [221, 74.0] + - - [1408, 32, 1, 256, 1408, 1408, 256, 256] + - [178, 1611.0] + - - [2368, 1, 1, 1, 2368, 2368, 1, 1] + - [210, 1.0] + - - [1760, 32, 1, 1760, 1760, 1760, 1760, 1760] + - [116, 3056.0] + - - [3072, 16, 1, 1024, 3072, 3072, 1024, 1024] + - [160, 2497.0] + - - [2560, 16, 1, 2560, 2560, 2560, 2560, 2560] + - [184, 2892.0] + - - [2048, 32, 1, 2048, 2048, 2048, 2048, 2048] + - [154, 3214.0] + - - [1760, 16, 1, 1760, 1760, 1760, 1760, 1760] + - [148, 2067.0] + - - [7680, 16, 1, 2560, 7680, 7680, 2560, 2560] + - [184, 3143.0] + - - [8448, 16, 1, 2816, 8448, 8448, 2816, 2816] + - [184, 3012.0] + - - [4608, 16, 1, 1536, 4608, 4608, 1536, 1536] + - [160, 2757.0] + - - [6144, 16, 1, 2560, 6144, 6144, 2560, 2560] + - [135, 2967.0] + - - [4096, 16, 1, 4096, 4096, 4096, 4096, 4096] + - [135, 2825.0] + - - [2048, 16, 1, 2048, 2048, 2048, 2048, 2048] + - [135, 2240.0] + - - [2048, 2, 1, 2048, 2048, 2048, 2048, 2048] + - [214, 343.0] + - - [2560, 4, 1, 2560, 2560, 2560, 2560, 2560] + - [219, 884.0] + - - [32768, 1, 1, 256, 32768, 32768, 256, 256] + - [214, 248.0] + - - [1600, 1, 1, 1024, 1600, 1600, 1024, 1024] + - [219, 124.0] + - - [3456, 1, 1, 256, 3456, 3456, 256, 256] + - [216, 130.0] + - - [4096, 1, 1, 256, 4096, 4096, 256, 256] + - [219, 135.0] + - - [6912, 1, 1, 256, 6912, 6912, 256, 256] + - [219, 159.0] + - - [2048, 8, 1, 2048, 2048, 2048, 2048, 2048] + - [214, 1627.0] + - - [2560, 2, 1, 2560, 2560, 2560, 2560, 2560] + - [219, 479.0] + - - [29000, 27, 1, 2560, 29000, 29000, 2560, 2560] + - [184, 2845.0] + - - [1, 4288, 1, 1280, 1, 1, 1280, 1280] + - [231, 199.0] + - - [32, 1408, 1, 32, 32, 32, 32, 32] + - [114, 890.0] + - - [1, 1408, 1, 3328, 1, 1, 3328, 3328] + - [158, 159.0] + - - [1, 2368, 1, 1280, 1, 1, 1280, 1280] + - [146, 172.0] + - - [1, 5888, 1, 3328, 1, 1, 3328, 3328] + - [225, 200.0] + - - [1, 1856, 1, 256, 1, 1, 256, 256] + - [228, 86.0] + - - [1, 3584, 1, 3328, 1, 1, 3328, 3328] + - [231, 212.0] + - - [1, 6784, 1, 3328, 1, 1, 3328, 3328] + - [133, 194.0] + - - [1, 2368, 1, 256, 1, 1, 256, 256] + - [127, 89.0] + - - [32, 1856, 1, 3328, 32, 32, 3328, 3328] + - [227, 4079.0] + - - [1, 2944, 1, 1280, 1, 1, 1280, 1280] + - [127, 170.0] + - - [1, 1856, 1, 3328, 1, 1, 3328, 3328] + - [170, 185.0] + - - [1, 1408, 1, 1, 1, 1, 1, 1] + - [116, 0.36] + - - [1, 6784, 1, 256, 1, 1, 256, 256] + - [176, 160.0] + - - [1, 6784, 1, 1, 1, 1, 1, 1] + - [114, 2.0] + - - [1, 4288, 1, 3328, 1, 1, 3328, 3328] + - [127, 196.0] + - - [1, 2368, 1, 3328, 1, 1, 3328, 3328] + - [176, 187.0] + - - [1, 5888, 1, 1280, 1, 1, 1280, 1280] + - [224, 189.0] + - - [1, 2944, 1, 256, 1, 1, 256, 256] + - [231, 101.0] + - - [1, 6784, 1, 1280, 1, 1, 1280, 1280] + - [231, 185.0] + - - [1, 5056, 1, 1, 1, 1, 1, 1] + - [114, 3.0] + - - [32, 1856, 1, 32, 32, 32, 32, 32] + - [127, 1068.0] + - - [32, 1408, 1, 256, 32, 32, 256, 256] + - [223, 2383.0] + - - [1, 5888, 1, 1, 1, 1, 1, 1] + - [116, 3.0] + - - [1, 2944, 1, 3328, 1, 1, 3328, 3328] + - [176, 199.0] + - - [1, 3584, 1, 1, 1, 1, 1, 1] + - [222, 2.0] + - - [1, 1408, 1, 256, 1, 1, 256, 256] + - [170, 93.0] + - - [1, 1856, 1, 1, 1, 1, 1, 1] + - [161, 1.0] + - - [1, 5056, 1, 1280, 1, 1, 1280, 1280] + - [127, 197.0] + - - [1, 5888, 1, 256, 1, 1, 256, 256] + - [225, 174.0] + - - [32, 1856, 1, 1280, 32, 32, 1280, 1280] + - [227, 3756.0] + - - [1, 2368, 1, 1, 1, 1, 1, 1] + - [114, 1.0] + - - [1, 1408, 1, 1280, 1, 1, 1280, 1280] + - [146, 139.0] + - - [1, 5056, 1, 256, 1, 1, 256, 256] + - [231, 177.0] + - - [1, 3584, 1, 1280, 1, 1, 1280, 1280] + - [229, 205.0] + - - [1, 4288, 1, 256, 1, 1, 256, 256] + - [127, 160.0] + - - [1, 4288, 1, 1, 1, 1, 1, 1] + - [116, 2.0] + - - [1, 2944, 1, 1, 1, 1, 1, 1] + - [222, 2.0] + - - [32, 1408, 1, 3328, 32, 32, 3328, 3328] + - [223, 3400.0] + - - [1, 5056, 1, 3328, 1, 1, 3328, 3328] + - [231, 209.0] + - - [32, 1856, 1, 256, 32, 32, 256, 256] + - [230, 3154.0] + - - [1, 1856, 1, 1280, 1, 1, 1280, 1280] + - [133, 172.0] + - - [1, 3584, 1, 256, 1, 1, 256, 256] + - [225, 165.0] + - - [32, 1408, 1, 1280, 32, 32, 1280, 1280] + - [224, 3258.0] + - - [2, 2048, 1, 1024, 2, 2, 1024, 1024] + - [133, 371.0] + - - [32, 1600, 1, 512, 32, 32, 512, 512] + - [226, 2401.0] + - - [1, 4096, 1, 256, 1, 1, 256, 256] + - [225, 118.0] + - - [1, 6912, 1, 256, 1, 1, 256, 256] + - [229, 177.0] + - - [2, 2048, 1, 768, 2, 2, 768, 768] + - [225, 291.0] + - - [2, 4608, 1, 768, 2, 2, 768, 768] + - [225, 346.0] + - - [2, 4608, 1, 1024, 2, 2, 1024, 1024] + - [133, 325.0] + - - [512, 16, 1, 500000, 512, 512, 500000, 500000] + - [206, 2469.0] + - - [1024, 8, 1, 500000, 1024, 1024, 500000, 500000] + - [198, 1435.0] + - - [1024, 16, 1, 500000, 1024, 1024, 500000, 500000] + - [208, 2679.0] + - - [512, 8, 1, 500000, 512, 512, 500000, 500000] + - [198, 1233.0] + - - [147, 64, 1, 12544, 147, 147, 12544, 12544] + - [188, 2552.0] + - - [256, 128, 1, 10752, 256, 256, 10752, 10752] + - [189, 4125.0] + - - [256, 128, 1, 10560, 256, 256, 10560, 10560] + - [200, 5248.0] + - - [256, 128, 1, 11408, 256, 256, 11408, 11408] + - [195, 5716.0] + - - [256, 12, 1, 11408, 256, 256, 11408, 11408] + - [190, 1110.0] + - - [256, 128, 1, 11616, 256, 256, 11616, 11616] + - [195, 5603.0] + - - [256, 12, 1, 11616, 256, 256, 11616, 11616] + - [190, 1117.0] + - - [256, 12, 1, 12288, 256, 256, 12288, 12288] + - [209, 1114.0] + - - [576, 64, 1, 5625, 576, 576, 5625, 5625] + - [195, 4488.0] + - - [147, 64, 1, 22500, 147, 147, 22500, 22500] + - [199, 2856.0] + - - [11, 11, 1, 1909283, 11, 11, 1909283, 1909283] + - [194, 41.0] + - - [11, 11, 1, 3818566, 11, 11, 3818566, 3818566] + - [190, 33.0] + - - [448, 1, 1, 256, 448, 448, 256, 256] + - [164, 27.0] + - - [704, 64, 1, 3328, 704, 704, 3328, 3328] + - [147, 3584.0] + - - [256, 128, 1, 256, 256, 256, 256, 256] + - [131, 2255.0] + - - [448, 64, 1, 1, 448, 448, 1, 1] + - [116, 8.0] + - - [64, 1024, 1, 1280, 64, 64, 1280, 1280] + - [143, 3570.0] + - - [1024, 1, 1, 3328, 1024, 1024, 3328, 3328] + - [160, 114.0] + - - [1024, 64, 1, 1280, 1024, 1024, 1280, 1280] + - [117, 3305.0] + - - [448, 128, 1, 256, 448, 448, 256, 256] + - [156, 2079.0] + - - [1, 1024, 1, 3328, 1, 1, 3328, 3328] + - [133, 122.0] + - - [704, 64, 1, 32, 704, 704, 32, 32] + - [122, 364.0] + - - [32, 448, 1, 3328, 32, 32, 3328, 3328] + - [182, 1904.0] + - - [448, 1, 1, 1, 448, 448, 1, 1] + - [122, 0.12] + - - [64, 128, 1, 3328, 64, 64, 3328, 3328] + - [120, 1211.0] + - - [64, 128, 1, 1, 64, 64, 1, 1] + - [114, 2.0] + - - [256, 128, 1, 1, 256, 256, 1, 1] + - [114, 8.0] + - - [256, 32, 1, 3328, 256, 256, 3328, 3328] + - [120, 1193.0] + - - [1, 1, 1, 3328, 1, 1, 3328, 3328] + - [163, 0.16] + - - [32, 448, 1, 1280, 32, 32, 1280, 1280] + - [170, 1475.0] + - - [32, 448, 1, 32, 32, 32, 32, 32] + - [122, 119.0] + - - [64, 1024, 1, 32, 64, 64, 32, 32] + - [156, 530.0] + - - [128, 1, 1, 1, 128, 128, 1, 1] + - [114, 0.03] + - - [1024, 32, 1, 3328, 1024, 1024, 3328, 3328] + - [180, 2790.0] + - - [448, 1, 1, 1280, 448, 448, 1280, 1280] + - [133, 49.0] + - - [64, 64, 1, 1280, 64, 64, 1280, 1280] + - [182, 478.0] + - - [448, 128, 1, 3328, 448, 448, 3328, 3328] + - [118, 3877.0] + - - [128, 256, 1, 1280, 128, 128, 1280, 1280] + - [176, 2576.0] + - - [256, 256, 1, 32, 256, 256, 32, 32] + - [168, 502.0] + - - [1024, 1, 1, 256, 1024, 1024, 256, 256] + - [127, 44.0] + - - [128, 32, 1, 32, 128, 128, 32, 32] + - [122, 33.0] + - - [448, 64, 1, 256, 448, 448, 256, 256] + - [178, 1211.0] + - - [128, 256, 1, 3328, 128, 128, 3328, 3328] + - [176, 3055.0] + - - [1, 64, 1, 3328, 1, 1, 3328, 3328] + - [120, 9.0] + - - [64, 1024, 1, 1, 64, 64, 1, 1] + - [114, 16.0] + - - [64, 1024, 1, 3328, 64, 64, 3328, 3328] + - [117, 4023.0] + - - [32, 704, 1, 3328, 32, 32, 3328, 3328] + - [170, 2738.0] + - - [32, 1024, 1, 3328, 32, 32, 3328, 3328] + - [118, 2862.0] + - - [64, 1, 1, 256, 64, 64, 256, 256] + - [185, 4.0] + - - [1024, 64, 1, 32, 1024, 1024, 32, 32] + - [123, 1070.0] + - - [1024, 64, 1, 3328, 1024, 1024, 3328, 3328] + - [143, 3755.0] + - - [32, 1024, 1, 256, 32, 32, 256, 256] + - [168, 1307.0] + - - [64, 1, 1, 1, 64, 64, 1, 1] + - [116, 0.02] + - - [256, 1, 1, 256, 256, 256, 256, 256] + - [114, 12.0] + - - [256, 128, 1, 3328, 256, 256, 3328, 3328] + - [176, 3004.0] + - - [64, 64, 1, 1, 64, 64, 1, 1] + - [114, 1.0] + - - [32, 704, 1, 1280, 32, 32, 1280, 1280] + - [182, 2158.0] + - - [256, 1, 1, 1280, 256, 256, 1280, 1280] + - [182, 29.0] + - - [128, 32, 1, 1280, 128, 128, 1280, 1280] + - [133, 488.0] + - - [128, 256, 1, 1, 128, 128, 1, 1] + - [147, 17.0] + - - [1, 256, 1, 256, 1, 1, 256, 256] + - [114, 11.0] + - - [1, 256, 1, 1, 1, 1, 1, 1] + - [114, 0.06] + - - [1024, 1, 1, 1280, 1024, 1024, 1280, 1280] + - [182, 116.0] + - - [64, 448, 1, 256, 64, 64, 256, 256] + - [127, 2005.0] + - - [1024, 32, 1, 1280, 1024, 1024, 1280, 1280] + - [120, 3048.0] + - - [256, 256, 1, 3328, 256, 256, 3328, 3328] + - [165, 4309.0] + - - [704, 32, 1, 1280, 704, 704, 1280, 1280] + - [158, 2072.0] + - - [64, 64, 1, 3328, 64, 64, 3328, 3328] + - [182, 609.0] + - - [32, 32, 1, 32, 32, 32, 32, 32] + - [114, 8.0] + - - [1024, 32, 1, 32, 1024, 1024, 32, 32] + - [114, 672.0] + - - [128, 64, 1, 32, 128, 128, 32, 32] + - [114, 158.0] + - - [64, 1, 1, 1280, 64, 64, 1280, 1280] + - [120, 7.0] + - - [448, 32, 1, 1280, 448, 448, 1280, 1280] + - [120, 1477.0] + - - [704, 32, 1, 3328, 704, 704, 3328, 3328] + - [170, 2844.0] + - - [128, 128, 1, 256, 128, 128, 256, 256] + - [146, 1263.0] + - - [64, 448, 1, 1280, 64, 64, 1280, 1280] + - [120, 3053.0] + - - [64, 256, 1, 1, 64, 64, 1, 1] + - [118, 10.0] + - - [256, 256, 1, 256, 256, 256, 256, 256] + - [128, 3251.0] + - - [448, 1, 1, 3328, 448, 448, 3328, 3328] + - [158, 63.0] + - - [256, 1, 1, 1, 256, 256, 1, 1] + - [129, 0.07] + - - [32, 1024, 1, 1280, 32, 32, 1280, 1280] + - [133, 2545.0] + - - [1, 256, 1, 3328, 1, 1, 3328, 3328] + - [120, 38.0] + - - [256, 32, 1, 256, 256, 256, 256, 256] + - [182, 380.0] + - - [256, 128, 1, 1280, 256, 256, 1280, 1280] + - [159, 2605.0] + - - [256, 64, 1, 256, 256, 256, 256, 256] + - [170, 1128.0] + - - [1, 1, 1, 1, 1, 1, 1, 1] + - [164, 0.0005494505479547652] + - - [32, 1024, 1, 32, 32, 32, 32, 32] + - [164, 632.0] + - - [128, 256, 1, 256, 128, 128, 256, 256] + - [131, 2231.0] + - - [704, 64, 1, 256, 704, 704, 256, 256] + - [154, 2541.0] + - - [704, 1, 1, 1, 704, 704, 1, 1] + - [114, 0.41] + - - [128, 448, 1, 1280, 128, 128, 1280, 1280] + - [144, 3466.0] + - - [448, 32, 1, 32, 448, 448, 32, 32] + - [118, 115.0] + - - [704, 64, 1, 1, 704, 704, 1, 1] + - [168, 12.0] + - - [704, 32, 1, 256, 704, 704, 256, 256] + - [129, 949.0] + - - [32, 704, 1, 32, 32, 32, 32, 32] + - [127, 180.0] + - - [128, 64, 1, 256, 128, 128, 256, 256] + - [120, 390.0] + - - [448, 32, 1, 3328, 448, 448, 3328, 3328] + - [120, 1947.0] + - - [64, 704, 1, 32, 64, 64, 32, 32] + - [120, 366.0] + - - [64, 704, 1, 3328, 64, 64, 3328, 3328] + - [171, 3567.0] + - - [448, 64, 1, 1280, 448, 448, 1280, 1280] + - [158, 2483.0] + - - [128, 448, 1, 32, 128, 128, 32, 32] + - [114, 1031.0] + - - [64, 256, 1, 256, 64, 64, 256, 256] + - [180, 763.0] + - - [64, 704, 1, 1, 64, 64, 1, 1] + - [114, 11.0] + - - [1, 1024, 1, 1, 1, 1, 1, 1] + - [114, 0.26] + - - [256, 1, 1, 3328, 256, 256, 3328, 3328] + - [120, 37.0] + - - [32, 64, 1, 32, 32, 32, 32, 32] + - [116, 17.0] + - - [256, 256, 1, 1, 256, 256, 1, 1] + - [131, 17.0] + - - [32, 256, 1, 32, 32, 32, 32, 32] + - [144, 66.0] + - - [128, 1, 1, 256, 128, 128, 256, 256] + - [120, 6.0] + - - [32, 64, 1, 3328, 32, 32, 3328, 3328] + - [158, 305.0] + - - [1, 128, 1, 3328, 1, 1, 3328, 3328] + - [120, 19.0] + - - [32, 256, 1, 256, 32, 32, 256, 256] + - [133, 413.0] + - - [1, 448, 1, 1, 1, 1, 1, 1] + - [122, 0.12] + - - [1, 704, 1, 3328, 1, 1, 3328, 3328] + - [170, 91.0] + - - [64, 1, 1, 3328, 64, 64, 3328, 3328] + - [124, 10.0] + - - [448, 64, 1, 3328, 448, 448, 3328, 3328] + - [120, 2767.0] + - - [256, 32, 1, 1280, 256, 256, 1280, 1280] + - [120, 935.0] + - - [128, 448, 1, 3328, 128, 128, 3328, 3328] + - [144, 3901.0] + - - [64, 1024, 1, 256, 64, 64, 256, 256] + - [177, 2135.0] + - - [64, 32, 1, 32, 64, 64, 32, 32] + - [114, 17.0] + - - [1, 448, 1, 3328, 1, 1, 3328, 3328] + - [174, 65.0] + - - [1024, 64, 1, 256, 1024, 1024, 256, 256] + - [128, 2162.0] + - - [64, 704, 1, 1280, 64, 64, 1280, 1280] + - [176, 3074.0] + - - [64, 32, 1, 3328, 64, 64, 3328, 3328] + - [120, 305.0] + - - [64, 448, 1, 1, 64, 64, 1, 1] + - [182, 18.0] + - - [128, 128, 1, 1280, 128, 128, 1280, 1280] + - [122, 1956.0] + - - [64, 128, 1, 256, 64, 64, 256, 256] + - [182, 397.0] + - - [64, 448, 1, 32, 64, 64, 32, 32] + - [156, 243.0] + - - [128, 64, 1, 3328, 128, 128, 3328, 3328] + - [133, 1226.0] + - - [32, 64, 1, 1280, 32, 32, 1280, 1280] + - [120, 237.0] + - - [448, 32, 1, 256, 448, 448, 256, 256] + - [135, 651.0] + - - [1024, 32, 1, 256, 1024, 1024, 256, 256] + - [180, 1323.0] + - - [1, 128, 1, 256, 1, 1, 256, 256] + - [120, 6.0] + - - [32, 256, 1, 1280, 32, 32, 1280, 1280] + - [146, 923.0] + - - [32, 128, 1, 3328, 32, 32, 3328, 3328] + - [133, 604.0] + - - [32, 128, 1, 32, 32, 32, 32, 32] + - [175, 34.0] + - - [1, 128, 1, 1, 1, 1, 1, 1] + - [114, 0.07] + - - [128, 64, 1, 1, 128, 128, 1, 1] + - [114, 2.0] + - - [32, 448, 1, 256, 32, 32, 256, 256] + - [144, 646.0] + - - [1, 704, 1, 256, 1, 1, 256, 256] + - [150, 47.0] + - - [32, 256, 1, 3328, 32, 32, 3328, 3328] + - [170, 1199.0] + - - [256, 32, 1, 32, 256, 256, 32, 32] + - [123, 68.0] + - - [64, 256, 1, 3328, 64, 64, 3328, 3328] + - [170, 2195.0] + - - [1, 704, 1, 1, 1, 1, 1, 1] + - [114, 0.17] + - - [128, 448, 1, 1, 128, 128, 1, 1] + - [114, 14.0] + - - [64, 128, 1, 32, 64, 64, 32, 32] + - [116, 64.0] + - - [704, 1, 1, 1280, 704, 704, 1280, 1280] + - [124, 70.0] + - - [1024, 1, 1, 1, 1024, 1024, 1, 1] + - [114, 1.0] + - - [256, 128, 1, 32, 256, 256, 32, 32] + - [142, 256.0] + - - [448, 128, 1, 1, 448, 448, 1, 1] + - [114, 14.0] + - - [704, 32, 1, 32, 704, 704, 32, 32] + - [126, 183.0] + - - [128, 32, 1, 256, 128, 128, 256, 256] + - [182, 190.0] + - - [64, 32, 1, 1280, 64, 64, 1280, 1280] + - [133, 241.0] + - - [448, 128, 1, 32, 448, 448, 32, 32] + - [114, 1055.0] + - - [128, 448, 1, 256, 128, 128, 256, 256] + - [131, 2022.0] + - - [32, 32, 1, 256, 32, 32, 256, 256] + - [135, 49.0] + - - [256, 64, 1, 32, 256, 256, 32, 32] + - [114, 308.0] + - - [1, 1024, 1, 1280, 1, 1, 1280, 1280] + - [146, 101.0] + - - [32, 32, 1, 3328, 32, 32, 3328, 3328] + - [137, 152.0] + - - [1, 256, 1, 1280, 1, 1, 1280, 1280] + - [114, 31.0] + - - [1, 128, 1, 1280, 1, 1, 1280, 1280] + - [120, 14.0] + - - [1, 64, 1, 256, 1, 1, 256, 256] + - [114, 3.0] + - - [256, 64, 1, 1280, 256, 256, 1280, 1280] + - [133, 1765.0] + - - [32, 704, 1, 256, 32, 32, 256, 256] + - [170, 952.0] + - - [1, 64, 1, 1, 1, 1, 1, 1] + - [116, 0.02] + - - [704, 64, 1, 1280, 704, 704, 1280, 1280] + - [183, 2875.0] + - - [1, 704, 1, 1280, 1, 1, 1280, 1280] + - [114, 72.0] + - - [128, 128, 1, 32, 128, 128, 32, 32] + - [114, 305.0] + - - [1024, 64, 1, 1, 1024, 1024, 1, 1] + - [114, 40.0] + - - [704, 1, 1, 256, 704, 704, 256, 256] + - [114, 49.0] + - - [128, 64, 1, 1280, 128, 128, 1280, 1280] + - [148, 1040.0] + - - [64, 64, 1, 32, 64, 64, 32, 32] + - [123, 34.0] + - - [1, 1, 1, 1280, 1, 1, 1280, 1280] + - [120, 0.11] + - - [64, 704, 1, 256, 64, 64, 256, 256] + - [166, 1652.0] + - - [1, 448, 1, 1280, 1, 1, 1280, 1280] + - [133, 49.0] + - - [64, 256, 1, 32, 64, 64, 32, 32] + - [125, 200.0] + - - [32, 128, 1, 1280, 32, 32, 1280, 1280] + - [133, 475.0] + - - [128, 128, 1, 3328, 128, 128, 3328, 3328] + - [120, 2174.0] + - - [64, 448, 1, 3328, 64, 64, 3328, 3328] + - [176, 3001.0] + - - [32, 64, 1, 256, 32, 32, 256, 256] + - [133, 96.0] + - - [128, 256, 1, 32, 128, 128, 32, 32] + - [122, 256.0] + - - [64, 256, 1, 1280, 64, 64, 1280, 1280] + - [170, 1722.0] + - - [64, 64, 1, 256, 64, 64, 256, 256] + - [129, 190.0] + - - [448, 64, 1, 32, 448, 448, 32, 32] + - [139, 229.0] + - - [64, 128, 1, 1280, 64, 64, 1280, 1280] + - [158, 945.0] + - - [1, 1024, 1, 256, 1, 1, 256, 256] + - [114, 68.0] + - - [128, 1, 1, 3328, 128, 128, 3328, 3328] + - [120, 19.0] + - - [128, 128, 1, 1, 128, 128, 1, 1] + - [114, 10.0] + - - [32, 128, 1, 256, 32, 32, 256, 256] + - [114, 312.0] + - - [1, 64, 1, 1280, 1, 1, 1280, 1280] + - [114, 8.0] + - - [448, 128, 1, 1280, 448, 448, 1280, 1280] + - [131, 3446.0] + - - [256, 64, 1, 1, 256, 256, 1, 1] + - [114, 10.0] + - - [256, 256, 1, 1280, 256, 256, 1280, 1280] + - [147, 4029.0] + - - [704, 1, 1, 3328, 704, 704, 3328, 3328] + - [137, 86.0] + - - [128, 32, 1, 3328, 128, 128, 3328, 3328] + - [133, 610.0] + - - [32, 32, 1, 1280, 32, 32, 1280, 1280] + - [158, 119.0] + - - [1, 1, 1, 256, 1, 1, 256, 256] + - [129, 0.05] + - - [1, 448, 1, 256, 1, 1, 256, 256] + - [120, 20.0] + - - [256, 64, 1, 3328, 256, 256, 3328, 3328] + - [120, 2152.0] + - - [64, 32, 1, 256, 64, 64, 256, 256] + - [133, 95.0] + - - [128, 1, 1, 1280, 128, 128, 1280, 1280] + - [133, 15.0] + - - [512, 128, 1, 784, 512, 512, 784, 784] + - [115, 3416.0] + - - [256, 64, 1, 3136, 256, 256, 3136, 3136] + - [120, 2166.0] + - - [64, 256, 1, 3136, 64, 64, 3136, 3136] + - [170, 2198.0] + - - [128, 512, 1, 784, 128, 128, 784, 784] + - [141, 3448.0] + - - [64, 64, 1, 3136, 64, 64, 3136, 3136] + - [170, 599.0] + - - [14, 14, 1, 64, 14, 14, 64, 64] + - [114, 5.0] + - - [15, 14, 1, 64, 15, 15, 64, 64] + - [114, 6.0] + - - [15, 15, 1, 64, 15, 15, 64, 64] + - [114, 3.0] + - - [17, 15, 1, 64, 17, 17, 64, 64] + - [116, 4.0] + - - [17, 17, 1, 64, 17, 17, 64, 64] + - [114, 4.0] + - - [21, 17, 1, 64, 21, 21, 64, 64] + - [114, 5.0] + - - [21, 21, 1, 64, 21, 21, 64, 64] + - [114, 6.0] + - - [24, 24, 1, 64, 24, 24, 64, 64] + - [114, 8.0] + - - [30, 30, 1, 64, 30, 30, 64, 64] + - [114, 25.0] + - - [30, 31, 1, 64, 30, 30, 64, 64] + - [166, 14.0] + - - [31, 31, 1, 64, 31, 31, 64, 64] + - [118, 14.0] + - - [32, 32, 1, 64, 32, 32, 64, 64] + - [114, 35.0] + - - [32, 35, 1, 64, 32, 32, 64, 64] + - [116, 16.0] + - - [34, 24, 1, 64, 34, 34, 64, 64] + - [142, 12.0] + - - [34, 34, 1, 64, 34, 34, 64, 64] + - [162, 18.0] + - - [35, 35, 1, 64, 35, 35, 64, 64] + - [114, 18.0] + - - [27, 27, 1, 64, 27, 27, 64, 64] + - [114, 10.0] + - - [27, 33, 1, 64, 27, 27, 64, 64] + - [166, 14.0] + - - [33, 33, 1, 64, 33, 33, 64, 64] + - [118, 16.0] + - - [2, 4, 1, 1024, 2, 2, 1024, 1024] + - [114, 1.0] + - - [2, 32, 1, 1024, 2, 2, 1024, 1024] + - [114, 7.0] + - - [64, 512, 1, 512, 64, 64, 512, 512] + - [176, 1868.0] + - - [1024, 4, 1, 1024, 1024, 1024, 1024, 1024] + - [133, 364.0] + - - [1024, 32, 1, 1024, 1024, 1024, 1024, 1024] + - [133, 2467.0] + - - [3, 3, 512, 64, 3, 3, 64, 64] + - [114, 53.0] + - - [5, 5, 512, 64, 5, 5, 64, 64] + - [127, 139.0] + - - [5, 5, 960, 64, 5, 5, 64, 64] + - [127, 213.0] + - - [9, 9, 512, 64, 9, 9, 64, 64] + - [176, 434.0] + - - [27, 27, 32768, 128, 27, 27, 128, 128] + - [121, 3635.0] + - - [64, 512, 1, 1024, 64, 64, 1024, 1024] + - [152, 2918.0] + - - [64, 960, 1, 1024, 64, 64, 1024, 1024] + - [131, 3865.0] + - - [14, 14, 10880, 64, 14, 14, 64, 64] + - [120, 2591.0] + - - [15, 14, 10880, 64, 15, 15, 64, 64] + - [158, 2758.0] + - - [15, 15, 7680, 64, 15, 15, 64, 64] + - [158, 2894.0] + - - [15, 15, 10880, 64, 15, 15, 64, 64] + - [158, 2832.0] + - - [17, 15, 7680, 64, 17, 17, 64, 64] + - [178, 2138.0] + - - [17, 17, 7680, 64, 17, 17, 64, 64] + - [153, 1882.0] + - - [21, 17, 6144, 64, 21, 21, 64, 64] + - [153, 2108.0] + - - [21, 21, 6144, 64, 21, 21, 64, 64] + - [147, 2254.0] + - - [24, 24, 4736, 64, 24, 24, 64, 64] + - [159, 2834.0] + - - [30, 30, 2048, 64, 30, 30, 64, 64] + - [158, 3178.0] + - - [30, 31, 2048, 64, 30, 30, 64, 64] + - [168, 3397.0] + - - [31, 31, 2048, 64, 31, 31, 64, 64] + - [144, 3401.0] + - - [34, 24, 4736, 64, 34, 34, 64, 64] + - [179, 2607.0] + - - [27, 27, 1920, 64, 27, 27, 64, 64] + - [159, 3246.0] + - - [27, 33, 1920, 64, 27, 27, 64, 64] + - [115, 2496.0] + - - [2, 8, 1, 1024, 2, 2, 1024, 1024] + - [120, 2.0] + - - [1024, 77, 1, 1024, 1024, 1024, 1024, 1024] + - [177, 3413.0] + - - [2, 10, 1, 1024, 2, 2, 1024, 1024] + - [114, 2.0] + - - [1024, 10, 1, 1024, 1024, 1024, 1024, 1024] + - [133, 887.0] + - - [2, 39, 1, 1024, 2, 2, 1024, 1024] + - [120, 8.0] + - - [1024, 39, 1, 1024, 1024, 1024, 1024, 1024] + - [178, 2361.0] + - - [2, 40, 1, 1024, 2, 2, 1024, 1024] + - [120, 8.0] + - - [1024, 40, 1, 1024, 1024, 1024, 1024, 1024] + - [178, 2422.0] + - - [2, 41, 1, 1024, 2, 2, 1024, 1024] + - [114, 8.0] + - - [1024, 41, 1, 1024, 1024, 1024, 1024, 1024] + - [176, 2491.0] + - - [2, 5, 1, 1024, 2, 2, 1024, 1024] + - [114, 1.0] + - - [1024, 5, 1, 1024, 1024, 1024, 1024, 1024] + - [133, 430.0] + - - [2, 6, 1, 1024, 2, 2, 1024, 1024] + - [114, 1.0] + - - [1024, 6, 1, 1024, 1024, 1024, 1024, 1024] + - [182, 527.0] + - - [1024, 8, 1, 1024, 1024, 1024, 1024, 1024] + - [133, 694.0] + - - [2, 9, 1, 1024, 2, 2, 1024, 1024] + - [114, 2.0] + - - [1024, 9, 1, 1024, 1024, 1024, 1024, 1024] + - [133, 786.0] + - - [4, 4, 32768, 64, 4, 4, 64, 64] + - [176, 308.0] + - - [4, 4, 38400, 64, 4, 4, 64, 64] + - [176, 329.0] + - - [17, 17, 6144, 64, 17, 17, 64, 64] + - [128, 1811.0] + - - [128, 128, 1, 64, 128, 128, 64, 64] + - [140, 253.0] + - - [64, 128, 1, 128, 64, 64, 128, 128] + - [180, 229.0] + - - [2, 1024, 1, 1024, 2, 2, 1024, 1024] + - [182, 186.0] + - - [5, 5, 1, 64, 5, 5, 64, 64] + - [129, 0.35] + - - [33, 33, 1, 32, 33, 33, 32, 32] + - [114, 8.0] + - - [1024, 16, 1, 1024, 1024, 1024, 1024, 1024] + - [182, 1456.0] + - - [2, 4, 1, 2560, 2, 2, 2560, 2560] + - [114, 1.0] + - - [2, 16, 1, 1024, 2, 2, 1024, 1024] + - [114, 3.0] + - - [2, 2, 1, 2048, 2, 2, 2048, 2048] + - [120, 1.0] + - - [1024, 1, 1, 1024, 1024, 1024, 1024, 1024] + - [133, 90.0] + - - [512, 1, 1, 2048, 512, 512, 2048, 2048] + - [158, 66.0] + - - [200, 1, 1, 1024, 200, 200, 1024, 1024] + - [133, 21.0] + - - [960, 1, 1, 2048, 960, 960, 2048, 2048] + - [133, 106.0] + - - [1024, 64, 1, 1024, 1024, 1024, 1024, 1024] + - [181, 3280.0] + - - [864, 1, 1, 256, 864, 864, 256, 256] + - [176, 58.0] + - - [1024, 80, 1, 1024, 1024, 1024, 1024, 1024] + - [153, 3481.0] + - - [1024, 82, 1, 1024, 1024, 1024, 1024, 1024] + - [177, 3533.0] + - - [1024, 12, 1, 1024, 1024, 1024, 1024, 1024] + - [186, 1108.0] + - - [2, 64, 1, 1024, 2, 2, 1024, 1024] + - [127, 15.0] + - - [2, 80, 1, 1024, 2, 2, 1024, 1024] + - [127, 19.0] + - - [2, 82, 1, 1024, 2, 2, 1024, 1024] + - [137, 21.0] + - - [2, 12, 1, 1024, 2, 2, 1024, 1024] + - [186, 3.0] + - - [2, 1, 1, 1024, 2, 2, 1024, 1024] + - [120, 0.26] + - - [24, 24, 6816, 64, 24, 24, 64, 64] + - [171, 2926.0] + - - [256, 128, 1, 3136, 256, 256, 3136, 3136] + - [140, 3536.0] + - - [576, 64, 1, 3136, 576, 576, 3136, 3136] + - [114, 3380.0] + - - [768, 16, 1, 768, 768, 768, 768, 768] + - [137, 1004.0] + - - [768, 12, 1, 768, 768, 768, 768, 768] + - [146, 736.0] + - - [768, 4, 1, 768, 768, 768, 768, 768] + - [133, 246.0] + - - [64, 1024, 1, 1024, 64, 64, 1024, 1024] + - [177, 3551.0] + - - [26, 26, 6272, 64, 26, 26, 64, 64] + - [159, 3276.0] + - - [2, 128, 1, 1024, 2, 2, 1024, 1024] + - [120, 35.0] + - - [2, 96, 1, 1024, 2, 2, 1024, 1024] + - [120, 26.0] + - - [256, 80, 1, 784, 256, 256, 784, 784] + - [170, 1512.0] + - - [256, 12, 1, 3800, 256, 256, 3800, 3800] + - [163, 455.0] + - - [256, 3, 1, 3800, 256, 256, 3800, 3800] + - [124, 115.0] + - - [256, 12, 1, 950, 256, 256, 950, 950] + - [137, 283.0] + - - [256, 3, 1, 950, 256, 256, 950, 950] + - [150, 73.0] + - - [256, 12, 1, 3220, 256, 256, 3220, 3220] + - [137, 442.0] + - - [256, 3, 1, 3220, 256, 256, 3220, 3220] + - [137, 111.0] + - - [256, 12, 1, 3072, 256, 256, 3072, 3072] + - [163, 450.0] + - - [256, 3, 1, 3072, 256, 256, 3072, 3072] + - [137, 113.0] + - - [256, 12, 1, 850, 256, 256, 850, 850] + - [163, 269.0] + - - [256, 3, 1, 850, 256, 256, 850, 850] + - [124, 69.0] + - - [256, 12, 1, 2852, 256, 256, 2852, 2852] + - [174, 475.0] + - - [256, 3, 1, 2852, 256, 256, 2852, 2852] + - [163, 109.0] + - - [256, 12, 1, 805, 256, 256, 805, 805] + - [124, 264.0] + - - [256, 3, 1, 805, 256, 256, 805, 805] + - [122, 70.0] + - - [256, 3, 1, 864, 256, 256, 864, 864] + - [137, 70.0] + - - [256, 3, 1, 768, 256, 256, 768, 768] + - [120, 67.0] + - - [256, 12, 1, 864, 256, 256, 864, 864] + - [164, 310.0] + - - [256, 12, 1, 768, 256, 256, 768, 768] + - [120, 270.0] + - - [256, 12, 1, 2904, 256, 256, 2904, 2904] + - [163, 428.0] + - - [256, 3, 1, 2904, 256, 256, 2904, 2904] + - [124, 108.0] + - - [256, 3, 1, 713, 256, 256, 713, 713] + - [174, 63.0] + - - [256, 12, 1, 888, 256, 256, 888, 888] + - [150, 279.0] + - - [256, 3, 1, 888, 256, 256, 888, 888] + - [124, 70.0] + - - [256, 12, 1, 713, 256, 256, 713, 713] + - [124, 247.0] + - - [256, 3, 1, 660, 256, 256, 660, 660] + - [174, 60.0] + - - [256, 3, 1, 672, 256, 256, 672, 672] + - [124, 61.0] + - - [256, 12, 1, 660, 256, 256, 660, 660] + - [124, 233.0] + - - [256, 3, 1, 726, 256, 256, 726, 726] + - [124, 63.0] + - - [256, 12, 1, 672, 256, 256, 672, 672] + - [174, 254.0] + - - [256, 3, 1, 247, 256, 256, 247, 247] + - [146, 32.0] + - - [256, 12, 1, 726, 256, 256, 726, 726] + - [163, 250.0] + - - [256, 3, 1, 216, 256, 256, 216, 216] + - [114, 36.0] + - - [256, 3, 1, 3400, 256, 256, 3400, 3400] + - [137, 112.0] + - - [256, 3, 1, 221, 256, 256, 221, 221] + - [114, 38.0] + - - [256, 12, 1, 3552, 256, 256, 3552, 3552] + - [137, 453.0] + - - [256, 3, 1, 3456, 256, 256, 3456, 3456] + - [124, 115.0] + - - [256, 3, 1, 204, 256, 256, 204, 204] + - [120, 27.0] + - - [256, 12, 1, 3400, 256, 256, 3400, 3400] + - [137, 444.0] + - - [256, 12, 1, 3456, 256, 256, 3456, 3456] + - [186, 458.0] + - - [256, 12, 1, 221, 256, 256, 221, 221] + - [172, 114.0] + - - [256, 3, 1, 3552, 256, 256, 3552, 3552] + - [124, 114.0] + - - [256, 3, 1, 228, 256, 256, 228, 228] + - [120, 29.0] + - - [256, 3, 1, 234, 256, 256, 234, 234] + - [133, 30.0] + - - [256, 12, 1, 234, 256, 256, 234, 234] + - [170, 118.0] + - - [256, 12, 1, 228, 256, 256, 228, 228] + - [171, 137.0] + - - [256, 3, 1, 252, 256, 256, 252, 252] + - [140, 39.0] + - - [256, 12, 1, 252, 256, 256, 252, 252] + - [137, 186.0] + - - [256, 12, 1, 247, 256, 256, 247, 247] + - [172, 189.0] + - - [128, 256, 1, 1444, 128, 128, 1444, 1444] + - [134, 2555.0] + - - [256, 128, 1, 25, 256, 256, 25, 25] + - [122, 398.0] + - - [256, 128, 1, 9, 256, 256, 9, 9] + - [149, 162.0] + - - [256, 256, 1, 1444, 256, 256, 1444, 1444] + - [114, 3694.0] + - - [512, 128, 1, 100, 512, 512, 100, 100] + - [164, 1196.0] + - - [64, 128, 1, 1444, 64, 64, 1444, 1444] + - [124, 827.0] + - - [81, 1024, 1, 1024, 81, 81, 1024, 1024] + - [156, 3153.0] + - - [81, 1000, 1, 1024, 81, 81, 1024, 1024] + - [180, 3114.0] + - - [1024, 20, 1, 1024, 1024, 1024, 1024, 1024] + - [180, 1409.0] + - - [2, 8, 1, 2048, 2, 2, 2048, 2048] + - [114, 2.0] + - - [2, 20, 1, 1024, 2, 2, 1024, 1024] + - [114, 4.0] + - - [2, 2, 1, 2560, 2, 2, 2560, 2560] + - [120, 1.0] +- null +- null +- DeviceEfficiency +... diff --git a/library/src/blas3/Tensile/Logic/asm_full/navi22_Cijk_Alik_Bljk_HB_GB.yaml b/library/src/blas3/Tensile/Logic/asm_full/navi22_Cijk_Alik_Bljk_HB_GB.yaml new file mode 100644 index 000000000..06c48df17 --- /dev/null +++ b/library/src/blas3/Tensile/Logic/asm_full/navi22_Cijk_Alik_Bljk_HB_GB.yaml @@ -0,0 +1,25055 @@ +--- +- {MinimumRequiredVersion: 4.28.0} +- navi22 +- gfx1031 +- [Device 73df] +- AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] +- - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 128 + LSPB: 256 + LVCA: 2 + LVCB: 1 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 256 + MacroTileA: 128 + MacroTileB: 256 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 0 + SolutionNameMin: Cijk_Alik_Bljk_HB_GB_MT128x256x8_SN_SU0_SUM0_TT8_16_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 2 + LVCB: 2 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 1 + SolutionNameMin: Cijk_Alik_Bljk_HB_GB_MT128x64x16_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 2 + LVCB: 2 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 2 + SolutionNameMin: Cijk_Alik_Bljk_HB_GB_MT128x128x16_SN_SU0_SUM0_TT8_16_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 2 + LVCB: 2 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 256 + MacroTile1: 128 + MacroTileA: 256 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 256 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 3 + SolutionNameMin: Cijk_Alik_Bljk_HB_GB_MT256x128x16_SN_SU0_SUM0_TT16_16_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [16, 16] + ThreadTile0: 16 + ThreadTile1: 16 + ThreadTileA: 16 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 128 + LSPB: 128 + LVCA: 2 + LVCB: 2 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 4 + SolutionNameMin: Cijk_Alik_Bljk_HB_GB_MT128x128x16_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 128 + LSPB: 128 + LVCA: 2 + LVCB: 2 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 256 + MacroTileA: 128 + MacroTileB: 256 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 5 + SolutionNameMin: Cijk_Alik_Bljk_HB_GB_MT128x256x16_SN_SU0_SUM0_TT8_16_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 6 + SolutionNameMin: Cijk_Alik_Bljk_HB_GB_MT128x128x32_SN_SU0_SUM0_TT8_16_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 2 + LVCB: 2 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 7 + SolutionNameMin: Cijk_Alik_Bljk_HB_GB_MT128x128x16_SN_SU32_SUM3_TT8_16_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 2 + LVCB: 2 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 256 + MacroTile1: 128 + MacroTileA: 256 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 256 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 8 + SolutionNameMin: Cijk_Alik_Bljk_HB_GB_MT256x128x16_SN_SU32_SUM3_TT16_16_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [16, 16] + ThreadTile0: 16 + ThreadTile1: 16 + ThreadTileA: 16 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 128 + LSPB: 128 + LVCA: 2 + LVCB: 2 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 9 + SolutionNameMin: Cijk_Alik_Bljk_HB_GB_MT128x128x16_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 128 + LSPB: 128 + LVCA: 2 + LVCB: 2 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 256 + MacroTileA: 128 + MacroTileB: 256 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 10 + SolutionNameMin: Cijk_Alik_Bljk_HB_GB_MT128x256x16_SN_SU32_SUM3_TT8_16_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 128 + LSPB: 128 + LVCA: 2 + LVCB: 2 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 256 + MacroTile1: 256 + MacroTileA: 256 + MacroTileB: 256 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 256 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 11 + SolutionNameMin: Cijk_Alik_Bljk_HB_GB_MT256x256x16_SN_SU32_SUM3_TT16_16_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [16, 16] + ThreadTile0: 16 + ThreadTile1: 16 + ThreadTileA: 16 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 12 + SolutionNameMin: Cijk_Alik_Bljk_HB_GB_MT128x128x32_SN_SU32_SUM3_TT8_16_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 2 + LVCB: 2 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 13 + SolutionNameMin: Cijk_Alik_Bljk_HB_GB_MT128x128x16_SN_SU0_SUM0_TT8_16_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 2 + LVCB: 2 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 256 + MacroTile1: 128 + MacroTileA: 256 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 256 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 14 + SolutionNameMin: Cijk_Alik_Bljk_HB_GB_MT256x128x16_SN_SU0_SUM0_TT16_16_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [16, 16] + ThreadTile0: 16 + ThreadTile1: 16 + ThreadTileA: 16 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 128 + LSPB: 128 + LVCA: 2 + LVCB: 2 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 15 + SolutionNameMin: Cijk_Alik_Bljk_HB_GB_MT128x128x16_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 128 + LSPB: 128 + LVCA: 2 + LVCB: 2 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 256 + MacroTileA: 128 + MacroTileB: 256 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 16 + SolutionNameMin: Cijk_Alik_Bljk_HB_GB_MT128x256x16_SN_SU0_SUM0_TT8_16_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 128 + LSPB: 128 + LVCA: 2 + LVCB: 2 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 256 + MacroTile1: 256 + MacroTileA: 256 + MacroTileB: 256 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 256 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 17 + SolutionNameMin: Cijk_Alik_Bljk_HB_GB_MT256x256x16_SN_SU0_SUM0_TT16_16_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [16, 16] + ThreadTile0: 16 + ThreadTile1: 16 + ThreadTileA: 16 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 2 + LVCB: 2 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 18 + SolutionNameMin: Cijk_Alik_Bljk_HB_GB_MT128x64x16_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 2 + LVCB: 2 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 19 + SolutionNameMin: Cijk_Alik_Bljk_HB_GB_MT128x128x16_SN_SU32_SUM3_TT8_16_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 2 + LVCB: 2 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 256 + MacroTile1: 128 + MacroTileA: 256 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 256 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 20 + SolutionNameMin: Cijk_Alik_Bljk_HB_GB_MT256x128x16_SN_SU32_SUM3_TT16_16_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [16, 16] + ThreadTile0: 16 + ThreadTile1: 16 + ThreadTileA: 16 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 128 + LSPB: 128 + LVCA: 2 + LVCB: 2 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 21 + SolutionNameMin: Cijk_Alik_Bljk_HB_GB_MT128x128x16_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 128 + LSPB: 128 + LVCA: 2 + LVCB: 2 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 256 + MacroTileA: 128 + MacroTileB: 256 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 22 + SolutionNameMin: Cijk_Alik_Bljk_HB_GB_MT128x256x16_SN_SU32_SUM3_TT8_16_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 128 + LSPB: 128 + LVCA: 2 + LVCB: 2 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 256 + MacroTile1: 256 + MacroTileA: 256 + MacroTileB: 256 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 256 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 23 + SolutionNameMin: Cijk_Alik_Bljk_HB_GB_MT256x256x16_SN_SU32_SUM3_TT16_16_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [16, 16] + ThreadTile0: 16 + ThreadTile1: 16 + ThreadTileA: 16 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 24 + SolutionNameMin: Cijk_Alik_Bljk_HB_GB_MT128x128x32_SN_SU32_SUM3_TT8_16_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 2 + LVCB: 2 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 25 + SolutionNameMin: Cijk_Alik_Bljk_HB_GB_MT128x128x16_SN_SU0_SUM0_TT8_16_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 2 + LVCB: 2 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 256 + MacroTile1: 128 + MacroTileA: 256 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 256 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 26 + SolutionNameMin: Cijk_Alik_Bljk_HB_GB_MT256x128x16_SN_SU0_SUM0_TT16_16_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [16, 16] + ThreadTile0: 16 + ThreadTile1: 16 + ThreadTileA: 16 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 128 + LSPB: 128 + LVCA: 2 + LVCB: 2 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 27 + SolutionNameMin: Cijk_Alik_Bljk_HB_GB_MT128x128x16_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 128 + LSPB: 128 + LVCA: 2 + LVCB: 2 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 256 + MacroTileA: 128 + MacroTileB: 256 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 28 + SolutionNameMin: Cijk_Alik_Bljk_HB_GB_MT128x256x16_SN_SU0_SUM0_TT8_16_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 128 + LSPB: 128 + LVCA: 2 + LVCB: 2 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 256 + MacroTile1: 256 + MacroTileA: 256 + MacroTileB: 256 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 256 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 29 + SolutionNameMin: Cijk_Alik_Bljk_HB_GB_MT256x256x16_SN_SU0_SUM0_TT16_16_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [16, 16] + ThreadTile0: 16 + ThreadTile1: 16 + ThreadTileA: 16 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 2 + LVCB: 2 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 30 + SolutionNameMin: Cijk_Alik_Bljk_HB_GB_MT128x64x16_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 2 + LVCB: 2 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 31 + SolutionNameMin: Cijk_Alik_Bljk_HB_GB_MT128x128x16_SN_SU32_SUM3_TT8_16_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 2 + LVCB: 2 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 256 + MacroTile1: 128 + MacroTileA: 256 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 256 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 32 + SolutionNameMin: Cijk_Alik_Bljk_HB_GB_MT256x128x16_SN_SU32_SUM3_TT16_16_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [16, 16] + ThreadTile0: 16 + ThreadTile1: 16 + ThreadTileA: 16 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 128 + LSPB: 128 + LVCA: 2 + LVCB: 2 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 33 + SolutionNameMin: Cijk_Alik_Bljk_HB_GB_MT128x128x16_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 128 + LSPB: 128 + LVCA: 2 + LVCB: 2 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 256 + MacroTileA: 128 + MacroTileB: 256 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 34 + SolutionNameMin: Cijk_Alik_Bljk_HB_GB_MT128x256x16_SN_SU32_SUM3_TT8_16_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 128 + LSPB: 128 + LVCA: 2 + LVCB: 2 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 256 + MacroTile1: 256 + MacroTileA: 256 + MacroTileB: 256 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 256 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 35 + SolutionNameMin: Cijk_Alik_Bljk_HB_GB_MT256x256x16_SN_SU32_SUM3_TT16_16_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [16, 16] + ThreadTile0: 16 + ThreadTile1: 16 + ThreadTileA: 16 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 28672 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 8192 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 20480 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 256 + MacroTileA: 128 + MacroTileB: 256 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 36 + SolutionNameMin: Cijk_Alik_Bljk_HB_GB_MT128x256x32_SN_SU32_SUM3_TT8_16_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 2 + LVCB: 2 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 37 + SolutionNameMin: Cijk_Alik_Bljk_HB_GB_MT64x64x16_SN_SU0_SUM0_TT8_8_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 2 + LVCB: 2 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 38 + SolutionNameMin: Cijk_Alik_Bljk_HB_GB_MT128x64x16_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 128 + LSPB: 128 + LVCA: 2 + LVCB: 2 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 39 + SolutionNameMin: Cijk_Alik_Bljk_HB_GB_MT128x128x16_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 128 + LSPB: 128 + LVCA: 2 + LVCB: 2 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 40 + SolutionNameMin: Cijk_Alik_Bljk_HB_GB_MT128x128x8_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 2 + LVCB: 2 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 41 + SolutionNameMin: Cijk_Alik_Bljk_HB_GB_MT64x64x16_SN_SU32_SUM3_TT8_8_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 2 + LVCB: 2 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 42 + SolutionNameMin: Cijk_Alik_Bljk_HB_GB_MT128x64x16_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 128 + LSPB: 128 + LVCA: 2 + LVCB: 2 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 43 + SolutionNameMin: Cijk_Alik_Bljk_HB_GB_MT128x128x16_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 44 + SolutionNameMin: Cijk_Alik_Bljk_HB_GB_MT128x64x32_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 2 + LVCB: 2 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 45 + SolutionNameMin: Cijk_Alik_Bljk_HB_GB_MT64x64x16_SN_SU0_SUM0_TT8_8_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 2 + LVCB: 2 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 46 + SolutionNameMin: Cijk_Alik_Bljk_HB_GB_MT128x64x16_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 128 + LSPB: 128 + LVCA: 2 + LVCB: 2 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 47 + SolutionNameMin: Cijk_Alik_Bljk_HB_GB_MT128x128x16_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 128 + LSPB: 128 + LVCA: 2 + LVCB: 2 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 48 + SolutionNameMin: Cijk_Alik_Bljk_HB_GB_MT128x128x8_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 2 + LVCB: 2 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 49 + SolutionNameMin: Cijk_Alik_Bljk_HB_GB_MT64x64x16_SN_SU32_SUM3_TT8_8_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 2 + LVCB: 2 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 50 + SolutionNameMin: Cijk_Alik_Bljk_HB_GB_MT128x64x16_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 128 + LSPB: 128 + LVCA: 2 + LVCB: 2 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 51 + SolutionNameMin: Cijk_Alik_Bljk_HB_GB_MT128x128x16_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 2 + LVCB: 2 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 52 + SolutionNameMin: Cijk_Alik_Bljk_HB_GB_MT64x64x16_SN_SU0_SUM0_TT8_8_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 2 + LVCB: 2 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 53 + SolutionNameMin: Cijk_Alik_Bljk_HB_GB_MT128x64x16_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 128 + LSPB: 128 + LVCA: 2 + LVCB: 2 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 54 + SolutionNameMin: Cijk_Alik_Bljk_HB_GB_MT128x128x16_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 128 + LSPB: 128 + LVCA: 2 + LVCB: 2 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 55 + SolutionNameMin: Cijk_Alik_Bljk_HB_GB_MT128x128x8_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 2 + LVCB: 2 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 56 + SolutionNameMin: Cijk_Alik_Bljk_HB_GB_MT64x64x16_SN_SU32_SUM3_TT8_8_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 2 + LVCB: 2 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 57 + SolutionNameMin: Cijk_Alik_Bljk_HB_GB_MT128x64x16_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 128 + LSPB: 128 + LVCA: 2 + LVCB: 2 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 58 + SolutionNameMin: Cijk_Alik_Bljk_HB_GB_MT128x128x16_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 59 + SolutionNameMin: Cijk_Alik_Bljk_HB_GB_MT64x32x8_SN_SU0_SUM0_TT4_4_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 60 + SolutionNameMin: Cijk_Alik_Bljk_HB_GB_MT32x32x8_SN_SU0_SUM0_TT2_2_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 61 + SolutionNameMin: Cijk_Alik_Bljk_HB_GB_MT64x64x8_SN_SU0_SUM0_TT4_4_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 62 + SolutionNameMin: Cijk_Alik_Bljk_HB_GB_MT16x16x16_SN_SU0_SUM0_TT2_2_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 63 + SolutionNameMin: Cijk_Alik_Bljk_HB_GB_MT64x64x16_SN_SU0_SUM0_TT4_4_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 64 + SolutionNameMin: Cijk_Alik_Bljk_HB_GB_MT16x16x32_SN_SU0_SUM0_TT2_2_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 16 + LSPB: 16 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 819 + LdsNumElementsAlignedA: 128 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 256 + LdsOffsetB: 128 + LdsOffsetB_Blk: 384 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 65 + SolutionNameMin: Cijk_Alik_Bljk_HB_GB_MT16x16x8_SN_SU32_SUM3_TT2_2_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 66 + SolutionNameMin: Cijk_Alik_Bljk_HB_GB_MT32x32x8_SN_SU32_SUM3_TT2_2_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 67 + SolutionNameMin: Cijk_Alik_Bljk_HB_GB_MT64x64x8_SN_SU32_SUM3_TT4_4_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 68 + SolutionNameMin: Cijk_Alik_Bljk_HB_GB_MT16x16x16_SN_SU32_SUM3_TT2_2_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 69 + SolutionNameMin: Cijk_Alik_Bljk_HB_GB_MT32x16x16_SN_SU32_SUM3_TT2_2_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 70 + SolutionNameMin: Cijk_Alik_Bljk_HB_GB_MT64x32x16_SN_SU32_SUM3_TT4_4_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 71 + SolutionNameMin: Cijk_Alik_Bljk_HB_GB_MT64x64x16_SN_SU32_SUM3_TT4_4_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 16 + LSPB: 16 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 819 + LdsNumElementsAlignedA: 128 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 256 + LdsOffsetB: 128 + LdsOffsetB_Blk: 384 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 72 + SolutionNameMin: Cijk_Alik_Bljk_HB_GB_MT16x16x8_SN_SU0_SUM0_TT2_2_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 16 + LSPB: 16 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 73 + SolutionNameMin: Cijk_Alik_Bljk_HB_GB_MT32x32x8_SN_SU0_SUM0_TT4_4_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 74 + SolutionNameMin: Cijk_Alik_Bljk_HB_GB_MT64x32x8_SN_SU0_SUM0_TT4_4_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 75 + SolutionNameMin: Cijk_Alik_Bljk_HB_GB_MT64x64x8_SN_SU0_SUM0_TT4_4_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 76 + SolutionNameMin: Cijk_Alik_Bljk_HB_GB_MT16x16x16_SN_SU0_SUM0_TT2_2_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 77 + SolutionNameMin: Cijk_Alik_Bljk_HB_GB_MT64x32x16_SN_SU0_SUM0_TT4_4_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 78 + SolutionNameMin: Cijk_Alik_Bljk_HB_GB_MT64x64x16_SN_SU0_SUM0_TT4_4_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 79 + SolutionNameMin: Cijk_Alik_Bljk_HB_GB_MT16x16x32_SN_SU0_SUM0_TT2_2_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 16 + LSPB: 16 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 819 + LdsNumElementsAlignedA: 128 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 256 + LdsOffsetB: 128 + LdsOffsetB_Blk: 384 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 80 + SolutionNameMin: Cijk_Alik_Bljk_HB_GB_MT16x16x8_SN_SU32_SUM3_TT2_2_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 16 + LSPB: 16 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 81 + SolutionNameMin: Cijk_Alik_Bljk_HB_GB_MT32x32x8_SN_SU32_SUM3_TT4_4_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 32 + LSPB: 16 + LVCA: 4 + LVCB: 8 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 896 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 82 + SolutionNameMin: Cijk_Alik_Bljk_HB_GB_MT32x16x8_SN_SU32_SUM3_TT2_2_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 83 + SolutionNameMin: Cijk_Alik_Bljk_HB_GB_MT64x32x8_SN_SU32_SUM3_TT4_4_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 84 + SolutionNameMin: Cijk_Alik_Bljk_HB_GB_MT32x32x8_SN_SU32_SUM3_TT2_2_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 85 + SolutionNameMin: Cijk_Alik_Bljk_HB_GB_MT64x64x8_SN_SU32_SUM3_TT4_4_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 86 + SolutionNameMin: Cijk_Alik_Bljk_HB_GB_MT16x16x16_SN_SU32_SUM3_TT2_2_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 87 + SolutionNameMin: Cijk_Alik_Bljk_HB_GB_MT64x32x16_SN_SU32_SUM3_TT4_4_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 88 + SolutionNameMin: Cijk_Alik_Bljk_HB_GB_MT64x64x16_SN_SU32_SUM3_TT4_4_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 89 + SolutionNameMin: Cijk_Alik_Bljk_HB_GB_MT64x32x8_SN_SU0_SUM0_TT4_4_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 90 + SolutionNameMin: Cijk_Alik_Bljk_HB_GB_MT64x64x8_SN_SU0_SUM0_TT4_4_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 91 + SolutionNameMin: Cijk_Alik_Bljk_HB_GB_MT16x16x16_SN_SU0_SUM0_TT2_2_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 16 + LSPB: 16 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 819 + LdsNumElementsAlignedA: 128 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 256 + LdsOffsetB: 128 + LdsOffsetB_Blk: 384 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 92 + SolutionNameMin: Cijk_Alik_Bljk_HB_GB_MT16x16x8_SN_SU32_SUM3_TT2_2_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 32 + LSPB: 16 + LVCA: 4 + LVCB: 8 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 896 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 93 + SolutionNameMin: Cijk_Alik_Bljk_HB_GB_MT32x16x8_SN_SU32_SUM3_TT2_2_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 94 + SolutionNameMin: Cijk_Alik_Bljk_HB_GB_MT64x32x8_SN_SU32_SUM3_TT4_4_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 95 + SolutionNameMin: Cijk_Alik_Bljk_HB_GB_MT32x32x8_SN_SU32_SUM3_TT2_2_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 96 + SolutionNameMin: Cijk_Alik_Bljk_HB_GB_MT64x64x8_SN_SU32_SUM3_TT4_4_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 97 + SolutionNameMin: Cijk_Alik_Bljk_HB_GB_MT16x16x16_SN_SU32_SUM3_TT2_2_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 98 + SolutionNameMin: Cijk_Alik_Bljk_HB_GB_MT32x16x16_SN_SU32_SUM3_TT2_2_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 99 + SolutionNameMin: Cijk_Alik_Bljk_HB_GB_MT64x32x16_SN_SU32_SUM3_TT4_4_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 100 + SolutionNameMin: Cijk_Alik_Bljk_HB_GB_MT16x16x32_SN_SU32_SUM3_TT2_2_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 32 + LSPB: 16 + LVCA: 4 + LVCB: 8 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1664 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 101 + SolutionNameMin: Cijk_Alik_Bljk_HB_GB_MT64x16x8_SN_SU0_SUM0_TT4_2_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3328 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 102 + SolutionNameMin: Cijk_Alik_Bljk_HB_GB_MT64x16x16_SN_SU32_SUM3_TT4_2_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3328 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 103 + SolutionNameMin: Cijk_Alik_Bljk_HB_GB_MT64x16x16_SN_SU32_SUM3_TT4_2_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3328 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 2 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 104 + SolutionNameMin: Cijk_Alik_Bljk_HB_GB_MT64x16x16_SN_SU32_SUM3_TT4_2_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 +- [2, 3, 0, 1] +- - - [2944, 4288, 1, 1280, 2944, 2944, 1280, 1280] + - [5, 23606.0] + - - [2368, 5888, 1, 256, 2368, 2368, 256, 256] + - [5, 22741.0] + - - [5888, 1856, 1, 256, 5888, 5888, 256, 256] + - [13, 21945.0] + - - [704, 6784, 1, 256, 704, 704, 256, 256] + - [19, 18950.0] + - - [512, 24000, 1, 1536, 512, 512, 1536, 1536] + - [8, 23415.0] + - - [5888, 1856, 1, 3328, 5888, 5888, 3328, 3328] + - [32, 22123.0] + - - [5056, 704, 1, 256, 5056, 5056, 256, 256] + - [33, 16874.0] + - - [5888, 2944, 1, 3328, 5888, 5888, 3328, 3328] + - [14, 24358.0] + - - [1856, 4288, 1, 256, 1856, 1856, 256, 256] + - [19, 20236.0] + - - [5056, 5056, 1, 3328, 5056, 5056, 3328, 3328] + - [5, 24366.0] + - - [1408, 5888, 1, 1280, 1408, 1408, 1280, 1280] + - [34, 23653.0] + - - [1024, 3584, 1, 3328, 1024, 1024, 3328, 3328] + - [16, 22299.0] + - - [512, 48000, 1, 2048, 512, 512, 2048, 2048] + - [35, 24361.0] + - - [448, 3584, 1, 3328, 448, 448, 3328, 3328] + - [5, 18980.0] + - - [5888, 1408, 1, 1280, 5888, 5888, 1280, 1280] + - [32, 23094.0] + - - [1024, 2368, 1, 256, 1024, 1024, 256, 256] + - [18, 17695.0] + - - [5056, 6784, 1, 1280, 5056, 5056, 1280, 1280] + - [32, 24259.0] + - - [5056, 5056, 1, 1280, 5056, 5056, 1280, 1280] + - [34, 24224.0] + - - [448, 5056, 1, 256, 448, 448, 256, 256] + - [19, 17095.0] + - - [6784, 448, 1, 256, 6784, 6784, 256, 256] + - [27, 15642.0] + - - [1760, 6400, 1, 1760, 1760, 1760, 1760, 1760] + - [25, 24086.0] + - - [5888, 704, 1, 1280, 5888, 5888, 1280, 1280] + - [5, 20876.0] + - - [6784, 4288, 1, 3328, 6784, 6784, 3328, 3328] + - [5, 24265.0] + - - [1856, 2368, 1, 3328, 1856, 1856, 3328, 3328] + - [5, 19992.0] + - - [5888, 2944, 1, 1280, 5888, 5888, 1280, 1280] + - [32, 24249.0] + - - [5888, 1024, 1, 256, 5888, 5888, 256, 256] + - [13, 22139.0] + - - [16384, 3200, 1, 4096, 16384, 16384, 4096, 4096] + - [36, 20783.0] + - - [1408, 2944, 1, 256, 1408, 1408, 256, 256] + - [27, 16442.0] + - - [6784, 5056, 1, 3328, 6784, 6784, 3328, 3328] + - [5, 24692.0] + - - [5056, 5056, 1, 256, 5056, 5056, 256, 256] + - [13, 22858.0] + - - [1024, 3584, 1, 1280, 1024, 1024, 1280, 1280] + - [5, 22121.0] + - - [2368, 2944, 1, 1280, 2368, 2368, 1280, 1280] + - [13, 22439.0] + - - [6784, 6784, 1, 1280, 6784, 6784, 1280, 1280] + - [13, 24556.0] + - - [1408, 4288, 1, 1280, 1408, 1408, 1280, 1280] + - [16, 22033.0] + - - [3584, 4288, 1, 1280, 3584, 3584, 1280, 1280] + - [10, 23975.0] + - - [2368, 704, 1, 1280, 2368, 2368, 1280, 1280] + - [10, 18332.0] + - - [5056, 4288, 1, 3328, 5056, 5056, 3328, 3328] + - [5, 24268.0] + - - [3584, 2368, 1, 3328, 3584, 3584, 3328, 3328] + - [16, 22439.0] + - - [6784, 448, 1, 1280, 6784, 6784, 1280, 1280] + - [25, 19103.0] + - - [4288, 2944, 1, 256, 4288, 4288, 256, 256] + - [31, 22126.0] + - - [6144, 24000, 1, 2560, 6144, 6144, 2560, 2560] + - [11, 23953.0] + - - [5056, 2368, 1, 1280, 5056, 5056, 1280, 1280] + - [32, 22984.0] + - - [448, 3584, 1, 1280, 448, 448, 1280, 1280] + - [5, 18334.0] + - - [6784, 5888, 1, 256, 6784, 6784, 256, 256] + - [28, 24440.0] + - - [1024, 1408, 1, 256, 1024, 1024, 256, 256] + - [18, 16963.0] + - - [2368, 2368, 1, 3328, 2368, 2368, 3328, 3328] + - [5, 20969.0] + - - [5056, 704, 1, 3328, 5056, 5056, 3328, 3328] + - [5, 21510.0] + - - [1408, 1856, 1, 256, 1408, 1408, 256, 256] + - [27, 16144.0] + - - [3584, 2368, 1, 1280, 3584, 3584, 1280, 1280] + - [13, 22525.0] + - - [704, 5888, 1, 256, 704, 704, 256, 256] + - [31, 19322.0] + - - [2560, 1600, 1, 2560, 2560, 2560, 2560, 2560] + - [6, 19277.0] + - - [6144, 5984, 1, 2048, 6144, 6144, 2048, 2048] + - [10, 23849.0] + - - [3584, 704, 1, 3328, 3584, 3584, 3328, 3328] + - [34, 18518.0] + - - [1408, 1408, 1, 256, 1408, 1408, 256, 256] + - [4, 16838.0] + - - [448, 4288, 1, 256, 448, 448, 256, 256] + - [15, 15672.0] + - - [704, 2368, 1, 1280, 704, 704, 1280, 1280] + - [19, 18108.0] + - - [1856, 2368, 1, 1280, 1856, 1856, 1280, 1280] + - [13, 20000.0] + - - [1408, 1024, 1, 1280, 1408, 1408, 1280, 1280] + - [31, 18492.0] + - - [6784, 704, 1, 256, 6784, 6784, 256, 256] + - [13, 18649.0] + - - [2048, 1600, 1, 512, 2048, 2048, 512, 512] + - [31, 19013.0] + - - [2048, 7000, 1, 2048, 2048, 2048, 2048, 2048] + - [11, 21989.0] + - - [1408, 3584, 1, 256, 1408, 1408, 256, 256] + - [28, 20756.0] + - - [3584, 4288, 1, 3328, 3584, 3584, 3328, 3328] + - [16, 24253.0] + - - [5888, 1856, 1, 1280, 5888, 5888, 1280, 1280] + - [13, 22717.0] + - - [5056, 1024, 1, 3328, 5056, 5056, 3328, 3328] + - [5, 23881.0] + - - [2368, 3584, 1, 1280, 2368, 2368, 1280, 1280] + - [22, 22630.0] + - - [2944, 3584, 1, 3328, 2944, 2944, 3328, 3328] + - [34, 23676.0] + - - [6784, 2944, 1, 256, 6784, 6784, 256, 256] + - [13, 23890.0] + - - [4288, 2368, 1, 3328, 4288, 4288, 3328, 3328] + - [26, 22390.0] + - - [1856, 2368, 1, 256, 1856, 1856, 256, 256] + - [5, 17657.0] + - - [3584, 6784, 1, 3328, 3584, 3584, 3328, 3328] + - [26, 24478.0] + - - [5056, 4288, 1, 1280, 5056, 5056, 1280, 1280] + - [5, 24106.0] + - - [6784, 1856, 1, 3328, 6784, 6784, 3328, 3328] + - [14, 21714.0] + - - [1408, 5056, 1, 1280, 1408, 1408, 1280, 1280] + - [22, 23707.0] + - - [6784, 5888, 1, 3328, 6784, 6784, 3328, 3328] + - [16, 25037.0] + - - [8448, 12000, 1, 2816, 8448, 8448, 2816, 2816] + - [8, 25017.0] + - - [4096, 800, 1, 1024, 4096, 4096, 1024, 1024] + - [21, 16582.0] + - - [8192, 3200, 1, 2048, 8192, 8192, 2048, 2048] + - [12, 20977.0] + - - [2368, 5056, 1, 1280, 2368, 2368, 1280, 1280] + - [22, 23553.0] + - - [1024, 5056, 1, 1280, 1024, 1024, 1280, 1280] + - [5, 23052.0] + - - [4288, 1024, 1, 256, 4288, 4288, 256, 256] + - [10, 19583.0] + - - [2368, 1408, 1, 256, 2368, 2368, 256, 256] + - [4, 17216.0] + - - [5888, 448, 1, 1280, 5888, 5888, 1280, 1280] + - [13, 18285.0] + - - [704, 5888, 1, 3328, 704, 704, 3328, 3328] + - [5, 21643.0] + - - [1024, 6784, 1, 1280, 1024, 1024, 1280, 1280] + - [16, 23049.0] + - - [3584, 2944, 1, 1280, 3584, 3584, 1280, 1280] + - [22, 23167.0] + - - [512, 24000, 1, 2048, 512, 512, 2048, 2048] + - [8, 23575.0] + - - [1408, 5056, 1, 3328, 1408, 1408, 3328, 3328] + - [16, 23609.0] + - - [1856, 1856, 1, 3328, 1856, 1856, 3328, 3328] + - [16, 20896.0] + - - [2560, 800, 1, 2560, 2560, 2560, 2560, 2560] + - [10, 18012.0] + - - [2368, 2368, 1, 256, 2368, 2368, 256, 256] + - [25, 19115.0] + - - [4288, 4288, 1, 1280, 4288, 4288, 1280, 1280] + - [5, 23978.0] + - - [5888, 1024, 1, 1280, 5888, 5888, 1280, 1280] + - [13, 22636.0] + - - [512, 48000, 1, 2560, 512, 512, 2560, 2560] + - [23, 24327.0] + - - [704, 6784, 1, 3328, 704, 704, 3328, 3328] + - [3, 21487.0] + - - [2560, 6400, 1, 2560, 2560, 2560, 2560, 2560] + - [10, 24255.0] + - - [5056, 1024, 1, 1280, 5056, 5056, 1280, 1280] + - [10, 23392.0] + - - [448, 5888, 1, 3328, 448, 448, 3328, 3328] + - [5, 19175.0] + - - [1024, 2944, 1, 1280, 1024, 1024, 1280, 1280] + - [28, 20467.0] + - - [5056, 5888, 1, 1280, 5056, 5056, 1280, 1280] + - [34, 24608.0] + - - [448, 6784, 1, 256, 448, 448, 256, 256] + - [19, 17555.0] + - - [3584, 5888, 1, 256, 3584, 3584, 256, 256] + - [19, 23761.0] + - - [2944, 3584, 1, 256, 2944, 2944, 256, 256] + - [25, 22821.0] + - - [6784, 1024, 1, 3328, 6784, 6784, 3328, 3328] + - [5, 23769.0] + - - [6784, 2944, 1, 3328, 6784, 6784, 3328, 3328] + - [26, 23742.0] + - - [2944, 5056, 1, 3328, 2944, 2944, 3328, 3328] + - [5, 24521.0] + - - [6784, 2368, 1, 1280, 6784, 6784, 1280, 1280] + - [13, 23146.0] + - - [4288, 5888, 1, 1280, 4288, 4288, 1280, 1280] + - [22, 24051.0] + - - [4288, 4288, 1, 256, 4288, 4288, 256, 256] + - [19, 22921.0] + - - [4288, 1856, 1, 1280, 4288, 4288, 1280, 1280] + - [20, 22321.0] + - - [1856, 2944, 1, 3328, 1856, 1856, 3328, 3328] + - [16, 22662.0] + - - [256, 6784, 1, 3328, 256, 256, 3328, 3328] + - [5, 20412.0] + - - [5056, 1024, 1, 256, 5056, 5056, 256, 256] + - [28, 19806.0] + - - [5056, 1856, 1, 3328, 5056, 5056, 3328, 3328] + - [26, 22842.0] + - - [1856, 1408, 1, 256, 1856, 1856, 256, 256] + - [27, 15966.0] + - - [4096, 7000, 1, 4096, 4096, 4096, 4096, 4096] + - [35, 22078.0] + - - [5056, 256, 1, 3328, 5056, 5056, 3328, 3328] + - [4, 20007.0] + - - [1024, 5888, 1, 1280, 1024, 1024, 1280, 1280] + - [31, 22720.0] + - - [6144, 24000, 1, 2048, 6144, 6144, 2048, 2048] + - [8, 21511.0] + - - [5056, 3584, 1, 256, 5056, 5056, 256, 256] + - [10, 22764.0] + - - [1856, 1024, 1, 1280, 1856, 1856, 1280, 1280] + - [10, 21051.0] + - - [1856, 1856, 1, 1280, 1856, 1856, 1280, 1280] + - [16, 20317.0] + - - [4096, 400, 1, 1024, 4096, 4096, 1024, 1024] + - [19, 15313.0] + - - [3072, 24000, 1, 1024, 3072, 3072, 1024, 1024] + - [10, 24358.0] + - - [1856, 1024, 1, 3328, 1856, 1856, 3328, 3328] + - [10, 20840.0] + - - [5888, 5888, 1, 3328, 5888, 5888, 3328, 3328] + - [26, 25029.0] + - - [6784, 1024, 1, 256, 6784, 6784, 256, 256] + - [16, 21525.0] + - - [5056, 5888, 1, 3328, 5056, 5056, 3328, 3328] + - [26, 24679.0] + - - [1856, 1024, 1, 256, 1856, 1856, 256, 256] + - [15, 16882.0] + - - [512, 48000, 1, 1536, 512, 512, 1536, 1536] + - [20, 24508.0] + - - [5056, 1408, 1, 3328, 5056, 5056, 3328, 3328] + - [3, 23538.0] + - - [8448, 5984, 1, 2816, 8448, 8448, 2816, 2816] + - [20, 24933.0] + - - [4288, 1024, 1, 3328, 4288, 4288, 3328, 3328] + - [5, 22466.0] + - - [1024, 24000, 1, 2560, 1024, 1024, 2560, 2560] + - [11, 24141.0] + - - [2944, 1408, 1, 3328, 2944, 2944, 3328, 3328] + - [28, 21321.0] + - - [2944, 4288, 1, 3328, 2944, 2944, 3328, 3328] + - [34, 23971.0] + - - [5056, 2944, 1, 256, 5056, 5056, 256, 256] + - [19, 23125.0] + - - [2368, 1856, 1, 256, 2368, 2368, 256, 256] + - [25, 19213.0] + - - [1408, 3584, 1, 3328, 1408, 1408, 3328, 3328] + - [16, 23304.0] + - - [2368, 6784, 1, 256, 2368, 2368, 256, 256] + - [13, 22552.0] + - - [5056, 1408, 1, 1280, 5056, 5056, 1280, 1280] + - [13, 23015.0] + - - [704, 3584, 1, 1280, 704, 704, 1280, 1280] + - [5, 18300.0] + - - [1408, 5888, 1, 3328, 1408, 1408, 3328, 3328] + - [16, 23738.0] + - - [1856, 5056, 1, 256, 1856, 1856, 256, 256] + - [31, 20806.0] + - - [6784, 6784, 1, 256, 6784, 6784, 256, 256] + - [13, 24241.0] + - - [2368, 4288, 1, 1280, 2368, 2368, 1280, 1280] + - [10, 22409.0] + - - [3584, 1856, 1, 1280, 3584, 3584, 1280, 1280] + - [13, 22177.0] + - - [5888, 5056, 1, 256, 5888, 5888, 256, 256] + - [13, 23496.0] + - - [8448, 48000, 1, 2816, 8448, 8448, 2816, 2816] + - [26, 25193.0] + - - [3584, 448, 1, 256, 3584, 3584, 256, 256] + - [30, 10755.0] + - - [3584, 3584, 1, 1280, 3584, 3584, 1280, 1280] + - [5, 24228.0] + - - [256, 6784, 1, 256, 256, 256, 256, 256] + - [7, 17657.0] + - - [1856, 3584, 1, 3328, 1856, 1856, 3328, 3328] + - [5, 22535.0] + - - [5056, 256, 1, 1280, 5056, 5056, 1280, 1280] + - [22, 19523.0] + - - [3584, 3584, 1, 256, 3584, 3584, 256, 256] + - [19, 22610.0] + - - [6784, 4288, 1, 1280, 6784, 6784, 1280, 1280] + - [5, 24130.0] + - - [704, 5056, 1, 256, 704, 704, 256, 256] + - [15, 17867.0] + - - [2944, 2368, 1, 1280, 2944, 2944, 1280, 1280] + - [25, 22314.0] + - - [6784, 3584, 1, 256, 6784, 6784, 256, 256] + - [13, 23879.0] + - - [2944, 2944, 1, 3328, 2944, 2944, 3328, 3328] + - [5, 23341.0] + - - [5056, 6784, 1, 256, 5056, 5056, 256, 256] + - [19, 23678.0] + - - [1408, 4288, 1, 3328, 1408, 1408, 3328, 3328] + - [28, 22482.0] + - - [6784, 256, 1, 1280, 6784, 6784, 1280, 1280] + - [34, 19606.0] + - - [2368, 704, 1, 3328, 2368, 2368, 3328, 3328] + - [22, 19349.0] + - - [3584, 6784, 1, 256, 3584, 3584, 256, 256] + - [13, 23781.0] + - - [5056, 1856, 1, 256, 5056, 5056, 256, 256] + - [19, 21887.0] + - - [4608, 5984, 1, 1536, 4608, 4608, 1536, 1536] + - [10, 23879.0] + - - [1760, 3200, 1, 1760, 1760, 1760, 1760, 1760] + - [13, 22987.0] + - - [4096, 1600, 1, 1024, 4096, 4096, 1024, 1024] + - [10, 19639.0] + - - [704, 4288, 1, 256, 704, 704, 256, 256] + - [19, 16706.0] + - - [1408, 6784, 1, 1280, 1408, 1408, 1280, 1280] + - [22, 23725.0] + - - [7680, 24000, 1, 2560, 7680, 7680, 2560, 2560] + - [35, 25035.0] + - - [4608, 48000, 1, 1536, 4608, 4608, 1536, 1536] + - [23, 24332.0] + - - [6144, 48000, 1, 2048, 6144, 6144, 2048, 2048] + - [35, 22590.0] + - - [1024, 24000, 1, 1536, 1024, 1024, 1536, 1536] + - [8, 23843.0] + - - [5056, 2368, 1, 3328, 5056, 5056, 3328, 3328] + - [14, 23541.0] + - - [2944, 4288, 1, 256, 2944, 2944, 256, 256] + - [25, 22581.0] + - - [1408, 3584, 1, 1280, 1408, 1408, 1280, 1280] + - [28, 23243.0] + - - [8192, 1600, 1, 2048, 8192, 8192, 2048, 2048] + - [10, 21291.0] + - - [512, 24000, 1, 2560, 512, 512, 2560, 2560] + - [8, 23185.0] + - - [2368, 6784, 1, 3328, 2368, 2368, 3328, 3328] + - [5, 23530.0] + - - [5056, 704, 1, 1280, 5056, 5056, 1280, 1280] + - [16, 20242.0] + - - [1856, 4288, 1, 3328, 1856, 1856, 3328, 3328] + - [5, 23006.0] + - - [1408, 5888, 1, 256, 1408, 1408, 256, 256] + - [28, 22107.0] + - - [3584, 704, 1, 1280, 3584, 3584, 1280, 1280] + - [13, 18018.0] + - - [3584, 448, 1, 3328, 3584, 3584, 3328, 3328] + - [22, 18533.0] + - - [704, 2368, 1, 3328, 704, 704, 3328, 3328] + - [16, 19347.0] + - - [448, 5056, 1, 3328, 448, 448, 3328, 3328] + - [5, 20337.0] + - - [4288, 448, 1, 256, 4288, 4288, 256, 256] + - [30, 14602.0] + - - [448, 5888, 1, 256, 448, 448, 256, 256] + - [13, 15697.0] + - - [2048, 3200, 1, 512, 2048, 2048, 512, 512] + - [8, 21157.0] + - - [5888, 2368, 1, 256, 5888, 5888, 256, 256] + - [13, 21820.0] + - - [6784, 704, 1, 3328, 6784, 6784, 3328, 3328] + - [5, 21917.0] + - - [1408, 2944, 1, 3328, 1408, 1408, 3328, 3328] + - [22, 21788.0] + - - [4608, 12000, 1, 1536, 4608, 4608, 1536, 1536] + - [10, 24675.0] + - - [2368, 704, 1, 256, 2368, 2368, 256, 256] + - [4, 14546.0] + - - [3584, 2368, 1, 256, 3584, 3584, 256, 256] + - [31, 20919.0] + - - [5888, 5056, 1, 1280, 5888, 5888, 1280, 1280] + - [22, 24594.0] + - - [8448, 24000, 1, 2816, 8448, 8448, 2816, 2816] + - [5, 25062.0] + - - [3584, 3584, 1, 3328, 3584, 3584, 3328, 3328] + - [11, 24188.0] + - - [5888, 6784, 1, 256, 5888, 5888, 256, 256] + - [13, 24289.0] + - - [4288, 2944, 1, 3328, 4288, 4288, 3328, 3328] + - [32, 23765.0] + - - [256, 5056, 1, 1280, 256, 256, 1280, 1280] + - [10, 19211.0] + - - [6784, 5888, 1, 1280, 6784, 6784, 1280, 1280] + - [34, 25001.0] + - - [2048, 800, 1, 512, 2048, 2048, 512, 512] + - [9, 16058.0] + - - [5888, 4288, 1, 1280, 5888, 5888, 1280, 1280] + - [5, 24045.0] + - - [1024, 24000, 1, 2048, 1024, 1024, 2048, 2048] + - [35, 23043.0] + - - [1408, 1856, 1, 1280, 1408, 1408, 1280, 1280] + - [5, 18361.0] + - - [5888, 448, 1, 3328, 5888, 5888, 3328, 3328] + - [28, 19017.0] + - - [704, 5888, 1, 1280, 704, 704, 1280, 1280] + - [16, 21008.0] + - - [1024, 6784, 1, 3328, 1024, 1024, 3328, 3328] + - [16, 23081.0] + - - [704, 2944, 1, 1280, 704, 704, 1280, 1280] + - [13, 18741.0] + - - [5056, 2944, 1, 3328, 5056, 5056, 3328, 3328] + - [14, 23833.0] + - - [1408, 1408, 1, 3328, 1408, 1408, 3328, 3328] + - [19, 18238.0] + - - [448, 4288, 1, 1280, 448, 448, 1280, 1280] + - [31, 18033.0] + - - [3584, 704, 1, 256, 3584, 3584, 256, 256] + - [18, 16295.0] + - - [3584, 1408, 1, 3328, 3584, 3584, 3328, 3328] + - [14, 22719.0] + - - [2368, 1024, 1, 1280, 2368, 2368, 1280, 1280] + - [10, 20530.0] + - - [2944, 6784, 1, 1280, 2944, 2944, 1280, 1280] + - [25, 24218.0] + - - [1856, 6784, 1, 256, 1856, 1856, 256, 256] + - [13, 22394.0] + - - [4288, 448, 1, 3328, 4288, 4288, 3328, 3328] + - [2, 17741.0] + - - [4288, 3584, 1, 1280, 4288, 4288, 1280, 1280] + - [5, 24235.0] + - - [6144, 12000, 1, 2048, 6144, 6144, 2048, 2048] + - [35, 24224.0] + - - [8192, 800, 1, 2048, 8192, 8192, 2048, 2048] + - [10, 17834.0] + - - [5888, 1024, 1, 3328, 5888, 5888, 3328, 3328] + - [28, 22424.0] + - - [704, 6784, 1, 1280, 704, 704, 1280, 1280] + - [26, 20867.0] + - - [704, 5056, 1, 1280, 704, 704, 1280, 1280] + - [5, 21185.0] + - - [2944, 1856, 1, 256, 2944, 2944, 256, 256] + - [13, 19230.0] + - - [3584, 5056, 1, 256, 3584, 3584, 256, 256] + - [19, 22935.0] + - - [5888, 5056, 1, 3328, 5888, 5888, 3328, 3328] + - [5, 24701.0] + - - [3584, 6784, 1, 1280, 3584, 3584, 1280, 1280] + - [5, 24219.0] + - - [1856, 5888, 1, 256, 1856, 1856, 256, 256] + - [13, 22179.0] + - - [4288, 4288, 1, 3328, 4288, 4288, 3328, 3328] + - [16, 24089.0] + - - [4288, 1408, 1, 1280, 4288, 4288, 1280, 1280] + - [13, 22123.0] + - - [4288, 2368, 1, 256, 4288, 4288, 256, 256] + - [19, 21734.0] + - - [2944, 5056, 1, 1280, 2944, 2944, 1280, 1280] + - [10, 24322.0] + - - [6784, 2368, 1, 3328, 6784, 6784, 3328, 3328] + - [5, 22718.0] + - - [4288, 1856, 1, 3328, 4288, 4288, 3328, 3328] + - [3, 22425.0] + - - [1856, 2944, 1, 1280, 1856, 1856, 1280, 1280] + - [16, 22224.0] + - - [2048, 1600, 1, 2048, 2048, 2048, 2048, 2048] + - [11, 19203.0] + - - [4288, 6784, 1, 3328, 4288, 4288, 3328, 3328] + - [14, 24250.0] + - - [3584, 1024, 1, 1280, 3584, 3584, 1280, 1280] + - [5, 20515.0] + - - [1024, 4288, 1, 256, 1024, 1024, 256, 256] + - [18, 17476.0] + - - [5888, 3584, 1, 3328, 5888, 5888, 3328, 3328] + - [5, 24383.0] + - - [5056, 3584, 1, 3328, 5056, 5056, 3328, 3328] + - [5, 24639.0] + - - [2368, 1408, 1, 1280, 2368, 2368, 1280, 1280] + - [5, 19762.0] + - - [5056, 2944, 1, 1280, 5056, 5056, 1280, 1280] + - [31, 23976.0] + - - [1024, 6784, 1, 256, 1024, 1024, 256, 256] + - [7, 22263.0] + - - [3584, 2944, 1, 256, 3584, 3584, 256, 256] + - [19, 22949.0] + - - [3584, 1408, 1, 1280, 3584, 3584, 1280, 1280] + - [26, 22204.0] + - - [5056, 6784, 1, 3328, 5056, 5056, 3328, 3328] + - [14, 24729.0] + - - [3584, 4288, 1, 256, 3584, 3584, 256, 256] + - [19, 22083.0] + - - [1856, 6784, 1, 3328, 1856, 1856, 3328, 3328] + - [5, 22790.0] + - - [5056, 1408, 1, 256, 5056, 5056, 256, 256] + - [31, 19865.0] + - - [3584, 1024, 1, 256, 3584, 3584, 256, 256] + - [31, 19428.0] + - - [5888, 5888, 1, 256, 5888, 5888, 256, 256] + - [22, 24386.0] + - - [4288, 1024, 1, 1280, 4288, 4288, 1280, 1280] + - [5, 22637.0] + - - [448, 6784, 1, 3328, 448, 448, 3328, 3328] + - [5, 18503.0] + - - [2944, 1408, 1, 1280, 2944, 2944, 1280, 1280] + - [25, 21244.0] + - - [2944, 1856, 1, 3328, 2944, 2944, 3328, 3328] + - [14, 21912.0] + - - [3584, 5888, 1, 1280, 3584, 3584, 1280, 1280] + - [5, 24278.0] + - - [6784, 1856, 1, 1280, 6784, 6784, 1280, 1280] + - [13, 23008.0] + - - [5888, 256, 1, 3328, 5888, 5888, 3328, 3328] + - [2, 19908.0] + - - [1856, 5888, 1, 3328, 1856, 1856, 3328, 3328] + - [5, 23084.0] + - - [3584, 1408, 1, 256, 3584, 3584, 256, 256] + - [31, 20545.0] + - - [704, 3584, 1, 3328, 704, 704, 3328, 3328] + - [5, 18361.0] + - - [4096, 3200, 1, 1024, 4096, 4096, 1024, 1024] + - [10, 22777.0] + - - [5056, 448, 1, 1280, 5056, 5056, 1280, 1280] + - [5, 19272.0] + - - [3584, 1856, 1, 3328, 3584, 3584, 3328, 3328] + - [26, 22092.0] + - - [2944, 1024, 1, 256, 2944, 2944, 256, 256] + - [28, 18805.0] + - - [2368, 4288, 1, 3328, 2368, 2368, 3328, 3328] + - [5, 22479.0] + - - [1024, 1408, 1, 1280, 1024, 1024, 1280, 1280] + - [13, 18422.0] + - - [6784, 5056, 1, 256, 6784, 6784, 256, 256] + - [13, 23710.0] + - - [3584, 5056, 1, 3328, 3584, 3584, 3328, 3328] + - [29, 22291.0] + - - [4288, 5888, 1, 256, 4288, 4288, 256, 256] + - [16, 23282.0] + - - [2944, 6784, 1, 256, 2944, 2944, 256, 256] + - [13, 23845.0] + - - [2368, 2368, 1, 1280, 2368, 2368, 1280, 1280] + - [13, 21053.0] + - - [1856, 3584, 1, 1280, 1856, 1856, 1280, 1280] + - [13, 22578.0] + - - [5056, 3584, 1, 1280, 5056, 5056, 1280, 1280] + - [22, 24555.0] + - - [256, 5888, 1, 256, 256, 256, 256, 256] + - [15, 17212.0] + - - [1856, 1408, 1, 3328, 1856, 1856, 3328, 3328] + - [28, 18941.0] + - - [1024, 4288, 1, 3328, 1024, 1024, 3328, 3328] + - [5, 22191.0] + - - [2944, 2368, 1, 3328, 2944, 2944, 3328, 3328] + - [16, 21805.0] + - - [704, 4288, 1, 3328, 704, 704, 3328, 3328] + - [28, 18469.0] + - - [1024, 1856, 1, 1280, 1024, 1024, 1280, 1280] + - [31, 20002.0] + - - [2048, 6400, 1, 2048, 2048, 2048, 2048, 2048] + - [11, 23990.0] + - - [512, 48000, 1, 2816, 512, 512, 2816, 2816] + - [26, 24636.0] + - - [5124, 9124, 1, 2560, 5124, 5124, 2560, 2560] + - [10, 23979.0] + - - [1024, 5888, 1, 256, 1024, 1024, 256, 256] + - [7, 20635.0] + - - [1408, 2368, 1, 256, 1408, 1408, 256, 256] + - [15, 18137.0] + - - [2944, 704, 1, 3328, 2944, 2944, 3328, 3328] + - [22, 18681.0] + - - [2944, 2944, 1, 1280, 2944, 2944, 1280, 1280] + - [5, 23124.0] + - - [6784, 256, 1, 3328, 6784, 6784, 3328, 3328] + - [5, 20491.0] + - - [1408, 5056, 1, 256, 1408, 1408, 256, 256] + - [16, 20890.0] + - - [4608, 24000, 1, 1536, 4608, 4608, 1536, 1536] + - [23, 24837.0] + - - [1408, 4288, 1, 256, 1408, 1408, 256, 256] + - [16, 19039.0] + - - [5888, 2368, 1, 1280, 5888, 5888, 1280, 1280] + - [32, 23546.0] + - - [2368, 5888, 1, 1280, 2368, 2368, 1280, 1280] + - [5, 23943.0] + - - [5888, 256, 1, 1280, 5888, 5888, 1280, 1280] + - [7, 20023.0] + - - [2368, 1856, 1, 3328, 2368, 2368, 3328, 3328] + - [5, 20370.0] + - - [2944, 704, 1, 256, 2944, 2944, 256, 256] + - [13, 15735.0] + - - [2368, 6784, 1, 1280, 2368, 2368, 1280, 1280] + - [5, 23428.0] + - - [2368, 1024, 1, 3328, 2368, 2368, 3328, 3328] + - [16, 21592.0] + - - [1856, 4288, 1, 1280, 1856, 1856, 1280, 1280] + - [5, 22807.0] + - - [704, 3584, 1, 256, 704, 704, 256, 256] + - [19, 17401.0] + - - [704, 2944, 1, 3328, 704, 704, 3328, 3328] + - [5, 18937.0] + - - [1856, 5056, 1, 3328, 1856, 1856, 3328, 3328] + - [5, 23498.0] + - - [3584, 5056, 1, 1280, 3584, 3584, 1280, 1280] + - [5, 24287.0] + - - [2944, 1024, 1, 3328, 2944, 2944, 3328, 3328] + - [5, 22047.0] + - - [1408, 6784, 1, 256, 1408, 1408, 256, 256] + - [31, 22218.0] + - - [6784, 1408, 1, 3328, 6784, 6784, 3328, 3328] + - [26, 23306.0] + - - [1024, 2368, 1, 1280, 1024, 1024, 1280, 1280] + - [22, 20108.0] + - - [6784, 2944, 1, 1280, 6784, 6784, 1280, 1280] + - [13, 24228.0] + - - [3584, 448, 1, 1280, 3584, 3584, 1280, 1280] + - [13, 17488.0] + - - [2944, 6784, 1, 3328, 2944, 2944, 3328, 3328] + - [5, 23877.0] + - - [448, 5056, 1, 1280, 448, 448, 1280, 1280] + - [5, 19163.0] + - - [4288, 704, 1, 256, 4288, 4288, 256, 256] + - [13, 16373.0] + - - [5888, 704, 1, 256, 5888, 5888, 256, 256] + - [19, 17645.0] + - - [256, 5888, 1, 3328, 256, 256, 3328, 3328] + - [7, 19428.0] + - - [6784, 4288, 1, 256, 6784, 6784, 256, 256] + - [13, 23346.0] + - - [5888, 256, 1, 256, 5888, 5888, 256, 256] + - [21, 17320.0] + - - [6784, 1024, 1, 1280, 6784, 6784, 1280, 1280] + - [5, 23632.0] + - - [2944, 704, 1, 1280, 2944, 2944, 1280, 1280] + - [13, 19135.0] + - - [6784, 3584, 1, 1280, 6784, 6784, 1280, 1280] + - [10, 24460.0] + - - [1408, 2944, 1, 1280, 1408, 1408, 1280, 1280] + - [5, 21117.0] + - - [2048, 800, 1, 2048, 2048, 2048, 2048, 2048] + - [21, 16106.0] + - - [1408, 2368, 1, 3328, 1408, 1408, 3328, 3328] + - [16, 20201.0] + - - [2368, 2944, 1, 256, 2368, 2368, 256, 256] + - [25, 20327.0] + - - [2944, 5888, 1, 256, 2944, 2944, 256, 256] + - [5, 23655.0] + - - [3584, 1856, 1, 256, 3584, 3584, 256, 256] + - [13, 21133.0] + - - [4288, 2944, 1, 1280, 4288, 4288, 1280, 1280] + - [20, 23438.0] + - - [5056, 448, 1, 3328, 5056, 5056, 3328, 3328] + - [5, 20322.0] + - - [4288, 5056, 1, 3328, 4288, 4288, 3328, 3328] + - [5, 24303.0] + - - [256, 5056, 1, 3328, 256, 256, 3328, 3328] + - [5, 20182.0] + - - [5056, 2368, 1, 256, 5056, 5056, 256, 256] + - [19, 21734.0] + - - [4288, 704, 1, 3328, 4288, 4288, 3328, 3328] + - [5, 18513.0] + - - [448, 3584, 1, 256, 448, 448, 256, 256] + - [31, 14058.0] + - - [1024, 1408, 1, 3328, 1024, 1024, 3328, 3328] + - [25, 18875.0] + - - [2944, 5888, 1, 1280, 2944, 2944, 1280, 1280] + - [5, 24343.0] + - - [5888, 3584, 1, 256, 5888, 5888, 256, 256] + - [19, 23753.0] + - - [1408, 1856, 1, 3328, 1408, 1408, 3328, 3328] + - [5, 19149.0] + - - [6784, 1408, 1, 1280, 6784, 6784, 1280, 1280] + - [32, 23217.0] + - - [704, 2944, 1, 256, 704, 704, 256, 256] + - [13, 17592.0] + - - [2944, 5888, 1, 3328, 2944, 2944, 3328, 3328] + - [5, 24464.0] + - - [1408, 6784, 1, 3328, 1408, 1408, 3328, 3328] + - [22, 23855.0] + - - [1408, 1408, 1, 1280, 1408, 1408, 1280, 1280] + - [13, 18316.0] + - - [16384, 400, 1, 4096, 16384, 16384, 4096, 4096] + - [10, 17608.0] + - - [448, 4288, 1, 3328, 448, 448, 3328, 3328] + - [2, 19041.0] + - - [704, 2368, 1, 256, 704, 704, 256, 256] + - [22, 14737.0] + - - [5888, 2368, 1, 3328, 5888, 5888, 3328, 3328] + - [14, 23189.0] + - - [5124, 9124, 1, 1760, 5124, 5124, 1760, 1760] + - [16, 24223.0] + - - [4288, 5056, 1, 256, 4288, 4288, 256, 256] + - [19, 22745.0] + - - [4288, 448, 1, 1280, 4288, 4288, 1280, 1280] + - [13, 18040.0] + - - [5888, 704, 1, 3328, 5888, 5888, 3328, 3328] + - [22, 21491.0] + - - [4288, 3584, 1, 3328, 4288, 4288, 3328, 3328] + - [17, 24259.0] + - - [1408, 1024, 1, 256, 1408, 1408, 256, 256] + - [15, 14612.0] + - - [8192, 400, 1, 2048, 8192, 8192, 2048, 2048] + - [10, 16710.0] + - - [2560, 7000, 1, 2560, 2560, 2560, 2560, 2560] + - [23, 24192.0] + - - [6784, 6784, 1, 3328, 6784, 6784, 3328, 3328] + - [26, 24503.0] + - - [704, 5056, 1, 3328, 704, 704, 3328, 3328] + - [5, 20981.0] + - - [2368, 2944, 1, 3328, 2368, 2368, 3328, 3328] + - [5, 21700.0] + - - [2368, 3584, 1, 256, 2368, 2368, 256, 256] + - [13, 21687.0] + - - [5124, 9124, 1, 4096, 5124, 5124, 4096, 4096] + - [11, 22100.0] + - - [7680, 48000, 1, 2560, 7680, 7680, 2560, 2560] + - [11, 24751.0] + - - [1024, 48000, 1, 2816, 1024, 1024, 2816, 2816] + - [26, 25060.0] + - - [1856, 1856, 1, 256, 1856, 1856, 256, 256] + - [9, 16526.0] + - - [4288, 1408, 1, 3328, 4288, 4288, 3328, 3328] + - [14, 22119.0] + - - [5124, 9124, 1, 2048, 5124, 5124, 2048, 2048] + - [10, 23573.0] + - - [4288, 5056, 1, 1280, 4288, 4288, 1280, 1280] + - [5, 24123.0] + - - [5888, 6784, 1, 1280, 5888, 5888, 1280, 1280] + - [26, 24726.0] + - - [1760, 1600, 1, 1760, 1760, 1760, 1760, 1760] + - [0, 20210.0] + - - [5888, 1408, 1, 3328, 5888, 5888, 3328, 3328] + - [14, 23331.0] + - - [256, 5056, 1, 256, 256, 256, 256, 256] + - [1, 14357.0] + - - [7680, 12000, 1, 2560, 7680, 7680, 2560, 2560] + - [35, 23793.0] + - - [2368, 5056, 1, 256, 2368, 2368, 256, 256] + - [13, 21424.0] + - - [1024, 5056, 1, 256, 1024, 1024, 256, 256] + - [19, 20801.0] + - - [2368, 1408, 1, 3328, 2368, 2368, 3328, 3328] + - [28, 20346.0] + - - [1024, 48000, 1, 1536, 1024, 1024, 1536, 1536] + - [8, 24686.0] + - - [5888, 448, 1, 256, 5888, 5888, 256, 256] + - [30, 16567.0] + - - [2560, 3200, 1, 2560, 2560, 2560, 2560, 2560] + - [10, 22579.0] + - - [6784, 5056, 1, 1280, 6784, 6784, 1280, 1280] + - [5, 24582.0] + - - [1024, 48000, 1, 2560, 1024, 1024, 2560, 2560] + - [23, 24441.0] + - - [4288, 6784, 1, 1280, 4288, 4288, 1280, 1280] + - [5, 24111.0] + - - [16384, 800, 1, 4096, 16384, 16384, 4096, 4096] + - [24, 18721.0] + - - [3072, 48000, 1, 1024, 3072, 3072, 1024, 1024] + - [10, 24882.0] + - - [6784, 1408, 1, 256, 6784, 6784, 256, 256] + - [13, 22038.0] + - - [5888, 4288, 1, 256, 5888, 5888, 256, 256] + - [13, 23283.0] + - - [5056, 5888, 1, 256, 5056, 5056, 256, 256] + - [5, 23745.0] + - - [2368, 1024, 1, 256, 2368, 2368, 256, 256] + - [10, 18552.0] + - - [1856, 6784, 1, 1280, 1856, 1856, 1280, 1280] + - [25, 22802.0] + - - [6784, 1856, 1, 256, 6784, 6784, 256, 256] + - [13, 22359.0] + - - [4288, 3584, 1, 256, 4288, 4288, 256, 256] + - [10, 23140.0] + - - [6784, 448, 1, 3328, 6784, 6784, 3328, 3328] + - [16, 18336.0] + - - [5056, 1856, 1, 1280, 5056, 5056, 1280, 1280] + - [26, 22669.0] + - - [1408, 1024, 1, 3328, 1408, 1408, 3328, 3328] + - [25, 18845.0] + - - [5888, 3584, 1, 1280, 5888, 5888, 1280, 1280] + - [5, 24385.0] + - - [1024, 2944, 1, 256, 1024, 1024, 256, 256] + - [19, 19860.0] + - - [448, 6784, 1, 1280, 448, 448, 1280, 1280] + - [2, 19525.0] + - - [3584, 1024, 1, 3328, 3584, 3584, 3328, 3328] + - [5, 22453.0] + - - [2944, 1856, 1, 1280, 2944, 2944, 1280, 1280] + - [3, 21019.0] + - - [5056, 256, 1, 256, 5056, 5056, 256, 256] + - [9, 15284.0] + - - [2368, 3584, 1, 3328, 2368, 2368, 3328, 3328] + - [22, 22747.0] + - - [3584, 5888, 1, 3328, 3584, 3584, 3328, 3328] + - [5, 24351.0] + - - [2944, 3584, 1, 1280, 2944, 2944, 1280, 1280] + - [10, 23346.0] + - - [1856, 5888, 1, 1280, 1856, 1856, 1280, 1280] + - [5, 22946.0] + - - [2048, 3200, 1, 2048, 2048, 2048, 2048, 2048] + - [12, 20240.0] + - - [4288, 1408, 1, 256, 4288, 4288, 256, 256] + - [13, 20209.0] + - - [4288, 2368, 1, 1280, 4288, 4288, 1280, 1280] + - [25, 22639.0] + - - [2944, 5056, 1, 256, 2944, 2944, 256, 256] + - [10, 22927.0] + - - [6784, 2368, 1, 256, 6784, 6784, 256, 256] + - [13, 22754.0] + - - [1024, 24000, 1, 2816, 1024, 1024, 2816, 2816] + - [14, 24529.0] + - - [7680, 5984, 1, 2560, 7680, 7680, 2560, 2560] + - [23, 24255.0] + - - [4288, 1856, 1, 256, 4288, 4288, 256, 256] + - [7, 20634.0] + - - [1856, 2944, 1, 256, 1856, 1856, 256, 256] + - [16, 20343.0] + - - [6144, 48000, 1, 2560, 6144, 6144, 2560, 2560] + - [11, 24336.0] + - - [1760, 800, 1, 1760, 1760, 1760, 1760, 1760] + - [13, 16571.0] + - - [1856, 1408, 1, 1280, 1856, 1856, 1280, 1280] + - [16, 18277.0] + - - [1024, 4288, 1, 1280, 1024, 1024, 1280, 1280] + - [5, 21788.0] + - - [2368, 5056, 1, 3328, 2368, 2368, 3328, 3328] + - [5, 23826.0] + - - [1024, 5056, 1, 3328, 1024, 1024, 3328, 3328] + - [16, 23641.0] + - - [1024, 1856, 1, 3328, 1024, 1024, 3328, 3328] + - [26, 20671.0] + - - [4288, 6784, 1, 256, 4288, 4288, 256, 256] + - [13, 23395.0] + - - [3584, 2944, 1, 3328, 3584, 3584, 3328, 3328] + - [5, 23117.0] + - - [5888, 2944, 1, 256, 5888, 5888, 256, 256] + - [13, 23832.0] + - - [5056, 4288, 1, 256, 5056, 5056, 256, 256] + - [31, 23028.0] + - - [1024, 3584, 1, 256, 1024, 1024, 256, 256] + - [19, 20416.0] + - - [5888, 5888, 1, 1280, 5888, 5888, 1280, 1280] + - [5, 24923.0] + - - [448, 5888, 1, 1280, 448, 448, 1280, 1280] + - [5, 18249.0] + - - [4288, 704, 1, 1280, 4288, 4288, 1280, 1280] + - [13, 18552.0] + - - [2944, 1408, 1, 256, 2944, 2944, 256, 256] + - [13, 18983.0] + - - [2368, 5888, 1, 3328, 2368, 2368, 3328, 3328] + - [5, 24034.0] + - - [2368, 1856, 1, 1280, 2368, 2368, 1280, 1280] + - [13, 19720.0] + - - [5888, 4288, 1, 3328, 5888, 5888, 3328, 3328] + - [10, 24130.0] + - - [6784, 704, 1, 1280, 6784, 6784, 1280, 1280] + - [22, 21388.0] + - - [5056, 448, 1, 256, 5056, 5056, 256, 256] + - [18, 16759.0] + - - [1856, 5056, 1, 1280, 1856, 1856, 1280, 1280] + - [5, 23275.0] + - - [2944, 1024, 1, 1280, 2944, 2944, 1280, 1280] + - [5, 21686.0] + - - [2368, 4288, 1, 256, 2368, 2368, 256, 256] + - [13, 21493.0] + - - [1024, 2368, 1, 3328, 1024, 1024, 3328, 3328] + - [5, 21868.0] + - - [4288, 5888, 1, 3328, 4288, 4288, 3328, 3328] + - [5, 24155.0] + - - [1024, 2944, 1, 3328, 1024, 1024, 3328, 3328] + - [16, 20970.0] + - - [256, 6784, 1, 1280, 256, 256, 1280, 1280] + - [7, 19210.0] + - - [1856, 3584, 1, 256, 1856, 1856, 256, 256] + - [5, 20326.0] + - - [512, 24000, 1, 2816, 512, 512, 2816, 2816] + - [8, 24001.0] + - - [256, 5888, 1, 1280, 256, 256, 1280, 1280] + - [7, 19536.0] + - - [16384, 1600, 1, 4096, 16384, 16384, 4096, 4096] + - [23, 20601.0] + - - [2944, 2368, 1, 256, 2944, 2944, 256, 256] + - [25, 20481.0] + - - [1024, 1856, 1, 256, 1024, 1024, 256, 256] + - [15, 17168.0] + - - [6784, 3584, 1, 3328, 6784, 6784, 3328, 3328] + - [5, 24498.0] + - - [1760, 7000, 1, 1760, 1760, 1760, 1760, 1760] + - [25, 23637.0] + - - [1024, 5888, 1, 3328, 1024, 1024, 3328, 3328] + - [29, 22360.0] + - - [1408, 2368, 1, 1280, 1408, 1408, 1280, 1280] + - [16, 19651.0] + - - [2944, 2944, 1, 256, 2944, 2944, 256, 256] + - [25, 22631.0] + - - [6784, 256, 1, 256, 6784, 6784, 256, 256] + - [5, 17545.0] + - - [5888, 1408, 1, 256, 5888, 5888, 256, 256] + - [31, 22655.0] + - - [5888, 6784, 1, 3328, 5888, 5888, 3328, 3328] + - [14, 25122.0] + - - [704, 4288, 1, 1280, 704, 704, 1280, 1280] + - [5, 18022.0] + - - [1024, 48000, 1, 2048, 1024, 1024, 2048, 2048] + - [35, 23316.0] + - - [1024, 1024, 1, 3328, 1024, 1024, 3328, 3328] + - [47, 16529.0] + - - [64, 6784, 1, 256, 64, 64, 256, 256] + - [41, 8066.0] + - - [128, 6784, 1, 3328, 128, 128, 3328, 3328] + - [43, 17361.0] + - - [2048, 400, 1, 512, 2048, 2048, 512, 512] + - [50, 12373.0] + - - [256, 4288, 1, 3328, 256, 256, 3328, 3328] + - [58, 16894.0] + - - [704, 1856, 1, 3328, 704, 704, 3328, 3328] + - [47, 16382.0] + - - [448, 1024, 1, 1280, 448, 448, 1280, 1280] + - [37, 12404.0] + - - [2368, 128, 1, 256, 2368, 2368, 256, 256] + - [49, 6532.0] + - - [64, 5056, 1, 256, 64, 64, 256, 256] + - [56, 6606.0] + - - [256, 1856, 1, 1280, 256, 256, 1280, 1280] + - [42, 15166.0] + - - [448, 704, 1, 1280, 448, 448, 1280, 1280] + - [52, 11627.0] + - - [4288, 256, 1, 256, 4288, 4288, 256, 256] + - [49, 12557.0] + - - [128, 3584, 1, 1280, 128, 128, 1280, 1280] + - [57, 14109.0] + - - [5888, 64, 1, 3328, 5888, 5888, 3328, 3328] + - [46, 12378.0] + - - [2944, 256, 1, 3328, 2944, 2944, 3328, 3328] + - [47, 15260.0] + - - [256, 4288, 1, 1280, 256, 256, 1280, 1280] + - [57, 16852.0] + - - [1408, 448, 1, 1280, 1408, 1408, 1280, 1280] + - [50, 14643.0] + - - [1408, 256, 1, 1280, 1408, 1408, 1280, 1280] + - [46, 11253.0] + - - [3072, 128, 1, 1024, 3072, 3072, 1024, 1024] + - [50, 10900.0] + - - [6784, 128, 1, 1280, 6784, 6784, 1280, 1280] + - [51, 16790.0] + - - [6784, 64, 1, 256, 6784, 6784, 256, 256] + - [57, 8113.0] + - - [2368, 128, 1, 3328, 2368, 2368, 3328, 3328] + - [51, 14120.0] + - - [2944, 128, 1, 256, 2944, 2944, 256, 256] + - [46, 7178.0] + - - [448, 1408, 1, 256, 448, 448, 256, 256] + - [56, 9919.0] + - - [64, 5056, 1, 3328, 64, 64, 3328, 3328] + - [41, 12540.0] + - - [2368, 256, 1, 1280, 2368, 2368, 1280, 1280] + - [51, 15402.0] + - - [256, 3584, 1, 3328, 256, 256, 3328, 3328] + - [43, 18297.0] + - - [5056, 64, 1, 1280, 5056, 5056, 1280, 1280] + - [42, 11875.0] + - - [1024, 704, 1, 256, 1024, 1024, 256, 256] + - [50, 10755.0] + - - [4288, 128, 1, 1280, 4288, 4288, 1280, 1280] + - [58, 14141.0] + - - [5888, 64, 1, 256, 5888, 5888, 256, 256] + - [42, 7083.0] + - - [1856, 256, 1, 1280, 1856, 1856, 1280, 1280] + - [46, 13716.0] + - - [64, 5888, 1, 3328, 64, 64, 3328, 3328] + - [37, 11740.0] + - - [256, 1408, 1, 3328, 256, 256, 3328, 3328] + - [57, 12556.0] + - - [6784, 128, 1, 3328, 6784, 6784, 3328, 3328] + - [54, 17495.0] + - - [704, 704, 1, 3328, 704, 704, 3328, 3328] + - [54, 14461.0] + - - [3584, 256, 1, 3328, 3584, 3584, 3328, 3328] + - [43, 18278.0] + - - [128, 3584, 1, 3328, 128, 128, 3328, 3328] + - [50, 15475.0] + - - [128, 2944, 1, 1280, 128, 128, 1280, 1280] + - [57, 11759.0] + - - [448, 1856, 1, 1280, 448, 448, 1280, 1280] + - [51, 15438.0] + - - [3584, 128, 1, 256, 3584, 3584, 256, 256] + - [57, 8610.0] + - - [448, 1408, 1, 3328, 448, 448, 3328, 3328] + - [37, 13692.0] + - - [256, 3584, 1, 256, 256, 256, 256, 256] + - [47, 12642.0] + - - [256, 2944, 1, 3328, 256, 256, 3328, 3328] + - [57, 15719.0] + - - [1408, 704, 1, 256, 1408, 1408, 256, 256] + - [50, 12223.0] + - - [448, 2944, 1, 3328, 448, 448, 3328, 3328] + - [39, 16545.0] + - - [64, 5888, 1, 256, 64, 64, 256, 256] + - [49, 7021.0] + - - [448, 2368, 1, 1280, 448, 448, 1280, 1280] + - [47, 15476.0] + - - [128, 4288, 1, 3328, 128, 128, 3328, 3328] + - [54, 15628.0] + - - [256, 2368, 1, 256, 256, 256, 256, 256] + - [57, 10251.0] + - - [1024, 448, 1, 3328, 1024, 1024, 3328, 3328] + - [53, 14814.0] + - - [1856, 704, 1, 1280, 1856, 1856, 1280, 1280] + - [47, 15784.0] + - - [1024, 1024, 1, 1280, 1024, 1024, 1280, 1280] + - [38, 15757.0] + - - [256, 2944, 1, 256, 256, 256, 256, 256] + - [42, 11808.0] + - - [1024, 700, 1, 512, 1024, 1024, 512, 512] + - [50, 13355.0] + - - [128, 6784, 1, 1280, 128, 128, 1280, 1280] + - [58, 17259.0] + - - [1408, 704, 1, 3328, 1408, 1408, 3328, 3328] + - [39, 15543.0] + - - [128, 5888, 1, 1280, 128, 128, 1280, 1280] + - [57, 15504.0] + - - [704, 1408, 1, 3328, 704, 704, 3328, 3328] + - [47, 15606.0] + - - [7680, 64, 1, 2560, 7680, 7680, 2560, 2560] + - [44, 13315.0] + - - [6784, 128, 1, 256, 6784, 6784, 256, 256] + - [43, 12188.0] + - - [704, 448, 1, 256, 704, 704, 256, 256] + - [49, 6554.0] + - - [256, 1856, 1, 3328, 256, 256, 3328, 3328] + - [50, 15681.0] + - - [128, 4288, 1, 256, 128, 128, 256, 256] + - [50, 11152.0] + - - [64, 6784, 1, 3328, 64, 64, 3328, 3328] + - [45, 12497.0] + - - [2944, 256, 1, 1280, 2944, 2944, 1280, 1280] + - [53, 15861.0] + - - [1856, 704, 1, 256, 1856, 1856, 256, 256] + - [54, 12728.0] + - - [1408, 448, 1, 3328, 1408, 1408, 3328, 3328] + - [50, 16010.0] + - - [704, 1856, 1, 256, 704, 704, 256, 256] + - [42, 13159.0] + - - [256, 2368, 1, 1280, 256, 256, 1280, 1280] + - [51, 15451.0] + - - [2944, 448, 1, 256, 2944, 2944, 256, 256] + - [47, 12858.0] + - - [2368, 128, 1, 1280, 2368, 2368, 1280, 1280] + - [51, 11908.0] + - - [64, 5056, 1, 1280, 64, 64, 1280, 1280] + - [49, 11720.0] + - - [704, 448, 1, 3328, 704, 704, 3328, 3328] + - [45, 13398.0] + - - [2368, 448, 1, 1280, 2368, 2368, 1280, 1280] + - [54, 15735.0] + - - [128, 3584, 1, 256, 128, 128, 256, 256] + - [50, 11164.0] + - - [1856, 448, 1, 3328, 1856, 1856, 3328, 3328] + - [43, 16890.0] + - - [128, 5056, 1, 256, 128, 128, 256, 256] + - [42, 13066.0] + - - [4288, 256, 1, 1280, 4288, 4288, 1280, 1280] + - [58, 15953.0] + - - [704, 704, 1, 256, 704, 704, 256, 256] + - [48, 9426.0] + - - [4288, 128, 1, 3328, 4288, 4288, 3328, 3328] + - [51, 16029.0] + - - [7680, 128, 1, 2560, 7680, 7680, 2560, 2560] + - [51, 19054.0] + - - [256, 1408, 1, 1280, 256, 256, 1280, 1280] + - [57, 11297.0] + - - [6784, 64, 1, 3328, 6784, 6784, 3328, 3328] + - [57, 13639.0] + - - [128, 2944, 1, 3328, 128, 128, 3328, 3328] + - [53, 13243.0] + - - [2944, 448, 1, 3328, 2944, 2944, 3328, 3328] + - [39, 16576.0] + - - [5888, 128, 1, 256, 5888, 5888, 256, 256] + - [42, 11050.0] + - - [5056, 64, 1, 256, 5056, 5056, 256, 256] + - [49, 6648.0] + - - [128, 5056, 1, 3328, 128, 128, 3328, 3328] + - [43, 18625.0] + - - [256, 4288, 1, 256, 256, 256, 256, 256] + - [50, 13446.0] + - - [3584, 256, 1, 256, 3584, 3584, 256, 256] + - [51, 12751.0] + - - [128, 2944, 1, 256, 128, 128, 256, 256] + - [50, 7253.0] + - - [3584, 128, 1, 3328, 3584, 3584, 3328, 3328] + - [57, 15083.0] + - - [1024, 448, 1, 1280, 1024, 1024, 1280, 1280] + - [53, 13922.0] + - - [5888, 128, 1, 3328, 5888, 5888, 3328, 3328] + - [53, 15899.0] + - - [1408, 704, 1, 1280, 1408, 1408, 1280, 1280] + - [54, 14819.0] + - - [448, 1408, 1, 1280, 448, 448, 1280, 1280] + - [37, 13363.0] + - - [704, 1408, 1, 1280, 704, 704, 1280, 1280] + - [57, 15047.0] + - - [448, 2944, 1, 256, 448, 448, 256, 256] + - [51, 13138.0] + - - [448, 2368, 1, 256, 448, 448, 256, 256] + - [39, 11808.0] + - - [64, 6784, 1, 1280, 64, 64, 1280, 1280] + - [49, 11218.0] + - - [128, 2368, 1, 3328, 128, 128, 3328, 3328] + - [51, 13856.0] + - - [5056, 64, 1, 3328, 5056, 5056, 3328, 3328] + - [57, 14022.0] + - - [5056, 128, 1, 3328, 5056, 5056, 3328, 3328] + - [58, 18197.0] + - - [448, 704, 1, 256, 448, 448, 256, 256] + - [41, 6586.0] + - - [1856, 256, 1, 3328, 1856, 1856, 3328, 3328] + - [57, 15503.0] + - - [2944, 128, 1, 3328, 2944, 2944, 3328, 3328] + - [50, 12792.0] + - - [448, 1024, 1, 3328, 448, 448, 3328, 3328] + - [52, 13981.0] + - - [704, 1024, 1, 1280, 704, 704, 1280, 1280] + - [46, 14596.0] + - - [2368, 256, 1, 256, 2368, 2368, 256, 256] + - [43, 9885.0] + - - [256, 2368, 1, 3328, 256, 256, 3328, 3328] + - [58, 17223.0] + - - [704, 448, 1, 1280, 704, 704, 1280, 1280] + - [45, 11695.0] + - - [1024, 704, 1, 1280, 1024, 1024, 1280, 1280] + - [38, 14545.0] + - - [256, 1856, 1, 256, 256, 256, 256, 256] + - [50, 9105.0] + - - [704, 1856, 1, 1280, 704, 704, 1280, 1280] + - [51, 15793.0] + - - [1408, 256, 1, 3328, 1408, 1408, 3328, 3328] + - [42, 12221.0] + - - [2368, 448, 1, 256, 2368, 2368, 256, 256] + - [47, 11859.0] + - - [4288, 256, 1, 3328, 4288, 4288, 3328, 3328] + - [58, 16946.0] + - - [2944, 256, 1, 256, 2944, 2944, 256, 256] + - [42, 11192.0] + - - [1408, 448, 1, 256, 1408, 1408, 256, 256] + - [42, 10486.0] + - - [6784, 64, 1, 1280, 6784, 6784, 1280, 1280] + - [42, 12817.0] + - - [2944, 448, 1, 1280, 2944, 2944, 1280, 1280] + - [47, 15923.0] + - - [128, 2368, 1, 256, 128, 128, 256, 256] + - [57, 6445.0] + - - [448, 1024, 1, 256, 448, 448, 256, 256] + - [41, 8341.0] + - - [1856, 448, 1, 256, 1856, 1856, 256, 256] + - [42, 11029.0] + - - [128, 5056, 1, 1280, 128, 128, 1280, 1280] + - [58, 16462.0] + - - [1408, 256, 1, 256, 1408, 1408, 256, 256] + - [42, 6917.0] + - - [256, 1408, 1, 256, 256, 256, 256, 256] + - [50, 6991.0] + - - [2368, 448, 1, 3328, 2368, 2368, 3328, 3328] + - [47, 16327.0] + - - [128, 5888, 1, 3328, 128, 128, 3328, 3328] + - [38, 16176.0] + - - [3584, 128, 1, 1280, 3584, 3584, 1280, 1280] + - [50, 13902.0] + - - [4288, 128, 1, 256, 4288, 4288, 256, 256] + - [52, 8995.0] + - - [2368, 256, 1, 3328, 2368, 2368, 3328, 3328] + - [54, 17196.0] + - - [5888, 128, 1, 1280, 5888, 5888, 1280, 1280] + - [46, 15298.0] + - - [256, 3584, 1, 1280, 256, 256, 1280, 1280] + - [51, 17450.0] + - - [128, 5888, 1, 256, 128, 128, 256, 256] + - [42, 11349.0] + - - [1024, 1024, 1, 256, 1024, 1024, 256, 256] + - [42, 13210.0] + - - [1024, 1024, 1, 1024, 1024, 1024, 1024, 1024] + - [43, 15392.0] + - - [64, 5888, 1, 1280, 64, 64, 1280, 1280] + - [52, 10057.0] + - - [704, 1024, 1, 256, 704, 704, 256, 256] + - [42, 10742.0] + - - [704, 704, 1, 1280, 704, 704, 1280, 1280] + - [39, 13000.0] + - - [128, 2368, 1, 1280, 128, 128, 1280, 1280] + - [50, 12056.0] + - - [3584, 256, 1, 1280, 3584, 3584, 1280, 1280] + - [51, 17175.0] + - - [5888, 64, 1, 1280, 5888, 5888, 1280, 1280] + - [50, 11392.0] + - - [5056, 128, 1, 1280, 5056, 5056, 1280, 1280] + - [51, 16668.0] + - - [448, 1856, 1, 3328, 448, 448, 3328, 3328] + - [58, 16795.0] + - - [1024, 448, 1, 256, 1024, 1024, 256, 256] + - [49, 10212.0] + - - [2944, 128, 1, 1280, 2944, 2944, 1280, 1280] + - [50, 12330.0] + - - [256, 2944, 1, 1280, 256, 256, 1280, 1280] + - [50, 16110.0] + - - [2560, 128, 1, 2560, 2560, 2560, 2560, 2560] + - [51, 14208.0] + - - [704, 1024, 1, 3328, 704, 704, 3328, 3328] + - [57, 15006.0] + - - [1856, 448, 1, 1280, 1856, 1856, 1280, 1280] + - [58, 15331.0] + - - [128, 6784, 1, 256, 128, 128, 256, 256] + - [58, 12228.0] + - - [704, 1408, 1, 256, 704, 704, 256, 256] + - [42, 12084.0] + - - [4096, 128, 1, 4096, 4096, 4096, 4096, 4096] + - [43, 14986.0] + - - [448, 2944, 1, 1280, 448, 448, 1280, 1280] + - [39, 15981.0] + - - [1856, 256, 1, 256, 1856, 1856, 256, 256] + - [41, 8751.0] + - - [5056, 128, 1, 256, 5056, 5056, 256, 256] + - [40, 10526.0] + - - [448, 2368, 1, 3328, 448, 448, 3328, 3328] + - [39, 16351.0] + - - [1024, 704, 1, 3328, 1024, 1024, 3328, 3328] + - [53, 15161.0] + - - [128, 4288, 1, 1280, 128, 128, 1280, 1280] + - [51, 14068.0] + - - [448, 704, 1, 3328, 448, 448, 3328, 3328] + - [52, 13075.0] + - - [448, 1856, 1, 256, 448, 448, 256, 256] + - [55, 10972.0] + - - [1856, 704, 1, 3328, 1856, 1856, 3328, 3328] + - [47, 16335.0] + - - [1408, 64, 1, 1280, 1408, 1408, 1280, 1280] + - [83, 4708.0] + - - [4096, 32, 1, 4096, 4096, 4096, 4096, 4096] + - [70, 4673.0] + - - [3072, 64, 1, 1024, 3072, 3072, 1024, 1024] + - [94, 5326.0] + - - [2944, 64, 1, 256, 2944, 2944, 256, 256] + - [67, 4474.0] + - - [448, 448, 1, 3328, 448, 448, 3328, 3328] + - [75, 6650.0] + - - [1024, 256, 1, 3328, 1024, 1024, 3328, 3328] + - [75, 6934.0] + - - [6144, 32, 1, 2560, 6144, 6144, 2560, 2560] + - [70, 5348.0] + - - [1856, 64, 1, 1280, 1856, 1856, 1280, 1280] + - [85, 5497.0] + - - [704, 128, 1, 1280, 704, 704, 1280, 1280] + - [96, 4349.0] + - - [1856, 128, 1, 256, 1856, 1856, 256, 256] + - [63, 5837.0] + - - [2944, 64, 1, 1280, 2944, 2944, 1280, 1280] + - [75, 5949.0] + - - [64, 3584, 1, 3328, 64, 64, 3328, 3328] + - [61, 7584.0] + - - [1024, 256, 1, 256, 1024, 1024, 256, 256] + - [67, 5421.0] + - - [448, 448, 1, 256, 448, 448, 256, 256] + - [85, 4740.0] + - - [7680, 32, 1, 2560, 7680, 7680, 2560, 2560] + - [99, 5608.0] + - - [128, 1024, 1, 3328, 128, 128, 3328, 3328] + - [75, 6809.0] + - - [64, 1856, 1, 1280, 64, 64, 1280, 1280] + - [67, 5422.0] + - - [448, 256, 1, 256, 448, 448, 256, 256] + - [96, 3390.0] + - - [256, 1024, 1, 256, 256, 256, 256, 256] + - [89, 5412.0] + - - [1024, 128, 1, 1280, 1024, 1024, 1280, 1280] + - [85, 6754.0] + - - [3072, 32, 1, 1024, 3072, 3072, 1024, 1024] + - [70, 4208.0] + - - [448, 256, 1, 3328, 448, 448, 3328, 3328] + - [75, 6095.0] + - - [128, 704, 1, 1280, 128, 128, 1280, 1280] + - [94, 4640.0] + - - [1856, 128, 1, 3328, 1856, 1856, 3328, 3328] + - [90, 7917.0] + - - [256, 448, 1, 256, 256, 256, 256, 256] + - [63, 3774.0] + - - [8448, 32, 1, 2816, 8448, 8448, 2816, 2816] + - [77, 5347.0] + - - [1408, 128, 1, 1280, 1408, 1408, 1280, 1280] + - [96, 6007.0] + - - [128, 1856, 1, 1280, 128, 128, 1280, 1280] + - [75, 7278.0] + - - [2048, 128, 1, 2048, 2048, 2048, 2048, 2048] + - [96, 6279.0] + - - [2560, 32, 1, 2560, 2560, 2560, 2560, 2560] + - [87, 4662.0] + - - [64, 1408, 1, 3328, 64, 64, 3328, 3328] + - [89, 4875.0] + - - [128, 1408, 1, 256, 128, 128, 256, 256] + - [94, 4428.0] + - - [256, 448, 1, 3328, 256, 256, 3328, 3328] + - [59, 6321.0] + - - [64, 2368, 1, 1280, 64, 64, 1280, 1280] + - [96, 6585.0] + - - [2368, 64, 1, 256, 2368, 2368, 256, 256] + - [67, 4419.0] + - - [704, 128, 1, 3328, 704, 704, 3328, 3328] + - [74, 5010.0] + - - [4288, 64, 1, 1280, 4288, 4288, 1280, 1280] + - [90, 6880.0] + - - [2560, 64, 1, 2560, 2560, 2560, 2560, 2560] + - [88, 6651.0] + - - [128, 1024, 1, 1280, 128, 128, 1280, 1280] + - [75, 6022.0] + - - [1856, 64, 1, 256, 1856, 1856, 256, 256] + - [75, 4552.0] + - - [704, 128, 1, 256, 704, 704, 256, 256] + - [94, 2834.0] + - - [448, 256, 1, 1280, 448, 448, 1280, 1280] + - [85, 5474.0] + - - [1024, 256, 1, 1280, 1024, 1024, 1280, 1280] + - [75, 6913.0] + - - [1856, 128, 1, 1280, 1856, 1856, 1280, 1280] + - [96, 7439.0] + - - [64, 3584, 1, 256, 64, 64, 256, 256] + - [67, 5319.0] + - - [64, 1856, 1, 256, 64, 64, 256, 256] + - [94, 3503.0] + - - [256, 1024, 1, 1280, 256, 256, 1280, 1280] + - [61, 6904.0] + - - [3584, 64, 1, 1280, 3584, 3584, 1280, 1280] + - [85, 6961.0] + - - [1408, 128, 1, 3328, 1408, 1408, 3328, 3328] + - [61, 6405.0] + - - [64, 4288, 1, 3328, 64, 64, 3328, 3328] + - [61, 6977.0] + - - [2368, 64, 1, 3328, 2368, 2368, 3328, 3328] + - [90, 7454.0] + - - [256, 704, 1, 256, 256, 256, 256, 256] + - [83, 4403.0] + - - [128, 1024, 1, 256, 128, 128, 256, 256] + - [67, 3848.0] + - - [64, 2944, 1, 256, 64, 64, 256, 256] + - [67, 4345.0] + - - [64, 1408, 1, 1280, 64, 64, 1280, 1280] + - [89, 4203.0] + - - [1408, 128, 1, 256, 1408, 1408, 256, 256] + - [78, 4453.0] + - - [64, 2944, 1, 1280, 64, 64, 1280, 1280] + - [75, 5987.0] + - - [256, 448, 1, 1280, 256, 256, 1280, 1280] + - [75, 5397.0] + - - [704, 256, 1, 1280, 704, 704, 1280, 1280] + - [75, 6158.0] + - - [64, 2368, 1, 3328, 64, 64, 3328, 3328] + - [75, 7413.0] + - - [256, 704, 1, 3328, 256, 256, 3328, 3328] + - [61, 6594.0] + - - [4096, 64, 1, 4096, 4096, 4096, 4096, 4096] + - [96, 5851.0] + - - [1760, 128, 1, 1760, 1760, 1760, 1760, 1760] + - [73, 8983.0] + - - [64, 2944, 1, 3328, 64, 64, 3328, 3328] + - [90, 6585.0] + - - [128, 1408, 1, 3328, 128, 128, 3328, 3328] + - [75, 6535.0] + - - [1408, 64, 1, 256, 1408, 1408, 256, 256] + - [83, 2793.0] + - - [64, 2368, 1, 256, 64, 64, 256, 256] + - [96, 4389.0] + - - [1024, 128, 1, 3328, 1024, 1024, 3328, 3328] + - [61, 6801.0] + - - [2368, 64, 1, 1280, 2368, 2368, 1280, 1280] + - [96, 6643.0] + - - [4288, 64, 1, 256, 4288, 4288, 256, 256] + - [67, 5413.0] + - - [64, 4288, 1, 1280, 64, 64, 1280, 1280] + - [90, 6568.0] + - - [1408, 64, 1, 3328, 1408, 1408, 3328, 3328] + - [74, 4879.0] + - - [448, 448, 1, 1280, 448, 448, 1280, 1280] + - [85, 6638.0] + - - [3584, 64, 1, 3328, 3584, 3584, 3328, 3328] + - [90, 7701.0] + - - [256, 1024, 1, 3328, 256, 256, 3328, 3328] + - [75, 7013.0] + - - [1856, 64, 1, 3328, 1856, 1856, 3328, 3328] + - [75, 6056.0] + - - [1024, 128, 1, 256, 1024, 1024, 256, 256] + - [67, 3902.0] + - - [4608, 32, 1, 1536, 4608, 4608, 1536, 1536] + - [70, 4795.0] + - - [128, 704, 1, 256, 128, 128, 256, 256] + - [94, 2800.0] + - - [64, 3584, 1, 1280, 64, 64, 1280, 1280] + - [75, 6921.0] + - - [3584, 64, 1, 256, 3584, 3584, 256, 256] + - [96, 5234.0] + - - [64, 1856, 1, 3328, 64, 64, 3328, 3328] + - [89, 6257.0] + - - [2944, 64, 1, 3328, 2944, 2944, 3328, 3328] + - [61, 6450.0] + - - [128, 704, 1, 3328, 128, 128, 3328, 3328] + - [89, 5228.0] + - - [128, 1856, 1, 256, 128, 128, 256, 256] + - [67, 5509.0] + - - [64, 4288, 1, 256, 64, 64, 256, 256] + - [85, 5363.0] + - - [704, 256, 1, 3328, 704, 704, 3328, 3328] + - [90, 6782.0] + - - [256, 704, 1, 1280, 256, 256, 1280, 1280] + - [61, 6129.0] + - - [4288, 64, 1, 3328, 4288, 4288, 3328, 3328] + - [75, 7157.0] + - - [2048, 64, 1, 2048, 2048, 2048, 2048, 2048] + - [96, 5432.0] + - - [64, 1408, 1, 256, 64, 64, 256, 256] + - [83, 2740.0] + - - [128, 1408, 1, 1280, 128, 128, 1280, 1280] + - [75, 6048.0] + - - [128, 1856, 1, 3328, 128, 128, 3328, 3328] + - [61, 7914.0] + - - [1760, 64, 1, 1760, 1760, 1760, 1760, 1760] + - [59, 5746.0] + - - [704, 256, 1, 256, 704, 704, 256, 256] + - [96, 4272.0] + - - [1024, 256, 1, 196, 1024, 1024, 196, 196] + - [75, 4893.0] + - - [256, 1024, 1, 196, 256, 256, 196, 196] + - [67, 4775.0] + - - [1760, 32, 1, 1760, 1760, 1760, 1760, 1760] + - [101, 3650.0] + - - [1760, 16, 1, 1760, 1760, 1760, 1760, 1760] + - [69, 2415.0] + - - [7680, 16, 1, 2560, 7680, 7680, 2560, 2560] + - [102, 3679.0] + - - [8448, 16, 1, 2816, 8448, 8448, 2816, 2816] + - [102, 3571.0] + - - [6144, 16, 1, 2560, 6144, 6144, 2560, 2560] + - [102, 3512.0] + - - [2048, 16, 1, 2048, 2048, 2048, 2048, 2048] + - [102, 2290.0] + - - [3072, 16, 1, 1024, 3072, 3072, 1024, 1024] + - [98, 2532.0] + - - [4096, 16, 1, 4096, 4096, 4096, 4096, 4096] + - [103, 3134.0] + - - [2560, 16, 1, 2560, 2560, 2560, 2560, 2560] + - [102, 2969.0] + - - [2048, 32, 1, 2048, 2048, 2048, 2048, 2048] + - [82, 3326.0] + - - [4608, 16, 1, 1536, 4608, 4608, 1536, 1536] + - [104, 3144.0] + - - [1024, 16, 1, 500000, 1024, 1024, 500000, 500000] + - [100, 1501.0] + - - [1024, 8, 1, 500000, 1024, 1024, 500000, 500000] + - [79, 759.0] + - - [512, 16, 1, 500000, 512, 512, 500000, 500000] + - [64, 966.0] + - - [512, 8, 1, 500000, 512, 512, 500000, 500000] + - [64, 482.0] + - - [128, 256, 1, 1280, 128, 128, 1280, 1280] + - [92, 2921.0] + - - [448, 64, 1, 1280, 448, 448, 1280, 1280] + - [62, 2588.0] + - - [64, 1024, 1, 1280, 64, 64, 1280, 1280] + - [74, 3732.0] + - - [64, 704, 1, 1280, 64, 64, 1280, 1280] + - [92, 3077.0] + - - [128, 448, 1, 256, 128, 128, 256, 256] + - [66, 2056.0] + - - [256, 256, 1, 3328, 256, 256, 3328, 3328] + - [89, 4709.0] + - - [64, 448, 1, 1280, 64, 64, 1280, 1280] + - [86, 2214.0] + - - [64, 64, 1, 3328, 64, 64, 3328, 3328] + - [79, 485.0] + - - [704, 64, 1, 3328, 704, 704, 3328, 3328] + - [72, 3212.0] + - - [64, 128, 1, 256, 64, 64, 256, 256] + - [97, 351.0] + - - [128, 448, 1, 1280, 128, 128, 1280, 1280] + - [95, 3505.0] + - - [704, 64, 1, 1280, 704, 704, 1280, 1280] + - [93, 2922.0] + - - [448, 64, 1, 3328, 448, 448, 3328, 3328] + - [91, 2632.0] + - - [128, 64, 1, 1280, 128, 128, 1280, 1280] + - [76, 755.0] + - - [64, 128, 1, 3328, 64, 64, 3328, 3328] + - [62, 916.0] + - - [128, 128, 1, 3328, 128, 128, 3328, 3328] + - [76, 1724.0] + - - [256, 128, 1, 256, 256, 256, 256, 256] + - [92, 1327.0] + - - [256, 256, 1, 256, 256, 256, 256, 256] + - [81, 2225.0] + - - [256, 64, 1, 256, 256, 256, 256, 256] + - [72, 681.0] + - - [64, 1024, 1, 256, 64, 64, 256, 256] + - [83, 2179.0] + - - [64, 704, 1, 256, 64, 64, 256, 256] + - [80, 1677.0] + - - [448, 128, 1, 256, 448, 448, 256, 256] + - [84, 2050.0] + - - [64, 704, 1, 3328, 64, 64, 3328, 3328] + - [92, 3419.0] + - - [64, 448, 1, 3328, 64, 64, 3328, 3328] + - [91, 2624.0] + - - [448, 128, 1, 3328, 448, 448, 3328, 3328] + - [60, 3996.0] + - - [64, 256, 1, 1280, 64, 64, 1280, 1280] + - [91, 1427.0] + - - [64, 64, 1, 256, 64, 64, 256, 256] + - [97, 173.0] + - - [64, 448, 1, 256, 64, 64, 256, 256] + - [92, 1154.0] + - - [256, 128, 1, 1280, 256, 256, 1280, 1280] + - [80, 2416.0] + - - [128, 256, 1, 3328, 128, 128, 3328, 3328] + - [92, 2886.0] + - - [64, 128, 1, 1280, 64, 64, 1280, 1280] + - [68, 748.0] + - - [128, 128, 1, 1280, 128, 128, 1280, 1280] + - [97, 1436.0] + - - [128, 256, 1, 256, 128, 128, 256, 256] + - [92, 1327.0] + - - [256, 64, 1, 1280, 256, 256, 1280, 1280] + - [68, 1436.0] + - - [704, 64, 1, 256, 704, 704, 256, 256] + - [82, 1681.0] + - - [64, 64, 1, 1280, 64, 64, 1280, 1280] + - [100, 379.0] + - - [128, 64, 1, 3328, 128, 128, 3328, 3328] + - [86, 918.0] + - - [448, 64, 1, 256, 448, 448, 256, 256] + - [97, 1165.0] + - - [1024, 64, 1, 256, 1024, 1024, 256, 256] + - [94, 2156.0] + - - [128, 64, 1, 256, 128, 128, 256, 256] + - [76, 345.0] + - - [1024, 64, 1, 1280, 1024, 1024, 1280, 1280] + - [83, 3844.0] + - - [64, 1024, 1, 3328, 64, 64, 3328, 3328] + - [89, 4479.0] + - - [448, 128, 1, 1280, 448, 448, 1280, 1280] + - [84, 3489.0] + - - [1024, 64, 1, 3328, 1024, 1024, 3328, 3328] + - [59, 4455.0] + - - [64, 256, 1, 3328, 64, 64, 3328, 3328] + - [91, 1733.0] + - - [256, 256, 1, 1280, 256, 256, 1280, 1280] + - [71, 3873.0] + - - [256, 128, 1, 3328, 256, 256, 3328, 3328] + - [80, 2861.0] + - - [64, 256, 1, 256, 64, 64, 256, 256] + - [80, 690.0] + - - [128, 448, 1, 3328, 128, 128, 3328, 3328] + - [89, 4104.0] + - - [256, 64, 1, 3328, 256, 256, 3328, 3328] + - [91, 1728.0] + - - [128, 128, 1, 256, 128, 128, 256, 256] + - [65, 668.0] + - - [512, 128, 1, 784, 512, 512, 784, 784] + - [59, 3349.0] + - - [256, 64, 1, 3136, 256, 256, 3136, 3136] + - [62, 1718.0] + - - [64, 256, 1, 3136, 64, 64, 3136, 3136] + - [91, 1716.0] + - - [128, 512, 1, 784, 128, 128, 784, 784] + - [74, 3336.0] + - - [64, 64, 1, 3136, 64, 64, 3136, 3136] + - [64, 480.0] +- null +- null +- DeviceEfficiency +... diff --git a/library/src/blas3/Tensile/Logic/asm_full/navi22_Cijk_Alik_Bljk_SB.yaml b/library/src/blas3/Tensile/Logic/asm_full/navi22_Cijk_Alik_Bljk_SB.yaml new file mode 100644 index 000000000..63d9d94b3 --- /dev/null +++ b/library/src/blas3/Tensile/Logic/asm_full/navi22_Cijk_Alik_Bljk_SB.yaml @@ -0,0 +1,79574 @@ +--- +- {MinimumRequiredVersion: 4.28.0} +- navi22 +- gfx1031 +- [Device 73df] +- AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] +- - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 2 + LVCB: 2 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 0 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x8_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 2 + LVCB: 2 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 1 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x8_SN_SU0_SUM0_TT8_16_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 128 + LSPB: 128 + LVCA: 2 + LVCB: 2 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 2 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x8_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 3 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 4 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SN_SU0_SUM0_TT8_16_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 5 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 256 + MacroTileA: 128 + MacroTileB: 256 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 6 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x256x16_SN_SU0_SUM0_TT8_16_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 2 + LVCB: 2 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 7 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x8_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 2 + LVCB: 2 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 8 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x8_SN_SU32_SUM3_TT8_16_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 128 + LSPB: 128 + LVCA: 2 + LVCB: 2 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 9 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x8_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 128 + LSPB: 128 + LVCA: 2 + LVCB: 2 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 256 + MacroTileA: 128 + MacroTileB: 256 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 10 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x256x8_SN_SU32_SUM3_TT8_16_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 11 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 12 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SN_SU32_SUM3_TT8_16_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 13 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 256 + MacroTileA: 128 + MacroTileB: 256 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 14 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x256x16_SN_SU32_SUM3_TT8_16_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 15 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x32_SN_SU32_SUM3_TT8_16_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 2 + LVCB: 2 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 16 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x8_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 2 + LVCB: 2 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 17 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x8_SN_SU0_SUM0_TT8_16_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 128 + LSPB: 128 + LVCA: 2 + LVCB: 2 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 18 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x8_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 19 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 20 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SN_SU0_SUM0_TT8_16_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 21 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 256 + MacroTileA: 128 + MacroTileB: 256 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 22 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x256x16_SN_SU0_SUM0_TT8_16_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 23 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x32_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 2 + LVCB: 2 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 24 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x8_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 2 + LVCB: 2 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 25 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x8_SN_SU32_SUM3_TT8_16_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 128 + LSPB: 128 + LVCA: 2 + LVCB: 2 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 26 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x8_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 27 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 28 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SN_SU32_SUM3_TT8_16_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 29 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 256 + MacroTileA: 128 + MacroTileB: 256 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 30 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x256x16_SN_SU32_SUM3_TT8_16_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 31 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x32_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 32 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x32_SN_SU32_SUM3_TT8_16_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 33 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x32_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 2 + LVCB: 2 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 34 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x8_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 2 + LVCB: 2 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 35 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x8_SN_SU0_SUM0_TT8_16_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 128 + LSPB: 128 + LVCA: 2 + LVCB: 2 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 36 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x8_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 37 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 38 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SN_SU0_SUM0_TT8_16_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 39 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 256 + MacroTileA: 128 + MacroTileB: 256 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 40 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x256x16_SN_SU0_SUM0_TT8_16_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 41 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x32_SN_SU0_SUM0_TT8_16_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 42 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x32_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 2 + LVCB: 2 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 43 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x8_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 2 + LVCB: 2 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 44 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x8_SN_SU32_SUM3_TT8_16_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 128 + LSPB: 128 + LVCA: 2 + LVCB: 2 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 45 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x8_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 46 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 47 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SN_SU32_SUM3_TT8_16_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 48 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 256 + MacroTileA: 128 + MacroTileB: 256 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 49 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x256x16_SN_SU32_SUM3_TT8_16_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 50 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x32_SN_SU32_SUM3_TT8_16_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 51 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x32_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 52 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SN_SU0_SUM0_TT8_16_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 53 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x32_SN_SU0_SUM0_TT8_16_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 54 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 55 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SN_SU32_SUM3_TT8_16_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 256 + MacroTileA: 128 + MacroTileB: 256 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 56 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x256x16_SN_SU32_SUM3_TT8_16_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 57 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x32_SN_SU32_SUM3_TT8_16_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 58 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x32_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 59 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SN_SU0_SUM0_TT8_16_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 60 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x32_SN_SU0_SUM0_TT8_16_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 61 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 256 + MacroTileA: 128 + MacroTileB: 256 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 62 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x256x16_SN_SU32_SUM3_TT8_16_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 63 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x32_SN_SU32_SUM3_TT8_16_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 64 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x32_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 65 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SN_SU0_SUM0_TT8_16_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 66 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x32_SN_SU0_SUM0_TT8_16_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 67 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SN_SU32_SUM3_TT8_16_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 68 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x32_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 69 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x32_SN_SU32_SUM3_TT8_16_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 70 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x32_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 32 + LSPB: 32 + LVCA: 2 + LVCB: 2 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 71 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x8_SN_SU0_SUM0_TT4_4_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 32 + LSPB: 32 + LVCA: 2 + LVCB: 2 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 256 + LdsOffsetB_Blk: 1280 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 72 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x64x8_SN_SU0_SUM0_TT4_8_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 32 + LSPB: 32 + LVCA: 2 + LVCB: 2 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 73 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x8_SN_SU0_SUM0_TT8_8_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 32 + LVCA: 2 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 74 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x8_SN_SU0_SUM0_TT4_4_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 2 + LVCB: 2 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 75 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x8_SN_SU0_SUM0_TT4_8_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 2 + LVCB: 2 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 76 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x8_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 128 + LVCA: 4 + LVCB: 2 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 77 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x8_SN_SU0_SUM0_TT4_8_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 128 + LSPB: 128 + LVCA: 2 + LVCB: 2 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 78 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x8_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 4 + LVCB: 4 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 79 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x16_SN_SU0_SUM0_TT4_4_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 4 + LVCB: 4 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 80 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x64x16_SN_SU0_SUM0_TT4_8_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 4 + LVCB: 4 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 81 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SN_SU0_SUM0_TT8_8_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 82 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_SN_SU0_SUM0_TT4_4_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 83 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SN_SU0_SUM0_TT4_8_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 84 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 85 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SN_SU0_SUM0_TT4_4_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 86 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 32 + LSPB: 32 + LVCA: 2 + LVCB: 2 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 87 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x8_SN_SU32_SUM3_TT8_8_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 2 + LVCB: 2 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 88 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x8_SN_SU32_SUM3_TT4_8_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 2 + LVCB: 2 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 89 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x8_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 128 + LSPB: 128 + LVCA: 2 + LVCB: 2 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 90 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x8_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 4 + LVCB: 4 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 91 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x64x16_SN_SU32_SUM3_TT4_8_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 4 + LVCB: 4 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 92 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SN_SU32_SUM3_TT8_8_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 93 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_SN_SU32_SUM3_TT4_4_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 94 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SN_SU32_SUM3_TT4_8_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 95 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 96 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SN_SU32_SUM3_TT4_8_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 97 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 98 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x32_SN_SU32_SUM3_TT8_8_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 99 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x32_SN_SU32_SUM3_TT4_8_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 100 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x32_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 101 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x32_SN_SU32_SUM3_TT4_4_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 102 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x32_SN_SU32_SUM3_TT4_8_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 103 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x32_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 32 + LSPB: 32 + LVCA: 2 + LVCB: 2 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 256 + LdsOffsetB_Blk: 1280 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 104 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x64x8_SN_SU0_SUM0_TT4_8_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 32 + LSPB: 32 + LVCA: 2 + LVCB: 2 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 105 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x8_SN_SU0_SUM0_TT8_8_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 2 + LVCB: 2 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 106 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x8_SN_SU0_SUM0_TT4_8_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 2 + LVCB: 2 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 107 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x8_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 108 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x8_SN_SU0_SUM0_TT4_4_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 128 + LSPB: 128 + LVCA: 2 + LVCB: 2 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 109 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x8_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 4 + LVCB: 4 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 110 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SN_SU0_SUM0_TT8_8_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 111 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SN_SU0_SUM0_TT4_8_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 112 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 113 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 114 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x32_SN_SU0_SUM0_TT8_8_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 115 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x32_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 32 + LSPB: 32 + LVCA: 2 + LVCB: 2 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 116 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x8_SN_SU32_SUM3_TT4_4_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 32 + LSPB: 32 + LVCA: 2 + LVCB: 2 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 117 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x8_SN_SU32_SUM3_TT8_8_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 2 + LVCB: 2 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 118 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x8_SN_SU32_SUM3_TT4_8_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 2 + LVCB: 2 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 119 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x8_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 128 + LVCA: 4 + LVCB: 2 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 120 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x8_SN_SU32_SUM3_TT4_8_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 128 + LSPB: 128 + LVCA: 2 + LVCB: 2 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 121 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x8_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 4 + LVCB: 4 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 122 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x16_SN_SU32_SUM3_TT4_4_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 4 + LVCB: 4 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 123 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SN_SU32_SUM3_TT8_8_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 124 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_SN_SU32_SUM3_TT4_4_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 125 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SN_SU32_SUM3_TT4_8_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 126 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 127 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SN_SU32_SUM3_TT4_8_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 128 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 129 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x32_SN_SU32_SUM3_TT8_8_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 130 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x32_SN_SU32_SUM3_TT4_8_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 131 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x32_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 132 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x32_SN_SU32_SUM3_TT4_4_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 133 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x32_SN_SU32_SUM3_TT4_8_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 134 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x32_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 32 + LSPB: 32 + LVCA: 2 + LVCB: 2 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 135 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x8_SN_SU0_SUM0_TT4_4_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 32 + LSPB: 32 + LVCA: 2 + LVCB: 2 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 136 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x8_SN_SU0_SUM0_TT8_8_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 2 + LVCB: 2 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 137 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x8_SN_SU0_SUM0_TT4_8_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 2 + LVCB: 2 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 138 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x8_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 128 + LSPB: 128 + LVCA: 2 + LVCB: 2 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 139 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x8_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 4 + LVCB: 4 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 140 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SN_SU0_SUM0_TT8_8_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 141 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 142 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SN_SU0_SUM0_TT4_4_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 143 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 144 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x32_SN_SU0_SUM0_TT4_4_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 145 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x32_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 32 + LSPB: 32 + LVCA: 2 + LVCB: 2 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 146 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x8_SN_SU32_SUM3_TT4_4_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 32 + LSPB: 32 + LVCA: 2 + LVCB: 2 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 256 + LdsOffsetB_Blk: 1280 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 147 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x64x8_SN_SU32_SUM3_TT4_8_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 32 + LSPB: 32 + LVCA: 2 + LVCB: 2 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 148 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x8_SN_SU32_SUM3_TT8_8_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 2 + LVCB: 2 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 149 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x8_SN_SU32_SUM3_TT4_8_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 2 + LVCB: 2 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 150 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x8_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 128 + LSPB: 128 + LVCA: 2 + LVCB: 2 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 151 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x8_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 4 + LVCB: 4 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 152 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SN_SU32_SUM3_TT8_8_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 153 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SN_SU32_SUM3_TT4_8_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 154 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 155 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SN_SU32_SUM3_TT4_4_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 156 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SN_SU32_SUM3_TT4_8_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 157 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 158 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x32_SN_SU32_SUM3_TT8_8_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 159 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x32_SN_SU32_SUM3_TT4_8_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 160 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x32_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 161 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x32_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 162 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 163 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 164 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x32_SN_SU0_SUM0_TT4_8_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 4 + LVCB: 4 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 165 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SN_SU32_SUM3_TT8_8_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 166 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SN_SU32_SUM3_TT4_8_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 167 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 168 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 169 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x32_SN_SU32_SUM3_TT4_8_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 170 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x32_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 128 + LSPB: 128 + LVCA: 2 + LVCB: 2 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 171 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x8_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 4 + LVCB: 4 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 172 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SN_SU0_SUM0_TT8_8_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 173 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SN_SU0_SUM0_TT4_8_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 174 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 175 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x32_SN_SU0_SUM0_TT8_8_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 176 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x32_SN_SU0_SUM0_TT4_8_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 177 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x32_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 4 + LVCB: 4 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 178 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SN_SU32_SUM3_TT8_8_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 179 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SN_SU32_SUM3_TT4_8_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 180 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 181 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 182 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x32_SN_SU32_SUM3_TT4_8_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 183 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x32_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 184 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x32_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 128 + LSPB: 128 + LVCA: 2 + LVCB: 2 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 185 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x8_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 186 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 187 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x32_SN_SU0_SUM0_TT8_8_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 128 + LSPB: 128 + LVCA: 2 + LVCB: 2 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 188 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x8_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 189 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SN_SU32_SUM3_TT4_8_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 190 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 191 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x32_SN_SU32_SUM3_TT8_8_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 192 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x32_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 512 + LdsNumElementsAlignedA: 128 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 256 + LdsOffsetB: 128 + LdsOffsetB_Blk: 384 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 193 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT16x16x8_SN_SU0_SUM0_TT2_2_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 194 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x8_SN_SU0_SUM0_TT4_4_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 896 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 195 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x16x8_SN_SU0_SUM0_TT2_2_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 196 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x8_SN_SU0_SUM0_TT4_4_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 197 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x8_SN_SU0_SUM0_TT2_2_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 198 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x8_SN_SU0_SUM0_TT4_4_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 199 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT16x16x16_SN_SU0_SUM0_TT2_2_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 200 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x16_SN_SU0_SUM0_TT4_4_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 201 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x16x16_SN_SU0_SUM0_TT2_2_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 202 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x16_SN_SU0_SUM0_TT2_2_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 203 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SN_SU0_SUM0_TT4_4_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 2 + LSPB: 2 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 204 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT16x16x32_SN_SU0_SUM0_TT2_2_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 205 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x32_SN_SU0_SUM0_TT2_2_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 512 + LdsNumElementsAlignedA: 128 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 256 + LdsOffsetB: 128 + LdsOffsetB_Blk: 384 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 206 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT16x16x8_SN_SU32_SUM3_TT2_2_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 207 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x8_SN_SU32_SUM3_TT4_4_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 896 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 208 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x16x8_SN_SU32_SUM3_TT2_2_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 209 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x8_SN_SU32_SUM3_TT4_4_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 210 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x8_SN_SU32_SUM3_TT2_2_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 211 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x8_SN_SU32_SUM3_TT4_4_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 212 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT16x16x16_SN_SU32_SUM3_TT2_2_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 213 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x16_SN_SU32_SUM3_TT4_4_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 214 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x16x16_SN_SU32_SUM3_TT2_2_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 215 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_SN_SU32_SUM3_TT4_4_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 216 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SN_SU32_SUM3_TT4_4_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 2 + LSPB: 2 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 217 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT16x16x32_SN_SU32_SUM3_TT2_2_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 512 + LdsNumElementsAlignedA: 128 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 256 + LdsOffsetB: 128 + LdsOffsetB_Blk: 384 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 218 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT16x16x8_SN_SU0_SUM0_TT2_2_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 219 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x8_SN_SU0_SUM0_TT4_4_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 896 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 220 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x16x8_SN_SU0_SUM0_TT2_2_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 221 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x8_SN_SU0_SUM0_TT4_4_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 222 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x8_SN_SU0_SUM0_TT2_2_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 223 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x8_SN_SU0_SUM0_TT4_4_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 224 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT16x16x16_SN_SU0_SUM0_TT2_2_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 225 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x16_SN_SU0_SUM0_TT4_4_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 226 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x16x16_SN_SU0_SUM0_TT2_2_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 227 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SN_SU0_SUM0_TT4_4_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 2 + LSPB: 2 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 228 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT16x16x32_SN_SU0_SUM0_TT2_2_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 512 + LdsNumElementsAlignedA: 128 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 256 + LdsOffsetB: 128 + LdsOffsetB_Blk: 384 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 229 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT16x16x8_SN_SU32_SUM3_TT2_2_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 230 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x8_SN_SU32_SUM3_TT4_4_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 896 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 231 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x16x8_SN_SU32_SUM3_TT2_2_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 232 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x8_SN_SU32_SUM3_TT4_4_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 233 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x8_SN_SU32_SUM3_TT2_2_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 234 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x8_SN_SU32_SUM3_TT4_4_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 235 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT16x16x16_SN_SU32_SUM3_TT2_2_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 236 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x16_SN_SU32_SUM3_TT4_4_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 237 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x16x16_SN_SU32_SUM3_TT2_2_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 238 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_SN_SU32_SUM3_TT4_4_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 239 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x16_SN_SU32_SUM3_TT2_2_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 240 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SN_SU32_SUM3_TT4_4_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 2 + LSPB: 2 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 16 + NumLoadsB: 16 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 16 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 241 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x32_SN_SU32_SUM3_TT4_4_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 512 + LdsNumElementsAlignedA: 128 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 256 + LdsOffsetB: 128 + LdsOffsetB_Blk: 384 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 242 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT16x16x8_SN_SU0_SUM0_TT2_2_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 243 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x8_SN_SU0_SUM0_TT4_4_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 896 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 244 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x16x8_SN_SU0_SUM0_TT2_2_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 245 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x8_SN_SU0_SUM0_TT4_4_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 246 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x8_SN_SU0_SUM0_TT2_2_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 247 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x8_SN_SU0_SUM0_TT4_4_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 248 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT16x16x16_SN_SU0_SUM0_TT2_2_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 249 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x16x16_SN_SU0_SUM0_TT2_2_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 2 + LSPB: 2 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 250 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT16x16x32_SN_SU0_SUM0_TT2_2_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 251 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x32_SN_SU0_SUM0_TT4_4_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 512 + LdsNumElementsAlignedA: 128 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 256 + LdsOffsetB: 128 + LdsOffsetB_Blk: 384 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 252 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT16x16x8_SN_SU32_SUM3_TT2_2_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 253 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x8_SN_SU32_SUM3_TT4_4_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 896 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 254 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x16x8_SN_SU32_SUM3_TT2_2_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 255 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x8_SN_SU32_SUM3_TT4_4_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 256 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x8_SN_SU32_SUM3_TT4_4_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 257 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT16x16x16_SN_SU32_SUM3_TT2_2_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 258 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x16_SN_SU32_SUM3_TT4_4_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 259 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x16x16_SN_SU32_SUM3_TT2_2_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 260 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_SN_SU32_SUM3_TT4_4_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 2 + LSPB: 2 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 261 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT16x16x32_SN_SU32_SUM3_TT2_2_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 512 + LdsNumElementsAlignedA: 128 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 256 + LdsOffsetB: 128 + LdsOffsetB_Blk: 384 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 262 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT16x16x8_SN_SU0_SUM0_TT2_2_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 263 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x8_SN_SU0_SUM0_TT4_4_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 264 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x8_SN_SU0_SUM0_TT4_4_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 265 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT16x16x16_SN_SU0_SUM0_TT2_2_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 266 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x16_SN_SU0_SUM0_TT4_4_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 2 + LSPB: 2 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 267 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT16x16x32_SN_SU0_SUM0_TT2_2_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 268 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x8_SN_SU32_SUM3_TT4_4_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 269 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x8_SN_SU32_SUM3_TT4_4_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 270 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT16x16x16_SN_SU32_SUM3_TT2_2_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 271 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x16_SN_SU32_SUM3_TT4_4_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 2 + LSPB: 2 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 272 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT16x16x32_SN_SU32_SUM3_TT2_2_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 896 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 273 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x16x8_SN_SU0_SUM0_TT2_2_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 274 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x8_SN_SU0_SUM0_TT4_4_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 275 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x8_SN_SU0_SUM0_TT4_4_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 2 + LSPB: 2 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 276 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT16x16x32_SN_SU0_SUM0_TT2_2_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 896 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 277 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x16x8_SN_SU32_SUM3_TT2_2_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 278 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x8_SN_SU32_SUM3_TT4_4_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 279 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x16_SN_SU32_SUM3_TT4_4_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 512 + LdsNumElementsAlignedA: 128 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 256 + LdsOffsetB: 128 + LdsOffsetB_Blk: 384 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 280 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT16x16x8_SN_SU0_SUM0_TT2_2_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 281 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x8_SN_SU0_SUM0_TT4_4_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 282 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x8_SN_SU0_SUM0_TT4_4_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 283 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT16x16x16_SN_SU0_SUM0_TT2_2_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 284 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x8_SN_SU32_SUM3_TT4_4_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 285 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT16x16x16_SN_SU32_SUM3_TT2_2_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1664 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 286 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x16x8_SN_SU0_SUM0_TT4_2_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 832 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 64 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 287 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x8x8_SN_SU32_SUM3_TT2_2_WG16_4_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1600 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 64 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 8 + MacroTileA: 64 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 8 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 288 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x8x8_SN_SU32_SUM3_TT4_2_WG16_4_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1664 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 289 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x16x8_SN_SU32_SUM3_TT4_2_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3136 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 64 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 4 + MacroTileA: 64 + MacroTileB: 4 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 16 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 290 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x4x16_SN_SU32_SUM3_TT4_1_WG16_4_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 1] + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3328 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 8 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 291 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x16x16_SN_SU32_SUM3_TT4_2_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1664 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 292 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x16x8_SN_SU0_SUM0_TT4_2_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3328 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 8 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 293 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x16x16_SN_SU0_SUM0_TT4_2_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 832 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 64 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 294 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x8x8_SN_SU32_SUM3_TT2_2_WG16_4_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1600 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 64 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 8 + MacroTileA: 64 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 8 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 295 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x8x8_SN_SU32_SUM3_TT4_2_WG16_4_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1664 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 296 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x16x8_SN_SU32_SUM3_TT4_2_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3200 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 8 + MacroTileA: 64 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 297 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x8x16_SN_SU32_SUM3_TT2_2_WG32_4_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 32 + SubGroup1: 4 + SubGroupA: 32 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 832 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 64 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 298 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x8x8_SN_SU32_SUM3_TT2_2_WG16_4_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1600 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 64 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 8 + MacroTileA: 64 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 8 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 299 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x8x8_SN_SU32_SUM3_TT4_2_WG16_4_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1664 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 300 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x16x8_SN_SU32_SUM3_TT4_2_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3136 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 64 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 4 + MacroTileA: 64 + MacroTileB: 4 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 16 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 301 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x4x16_SN_SU32_SUM3_TT4_1_WG16_4_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 1] + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3200 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 8 + MacroTileA: 64 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 302 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x8x16_SN_SU32_SUM3_TT4_1_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 1] + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3328 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 8 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 303 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x16x16_SN_SU32_SUM3_TT4_2_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 832 + LdsNumElementsAlignedA: 64 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 64 + LdsOffsetB_Blk: 576 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 8 + MacroTile1: 32 + MacroTileA: 8 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 304 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT8x32x8_SN_SU32_SUM3_TT1_4_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [1, 4] + ThreadTile0: 1 + ThreadTile1: 4 + ThreadTileA: 1 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 832 + LdsNumElementsAlignedA: 64 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 64 + LdsOffsetB_Blk: 576 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 8 + MacroTile1: 32 + MacroTileA: 8 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 305 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT8x32x8_SN_SU32_SUM3_TT1_4_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [1, 4] + ThreadTile0: 1 + ThreadTile1: 4 + ThreadTileA: 1 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1664 + LdsNumElementsAlignedA: 128 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 128 + LdsOffsetB_Blk: 1152 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 8 + MacroTile1: 32 + MacroTileA: 8 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 8 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 306 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT8x32x16_SN_SU32_SUM3_TT1_4_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [1, 4] + ThreadTile0: 1 + ThreadTile1: 4 + ThreadTileA: 1 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1664 + LdsNumElementsAlignedA: 128 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 128 + LdsOffsetB_Blk: 1152 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 8 + MacroTile1: 32 + MacroTileA: 8 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 8 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 307 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT8x32x16_SN_SU0_SUM0_TT1_4_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [1, 4] + ThreadTile0: 1 + ThreadTile1: 4 + ThreadTileA: 1 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 832 + LdsNumElementsAlignedA: 64 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 64 + LdsOffsetB_Blk: 576 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 8 + MacroTile1: 32 + MacroTileA: 8 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 308 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT8x32x8_SN_SU32_SUM3_TT1_4_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [1, 4] + ThreadTile0: 1 + ThreadTile1: 4 + ThreadTileA: 1 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 896 + LdsNumElementsAlignedA: 128 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 128 + LdsOffsetB_Blk: 640 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 32 + MacroTileA: 16 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 309 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT16x32x8_SN_SU32_SUM3_TT2_4_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 310 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x8_SN_SU32_SUM3_TT2_4_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1664 + LdsNumElementsAlignedA: 128 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 128 + LdsOffsetB_Blk: 1152 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 8 + MacroTile1: 32 + MacroTileA: 8 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 8 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 311 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT8x32x16_SN_SU32_SUM3_TT1_4_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [1, 4] + ThreadTile0: 1 + ThreadTile1: 4 + ThreadTileA: 1 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 +- [2, 3, 0, 1] +- - - [1024, 4096, 1, 1024, 1024, 1024, 1024, 1024] + - [12, 11616.0] + - - [4096, 4096, 1, 1024, 4096, 4096, 1024, 1024] + - [14, 12032.0] + - - [1024, 4096, 1, 4096, 1024, 1024, 4096, 4096] + - [28, 11151.0] + - - [30528, 4096, 1, 1024, 30528, 30528, 1024, 1024] + - [49, 12402.0] + - - [1024, 2048, 1, 1024, 1024, 1024, 1024, 1024] + - [12, 10498.0] + - - [4096, 2048, 1, 1024, 4096, 4096, 1024, 1024] + - [14, 11908.0] + - - [1024, 2048, 1, 4096, 1024, 1024, 4096, 4096] + - [12, 10804.0] + - - [30528, 2048, 1, 1024, 30528, 30528, 1024, 1024] + - [49, 12317.0] + - - [30522, 320, 1, 768, 30522, 30522, 768, 768] + - [47, 9984.0] + - - [3072, 4096, 1, 768, 3072, 3072, 768, 768] + - [28, 11969.0] + - - [768, 4096, 1, 3072, 768, 768, 3072, 3072] + - [49, 11734.0] + - - [768, 4096, 1, 768, 768, 768, 768, 768] + - [49, 11396.0] + - - [30522, 160, 1, 768, 30522, 30522, 768, 768] + - [27, 8249.0] + - - [30522, 640, 1, 768, 30522, 30522, 768, 768] + - [12, 12048.0] + - - [30522, 1280, 1, 768, 30522, 30522, 768, 768] + - [49, 12200.0] + - - [1024, 3072, 1, 1024, 1024, 1024, 1024, 1024] + - [14, 11288.0] + - - [1024, 2048, 1, 3072, 1024, 1024, 3072, 3072] + - [12, 10811.0] + - - [1024, 3072, 1, 3072, 1024, 1024, 3072, 3072] + - [14, 11749.0] + - - [3072, 2048, 1, 1024, 3072, 3072, 1024, 1024] + - [28, 11603.0] + - - [3072, 3072, 1, 1024, 3072, 3072, 1024, 1024] + - [28, 12017.0] + - - [3072, 512, 1, 1024, 3072, 3072, 1024, 1024] + - [12, 10794.0] + - - [30522, 160, 1, 1024, 30522, 30522, 1024, 1024] + - [47, 7451.0] + - - [128, 128, 512, 64, 128, 128, 64, 64] + - [48, 9425.0] + - - [512, 512, 64, 64, 512, 512, 64, 64] + - [4, 10583.0] + - - [256, 256, 192, 64, 256, 256, 64, 64] + - [36, 10346.0] + - - [256, 256, 96, 64, 256, 256, 64, 64] + - [36, 9893.0] + - - [128, 128, 384, 64, 128, 128, 64, 64] + - [39, 9342.0] + - - [128, 128, 96, 64, 128, 128, 64, 64] + - [36, 7391.0] + - - [512, 512, 16, 64, 512, 512, 64, 64] + - [7, 9628.0] + - - [512, 512, 96, 64, 512, 512, 64, 64] + - [0, 10441.0] + - - [512, 512, 128, 64, 512, 512, 64, 64] + - [0, 7709.0] + - - [2944, 4288, 1, 1280, 2944, 2944, 1280, 1280] + - [30, 11908.0] + - - [2368, 5888, 1, 256, 2368, 2368, 256, 256] + - [12, 11418.0] + - - [5888, 1856, 1, 256, 5888, 5888, 256, 256] + - [20, 11280.0] + - - [512, 24000, 1, 1536, 512, 512, 1536, 1536] + - [30, 12142.0] + - - [5888, 1408, 1, 256, 5888, 5888, 256, 256] + - [28, 11399.0] + - - [5888, 1856, 1, 3328, 5888, 5888, 3328, 3328] + - [12, 11673.0] + - - [5056, 704, 1, 256, 5056, 5056, 256, 256] + - [9, 9737.0] + - - [5888, 2944, 1, 3328, 5888, 5888, 3328, 3328] + - [12, 12248.0] + - - [1856, 4288, 1, 256, 1856, 1856, 256, 256] + - [20, 10851.0] + - - [1024, 5056, 1, 128, 1024, 1024, 128, 128] + - [20, 10278.0] + - - [5056, 5056, 1, 3328, 5056, 5056, 3328, 3328] + - [30, 12145.0] + - - [1408, 5888, 1, 1280, 1408, 1408, 1280, 1280] + - [30, 11986.0] + - - [2368, 6784, 1, 128, 2368, 2368, 128, 128] + - [17, 11102.0] + - - [1024, 3584, 1, 3328, 1024, 1024, 3328, 3328] + - [14, 11488.0] + - - [512, 48000, 1, 2048, 512, 512, 2048, 2048] + - [49, 12248.0] + - - [5888, 1408, 1, 1280, 5888, 5888, 1280, 1280] + - [28, 11835.0] + - - [1024, 2368, 1, 256, 1024, 1024, 256, 256] + - [28, 9715.0] + - - [1408, 1856, 1, 1280, 1408, 1408, 1280, 1280] + - [28, 10455.0] + - - [6144, 24000, 1, 2048, 6144, 6144, 2048, 2048] + - [6, 12420.0] + - - [5056, 5056, 1, 1280, 5056, 5056, 1280, 1280] + - [14, 9910.0] + - - [448, 5056, 1, 256, 448, 448, 256, 256] + - [9, 7586.0] + - - [1760, 6400, 1, 1760, 1760, 1760, 1760, 1760] + - [38, 10217.0] + - - [1856, 1408, 1, 128, 1856, 1856, 128, 128] + - [9, 7911.0] + - - [6784, 256, 1, 3328, 6784, 6784, 3328, 3328] + - [49, 9768.0] + - - [6784, 4288, 1, 3328, 6784, 6784, 3328, 3328] + - [30, 10972.0] + - - [4288, 448, 1, 256, 4288, 4288, 256, 256] + - [9, 7460.0] + - - [1856, 2368, 1, 3328, 1856, 1856, 3328, 3328] + - [12, 10500.0] + - - [4288, 2944, 1, 1280, 4288, 4288, 1280, 1280] + - [28, 9449.0] + - - [704, 5056, 1, 1280, 704, 704, 1280, 1280] + - [30, 10439.0] + - - [2368, 704, 1, 3328, 2368, 2368, 3328, 3328] + - [28, 9800.0] + - - [256, 5888, 1, 256, 256, 256, 256, 256] + - [29, 8590.0] + - - [1856, 4288, 1, 3328, 1856, 1856, 3328, 3328] + - [47, 9115.0] + - - [5888, 1024, 1, 256, 5888, 5888, 256, 256] + - [9, 9879.0] + - - [16384, 3200, 1, 4096, 16384, 16384, 4096, 4096] + - [14, 10129.0] + - - [1408, 2944, 1, 256, 1408, 1408, 256, 256] + - [26, 9443.0] + - - [6784, 5056, 1, 3328, 6784, 6784, 3328, 3328] + - [12, 11212.0] + - - [5056, 5056, 1, 256, 5056, 5056, 256, 256] + - [28, 11484.0] + - - [1408, 6784, 1, 128, 1408, 1408, 128, 128] + - [9, 10818.0] + - - [704, 5056, 1, 128, 704, 704, 128, 128] + - [18, 9309.0] + - - [2368, 2944, 1, 1280, 2368, 2368, 1280, 1280] + - [47, 11659.0] + - - [6784, 6784, 1, 1280, 6784, 6784, 1280, 1280] + - [12, 11077.0] + - - [1408, 4288, 1, 1280, 1408, 1408, 1280, 1280] + - [28, 11211.0] + - - [3584, 4288, 1, 1280, 3584, 3584, 1280, 1280] + - [14, 8807.0] + - - [2368, 704, 1, 1280, 2368, 2368, 1280, 1280] + - [28, 9173.0] + - - [5056, 4288, 1, 3328, 5056, 5056, 3328, 3328] + - [12, 10713.0] + - - [3584, 2368, 1, 3328, 3584, 3584, 3328, 3328] + - [12, 8327.0] + - - [6784, 448, 1, 1280, 6784, 6784, 1280, 1280] + - [38, 9433.0] + - - [1408, 2944, 1, 128, 1408, 1408, 128, 128] + - [18, 9914.0] + - - [4288, 2944, 1, 256, 4288, 4288, 256, 256] + - [38, 11300.0] + - - [5888, 704, 1, 1280, 5888, 5888, 1280, 1280] + - [22, 10568.0] + - - [448, 5888, 1, 128, 448, 448, 128, 128] + - [18, 7893.0] + - - [5056, 2368, 1, 1280, 5056, 5056, 1280, 1280] + - [20, 9885.0] + - - [448, 3584, 1, 1280, 448, 448, 1280, 1280] + - [22, 9174.0] + - - [6784, 5888, 1, 256, 6784, 6784, 256, 256] + - [20, 12031.0] + - - [1024, 1408, 1, 256, 1024, 1024, 256, 256] + - [9, 8689.0] + - - [2368, 2368, 1, 3328, 2368, 2368, 3328, 3328] + - [12, 10545.0] + - - [1856, 6784, 1, 128, 1856, 1856, 128, 128] + - [26, 10617.0] + - - [5056, 704, 1, 3328, 5056, 5056, 3328, 3328] + - [30, 10575.0] + - - [1408, 1856, 1, 256, 1408, 1408, 256, 256] + - [39, 8354.0] + - - [2368, 5056, 1, 256, 2368, 2368, 256, 256] + - [20, 10946.0] + - - [3584, 2368, 1, 1280, 3584, 3584, 1280, 1280] + - [28, 11166.0] + - - [704, 5888, 1, 256, 704, 704, 256, 256] + - [28, 9672.0] + - - [6784, 2944, 1, 128, 6784, 6784, 128, 128] + - [47, 10231.0] + - - [2560, 1600, 1, 2560, 2560, 2560, 2560, 2560] + - [47, 11212.0] + - - [4288, 6784, 1, 3328, 4288, 4288, 3328, 3328] + - [30, 11144.0] + - - [2944, 6784, 1, 3328, 2944, 2944, 3328, 3328] + - [28, 10833.0] + - - [6144, 5984, 1, 2048, 6144, 6144, 2048, 2048] + - [47, 8709.0] + - - [3584, 704, 1, 3328, 3584, 3584, 3328, 3328] + - [12, 9904.0] + - - [2048, 1600, 1, 512, 2048, 2048, 512, 512] + - [28, 9738.0] + - - [448, 4288, 1, 256, 448, 448, 256, 256] + - [25, 8372.0] + - - [1856, 4288, 1, 128, 1856, 1856, 128, 128] + - [4, 9860.0] + - - [704, 2368, 1, 1280, 704, 704, 1280, 1280] + - [14, 9729.0] + - - [1856, 2368, 1, 1280, 1856, 1856, 1280, 1280] + - [47, 10513.0] + - - [1856, 4288, 1, 1280, 1856, 1856, 1280, 1280] + - [30, 11258.0] + - - [704, 2944, 1, 128, 704, 704, 128, 128] + - [25, 7952.0] + - - [1408, 1024, 1, 1280, 1408, 1408, 1280, 1280] + - [28, 9787.0] + - - [704, 6784, 1, 256, 704, 704, 256, 256] + - [28, 9850.0] + - - [6784, 704, 1, 256, 6784, 6784, 256, 256] + - [12, 9889.0] + - - [5056, 1408, 1, 128, 5056, 5056, 128, 128] + - [26, 10188.0] + - - [2048, 7000, 1, 2048, 2048, 2048, 2048, 2048] + - [12, 8113.0] + - - [3584, 4288, 1, 3328, 3584, 3584, 3328, 3328] + - [14, 10344.0] + - - [5888, 1856, 1, 1280, 5888, 5888, 1280, 1280] + - [47, 10523.0] + - - [2368, 3584, 1, 1280, 2368, 2368, 1280, 1280] + - [28, 11408.0] + - - [2368, 6784, 1, 1280, 2368, 2368, 1280, 1280] + - [30, 8562.0] + - - [2944, 3584, 1, 3328, 2944, 2944, 3328, 3328] + - [20, 9565.0] + - - [6784, 2944, 1, 256, 6784, 6784, 256, 256] + - [20, 11759.0] + - - [4288, 2368, 1, 3328, 4288, 4288, 3328, 3328] + - [12, 9714.0] + - - [1856, 2368, 1, 256, 1856, 1856, 256, 256] + - [26, 9017.0] + - - [3584, 6784, 1, 3328, 3584, 3584, 3328, 3328] + - [28, 10932.0] + - - [1024, 5888, 1, 3328, 1024, 1024, 3328, 3328] + - [12, 8542.0] + - - [6144, 24000, 1, 2560, 6144, 6144, 2560, 2560] + - [28, 11484.0] + - - [5056, 4288, 1, 1280, 5056, 5056, 1280, 1280] + - [12, 9505.0] + - - [6784, 1856, 1, 3328, 6784, 6784, 3328, 3328] + - [47, 9627.0] + - - [1408, 5056, 1, 1280, 1408, 1408, 1280, 1280] + - [28, 11807.0] + - - [2368, 2368, 1, 1280, 2368, 2368, 1280, 1280] + - [28, 10823.0] + - - [2944, 5888, 1, 128, 2944, 2944, 128, 128] + - [44, 10456.0] + - - [704, 5888, 1, 1280, 704, 704, 1280, 1280] + - [14, 10503.0] + - - [2368, 3584, 1, 128, 2368, 2368, 128, 128] + - [25, 10059.0] + - - [1856, 5056, 1, 128, 1856, 1856, 128, 128] + - [26, 10136.0] + - - [8192, 3200, 1, 2048, 8192, 8192, 2048, 2048] + - [12, 9390.0] + - - [1024, 5056, 1, 1280, 1024, 1024, 1280, 1280] + - [38, 11103.0] + - - [4288, 1024, 1, 256, 4288, 4288, 256, 256] + - [20, 10171.0] + - - [2944, 2368, 1, 128, 2944, 2944, 128, 128] + - [17, 10138.0] + - - [5888, 448, 1, 1280, 5888, 5888, 1280, 1280] + - [49, 9411.0] + - - [704, 5888, 1, 3328, 704, 704, 3328, 3328] + - [4, 7713.0] + - - [3584, 2944, 1, 256, 3584, 3584, 256, 256] + - [47, 10577.0] + - - [512, 24000, 1, 2048, 512, 512, 2048, 2048] + - [47, 6996.0] + - - [1408, 5056, 1, 3328, 1408, 1408, 3328, 3328] + - [30, 8748.0] + - - [1856, 1856, 1, 3328, 1856, 1856, 3328, 3328] + - [12, 10297.0] + - - [2560, 800, 1, 2560, 2560, 2560, 2560, 2560] + - [28, 10287.0] + - - [2368, 2368, 1, 256, 2368, 2368, 256, 256] + - [20, 10089.0] + - - [4288, 4288, 1, 1280, 4288, 4288, 1280, 1280] + - [14, 9643.0] + - - [5888, 1024, 1, 1280, 5888, 5888, 1280, 1280] + - [28, 11326.0] + - - [1408, 4288, 1, 256, 1408, 1408, 256, 256] + - [26, 9720.0] + - - [5888, 448, 1, 128, 5888, 5888, 128, 128] + - [43, 8337.0] + - - [512, 48000, 1, 2560, 512, 512, 2560, 2560] + - [12, 9958.0] + - - [704, 6784, 1, 3328, 704, 704, 3328, 3328] + - [28, 7616.0] + - - [2560, 6400, 1, 2560, 2560, 2560, 2560, 2560] + - [28, 9727.0] + - - [5056, 1024, 1, 1280, 5056, 5056, 1280, 1280] + - [14, 11277.0] + - - [448, 5888, 1, 3328, 448, 448, 3328, 3328] + - [12, 7503.0] + - - [1024, 2944, 1, 1280, 1024, 1024, 1280, 1280] + - [12, 10359.0] + - - [5056, 5888, 1, 1280, 5056, 5056, 1280, 1280] + - [12, 10122.0] + - - [4288, 5888, 1, 128, 4288, 4288, 128, 128] + - [44, 11166.0] + - - [1408, 3584, 1, 128, 1408, 1408, 128, 128] + - [18, 10345.0] + - - [448, 3584, 1, 128, 448, 448, 128, 128] + - [18, 8028.0] + - - [5888, 2944, 1, 1280, 5888, 5888, 1280, 1280] + - [12, 9577.0] + - - [2368, 5888, 1, 128, 2368, 2368, 128, 128] + - [8, 9780.0] + - - [3584, 5888, 1, 256, 3584, 3584, 256, 256] + - [28, 11742.0] + - - [2368, 1024, 1, 128, 2368, 2368, 128, 128] + - [26, 8560.0] + - - [2368, 704, 1, 128, 2368, 2368, 128, 128] + - [18, 7776.0] + - - [3584, 2944, 1, 1280, 3584, 3584, 1280, 1280] + - [28, 11571.0] + - - [3584, 2368, 1, 128, 3584, 3584, 128, 128] + - [28, 10064.0] + - - [5056, 704, 1, 128, 5056, 5056, 128, 128] + - [43, 8519.0] + - - [5056, 1408, 1, 3328, 5056, 5056, 3328, 3328] + - [12, 8945.0] + - - [6784, 1024, 1, 3328, 6784, 6784, 3328, 3328] + - [30, 8718.0] + - - [6784, 2944, 1, 3328, 6784, 6784, 3328, 3328] + - [28, 10648.0] + - - [2944, 5056, 1, 3328, 2944, 2944, 3328, 3328] + - [20, 10129.0] + - - [1856, 1856, 1, 256, 1856, 1856, 256, 256] + - [26, 8629.0] + - - [1024, 5888, 1, 128, 1024, 1024, 128, 128] + - [12, 9904.0] + - - [6784, 2368, 1, 1280, 6784, 6784, 1280, 1280] + - [12, 9192.0] + - - [4288, 5888, 1, 1280, 4288, 4288, 1280, 1280] + - [30, 10109.0] + - - [4288, 4288, 1, 256, 4288, 4288, 256, 256] + - [20, 11358.0] + - - [4288, 1856, 1, 1280, 4288, 4288, 1280, 1280] + - [20, 11167.0] + - - [1856, 2944, 1, 3328, 1856, 1856, 3328, 3328] + - [38, 10339.0] + - - [256, 6784, 1, 3328, 256, 256, 3328, 3328] + - [14, 10409.0] + - - [256, 5056, 1, 128, 256, 256, 128, 128] + - [0, 6886.0] + - - [5056, 1024, 1, 256, 5056, 5056, 256, 256] + - [20, 10573.0] + - - [5056, 1856, 1, 3328, 5056, 5056, 3328, 3328] + - [38, 9261.0] + - - [1856, 1408, 1, 256, 1856, 1856, 256, 256] + - [26, 8312.0] + - - [8448, 12000, 1, 2816, 8448, 8448, 2816, 2816] + - [20, 12128.0] + - - [4288, 1408, 1, 128, 4288, 4288, 128, 128] + - [9, 8633.0] + - - [1856, 5888, 1, 3328, 1856, 1856, 3328, 3328] + - [14, 9226.0] + - - [4288, 5056, 1, 256, 4288, 4288, 256, 256] + - [47, 11355.0] + - - [4096, 800, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 9241.0] + - - [5056, 256, 1, 3328, 5056, 5056, 3328, 3328] + - [14, 11046.0] + - - [1024, 5888, 1, 1280, 1024, 1024, 1280, 1280] + - [28, 11190.0] + - - [6784, 2368, 1, 128, 6784, 6784, 128, 128] + - [38, 10838.0] + - - [1856, 1024, 1, 1280, 1856, 1856, 1280, 1280] + - [30, 10799.0] + - - [6784, 4288, 1, 1280, 6784, 6784, 1280, 1280] + - [14, 10540.0] + - - [1856, 1856, 1, 1280, 1856, 1856, 1280, 1280] + - [12, 9876.0] + - - [4096, 400, 1, 1024, 4096, 4096, 1024, 1024] + - [47, 7955.0] + - - [3072, 24000, 1, 1024, 3072, 3072, 1024, 1024] + - [12, 10663.0] + - - [5888, 1856, 1, 128, 5888, 5888, 128, 128] + - [36, 9287.0] + - - [5056, 3584, 1, 128, 5056, 5056, 128, 128] + - [28, 10529.0] + - - [5888, 5888, 1, 3328, 5888, 5888, 3328, 3328] + - [28, 11404.0] + - - [6784, 1024, 1, 256, 6784, 6784, 256, 256] + - [26, 9934.0] + - - [2944, 2368, 1, 256, 2944, 2944, 256, 256] + - [26, 9790.0] + - - [5056, 5888, 1, 3328, 5056, 5056, 3328, 3328] + - [12, 11217.0] + - - [1856, 1024, 1, 256, 1856, 1856, 256, 256] + - [26, 7593.0] + - - [512, 48000, 1, 1536, 512, 512, 1536, 1536] + - [30, 8988.0] + - - [3584, 448, 1, 1280, 3584, 3584, 1280, 1280] + - [11, 8940.0] + - - [8448, 5984, 1, 2816, 8448, 8448, 2816, 2816] + - [47, 11309.0] + - - [448, 5888, 1, 256, 448, 448, 256, 256] + - [9, 7852.0] + - - [1408, 6784, 1, 3328, 1408, 1408, 3328, 3328] + - [12, 9507.0] + - - [4288, 704, 1, 128, 4288, 4288, 128, 128] + - [36, 6967.0] + - - [5056, 2944, 1, 256, 5056, 5056, 256, 256] + - [20, 11572.0] + - - [6784, 5888, 1, 128, 6784, 6784, 128, 128] + - [4, 11754.0] + - - [2944, 704, 1, 128, 2944, 2944, 128, 128] + - [9, 8541.0] + - - [1408, 3584, 1, 3328, 1408, 1408, 3328, 3328] + - [49, 10136.0] + - - [2368, 6784, 1, 256, 2368, 2368, 256, 256] + - [20, 11278.0] + - - [5056, 1408, 1, 1280, 5056, 5056, 1280, 1280] + - [4, 11540.0] + - - [5056, 4288, 1, 128, 5056, 5056, 128, 128] + - [38, 10793.0] + - - [1408, 1856, 1, 128, 1408, 1408, 128, 128] + - [26, 8963.0] + - - [1408, 5888, 1, 3328, 1408, 1408, 3328, 3328] + - [49, 9885.0] + - - [6784, 6784, 1, 256, 6784, 6784, 256, 256] + - [28, 11962.0] + - - [4288, 2368, 1, 128, 4288, 4288, 128, 128] + - [35, 10307.0] + - - [2368, 2944, 1, 256, 2368, 2368, 256, 256] + - [12, 11128.0] + - - [3584, 1856, 1, 1280, 3584, 3584, 1280, 1280] + - [28, 11487.0] + - - [6784, 6784, 1, 128, 6784, 6784, 128, 128] + - [17, 11766.0] + - - [5888, 5056, 1, 256, 5888, 5888, 256, 256] + - [47, 11750.0] + - - [8448, 48000, 1, 2816, 8448, 8448, 2816, 2816] + - [49, 12268.0] + - - [3584, 448, 1, 256, 3584, 3584, 256, 256] + - [46, 6671.0] + - - [448, 4288, 1, 128, 448, 448, 128, 128] + - [26, 7474.0] + - - [256, 6784, 1, 256, 256, 256, 256, 256] + - [28, 8616.0] + - - [1408, 4288, 1, 128, 1408, 1408, 128, 128] + - [18, 10060.0] + - - [2944, 704, 1, 3328, 2944, 2944, 3328, 3328] + - [28, 10462.0] + - - [5056, 256, 1, 1280, 5056, 5056, 1280, 1280] + - [30, 10678.0] + - - [3584, 3584, 1, 256, 3584, 3584, 256, 256] + - [20, 10993.0] + - - [3584, 5056, 1, 256, 3584, 3584, 256, 256] + - [28, 11548.0] + - - [2944, 2368, 1, 1280, 2944, 2944, 1280, 1280] + - [28, 11650.0] + - - [1408, 3584, 1, 256, 1408, 1408, 256, 256] + - [20, 10691.0] + - - [6784, 3584, 1, 256, 6784, 6784, 256, 256] + - [47, 11790.0] + - - [5056, 2368, 1, 128, 5056, 5056, 128, 128] + - [35, 10429.0] + - - [2944, 2944, 1, 3328, 2944, 2944, 3328, 3328] + - [49, 9597.0] + - - [5056, 6784, 1, 256, 5056, 5056, 256, 256] + - [28, 11669.0] + - - [1856, 3584, 1, 128, 1856, 1856, 128, 128] + - [18, 10306.0] + - - [6784, 448, 1, 256, 6784, 6784, 256, 256] + - [11, 9358.0] + - - [3584, 6784, 1, 128, 3584, 3584, 128, 128] + - [4, 10566.0] + - - [5056, 1856, 1, 256, 5056, 5056, 256, 256] + - [12, 10819.0] + - - [4608, 5984, 1, 1536, 4608, 4608, 1536, 1536] + - [47, 10034.0] + - - [1760, 3200, 1, 1760, 1760, 1760, 1760, 1760] + - [17, 11588.0] + - - [1024, 1856, 1, 256, 1024, 1024, 256, 256] + - [20, 9215.0] + - - [4096, 1600, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 11246.0] + - - [1408, 6784, 1, 1280, 1408, 1408, 1280, 1280] + - [47, 11492.0] + - - [3584, 3584, 1, 1280, 3584, 3584, 1280, 1280] + - [14, 10752.0] + - - [7680, 24000, 1, 2560, 7680, 7680, 2560, 2560] + - [28, 12025.0] + - - [4608, 48000, 1, 1536, 4608, 4608, 1536, 1536] + - [30, 12136.0] + - - [5888, 5888, 1, 128, 5888, 5888, 128, 128] + - [20, 11611.0] + - - [5056, 2368, 1, 3328, 5056, 5056, 3328, 3328] + - [22, 10080.0] + - - [2944, 4288, 1, 256, 2944, 2944, 256, 256] + - [9, 10057.0] + - - [1408, 3584, 1, 1280, 1408, 1408, 1280, 1280] + - [49, 11461.0] + - - [8192, 1600, 1, 2048, 8192, 8192, 2048, 2048] + - [12, 7451.0] + - - [512, 24000, 1, 2560, 512, 512, 2560, 2560] + - [28, 8273.0] + - - [2368, 6784, 1, 3328, 2368, 2368, 3328, 3328] + - [20, 10265.0] + - - [1856, 1408, 1, 1280, 1856, 1856, 1280, 1280] + - [12, 9789.0] + - - [6784, 704, 1, 128, 6784, 6784, 128, 128] + - [36, 9057.0] + - - [1408, 5888, 1, 256, 1408, 1408, 256, 256] + - [28, 11269.0] + - - [704, 2944, 1, 1280, 704, 704, 1280, 1280] + - [20, 10067.0] + - - [704, 6784, 1, 128, 704, 704, 128, 128] + - [18, 9399.0] + - - [3584, 704, 1, 1280, 3584, 3584, 1280, 1280] + - [47, 9842.0] + - - [5888, 2368, 1, 256, 5888, 5888, 256, 256] + - [38, 11357.0] + - - [2944, 6784, 1, 128, 2944, 2944, 128, 128] + - [38, 11468.0] + - - [3584, 448, 1, 3328, 3584, 3584, 3328, 3328] + - [46, 9413.0] + - - [704, 2368, 1, 3328, 704, 704, 3328, 3328] + - [14, 10018.0] + - - [256, 5888, 1, 128, 256, 256, 128, 128] + - [21, 7484.0] + - - [2048, 3200, 1, 512, 2048, 2048, 512, 512] + - [20, 11222.0] + - - [2944, 2944, 1, 1280, 2944, 2944, 1280, 1280] + - [28, 11691.0] + - - [5056, 448, 1, 3328, 5056, 5056, 3328, 3328] + - [40, 9427.0] + - - [6784, 704, 1, 3328, 6784, 6784, 3328, 3328] + - [30, 7439.0] + - - [5888, 4288, 1, 128, 5888, 5888, 128, 128] + - [0, 3607.0] + - - [1408, 2944, 1, 3328, 1408, 1408, 3328, 3328] + - [47, 11178.0] + - - [3584, 704, 1, 128, 3584, 3584, 128, 128] + - [43, 8119.0] + - - [4608, 12000, 1, 1536, 4608, 4608, 1536, 1536] + - [12, 11115.0] + - - [5056, 5056, 1, 128, 5056, 5056, 128, 128] + - [0, 3778.0] + - - [8192, 800, 1, 2048, 8192, 8192, 2048, 2048] + - [28, 7030.0] + - - [448, 5056, 1, 128, 448, 448, 128, 128] + - [9, 6574.0] + - - [5056, 3584, 1, 256, 5056, 5056, 256, 256] + - [28, 11574.0] + - - [1408, 5056, 1, 128, 1408, 1408, 128, 128] + - [28, 10551.0] + - - [2944, 3584, 1, 128, 2944, 2944, 128, 128] + - [38, 11074.0] + - - [3584, 2368, 1, 256, 3584, 3584, 256, 256] + - [28, 11080.0] + - - [8448, 24000, 1, 2816, 8448, 8448, 2816, 2816] + - [14, 12123.0] + - - [3584, 3584, 1, 3328, 3584, 3584, 3328, 3328] + - [38, 10316.0] + - - [5888, 6784, 1, 256, 5888, 5888, 256, 256] + - [28, 11950.0] + - - [4288, 2944, 1, 3328, 4288, 4288, 3328, 3328] + - [28, 10239.0] + - - [256, 5056, 1, 1280, 256, 256, 1280, 1280] + - [9, 9360.0] + - - [2944, 5888, 1, 3328, 2944, 2944, 3328, 3328] + - [30, 10927.0] + - - [6784, 5888, 1, 1280, 6784, 6784, 1280, 1280] + - [28, 10745.0] + - - [2048, 800, 1, 512, 2048, 2048, 512, 512] + - [9, 7963.0] + - - [5888, 4288, 1, 1280, 5888, 5888, 1280, 1280] + - [28, 11861.0] + - - [1024, 24000, 1, 2048, 1024, 1024, 2048, 2048] + - [28, 9483.0] + - - [5888, 3584, 1, 128, 5888, 5888, 128, 128] + - [20, 11152.0] + - - [1024, 2944, 1, 128, 1024, 1024, 128, 128] + - [9, 8858.0] + - - [704, 3584, 1, 128, 704, 704, 128, 128] + - [26, 8715.0] + - - [5888, 448, 1, 3328, 5888, 5888, 3328, 3328] + - [28, 7373.0] + - - [2368, 4288, 1, 1280, 2368, 2368, 1280, 1280] + - [12, 11183.0] + - - [4288, 2944, 1, 128, 4288, 4288, 128, 128] + - [36, 10534.0] + - - [1024, 6784, 1, 3328, 1024, 1024, 3328, 3328] + - [4, 8281.0] + - - [5056, 2944, 1, 3328, 5056, 5056, 3328, 3328] + - [12, 10195.0] + - - [2944, 3584, 1, 256, 2944, 2944, 256, 256] + - [20, 10401.0] + - - [1408, 1408, 1, 3328, 1408, 1408, 3328, 3328] + - [12, 10156.0] + - - [3584, 3584, 1, 128, 3584, 3584, 128, 128] + - [4, 10414.0] + - - [3584, 704, 1, 256, 3584, 3584, 256, 256] + - [46, 8824.0] + - - [3584, 1408, 1, 3328, 3584, 3584, 3328, 3328] + - [20, 10622.0] + - - [704, 3584, 1, 1280, 704, 704, 1280, 1280] + - [28, 9685.0] + - - [2944, 6784, 1, 1280, 2944, 2944, 1280, 1280] + - [12, 9603.0] + - - [1856, 6784, 1, 256, 1856, 1856, 256, 256] + - [9, 10148.0] + - - [4288, 448, 1, 3328, 4288, 4288, 3328, 3328] + - [12, 9581.0] + - - [6784, 4288, 1, 128, 6784, 6784, 128, 128] + - [20, 11092.0] + - - [6784, 704, 1, 1280, 6784, 6784, 1280, 1280] + - [30, 10459.0] + - - [3584, 6784, 1, 256, 3584, 3584, 256, 256] + - [12, 11813.0] + - - [6144, 12000, 1, 2048, 6144, 6144, 2048, 2048] + - [47, 10643.0] + - - [5888, 1024, 1, 3328, 5888, 5888, 3328, 3328] + - [14, 8423.0] + - - [704, 6784, 1, 1280, 704, 704, 1280, 1280] + - [12, 10391.0] + - - [1856, 5056, 1, 3328, 1856, 1856, 3328, 3328] + - [47, 9326.0] + - - [1024, 3584, 1, 128, 1024, 1024, 128, 128] + - [9, 7954.0] + - - [1024, 1408, 1, 128, 1024, 1024, 128, 128] + - [26, 7203.0] + - - [2368, 2944, 1, 128, 2368, 2368, 128, 128] + - [26, 9643.0] + - - [5056, 2944, 1, 128, 5056, 5056, 128, 128] + - [38, 10757.0] + - - [5888, 5056, 1, 3328, 5888, 5888, 3328, 3328] + - [14, 11211.0] + - - [5888, 2368, 1, 128, 5888, 5888, 128, 128] + - [9, 9296.0] + - - [3584, 6784, 1, 1280, 3584, 3584, 1280, 1280] + - [14, 10033.0] + - - [1856, 5888, 1, 256, 1856, 1856, 256, 256] + - [26, 9916.0] + - - [4288, 4288, 1, 3328, 4288, 4288, 3328, 3328] + - [12, 10731.0] + - - [4288, 1408, 1, 1280, 4288, 4288, 1280, 1280] + - [4, 11428.0] + - - [3584, 5056, 1, 128, 3584, 3584, 128, 128] + - [26, 9818.0] + - - [4288, 2368, 1, 256, 4288, 4288, 256, 256] + - [28, 10581.0] + - - [2944, 5056, 1, 1280, 2944, 2944, 1280, 1280] + - [40, 8886.0] + - - [448, 6784, 1, 256, 448, 448, 256, 256] + - [28, 8111.0] + - - [1856, 2368, 1, 128, 1856, 1856, 128, 128] + - [9, 9106.0] + - - [6784, 2368, 1, 3328, 6784, 6784, 3328, 3328] + - [28, 9982.0] + - - [4288, 1856, 1, 3328, 4288, 4288, 3328, 3328] + - [28, 9362.0] + - - [3584, 448, 1, 128, 3584, 3584, 128, 128] + - [24, 6016.0] + - - [2048, 1600, 1, 2048, 2048, 2048, 2048, 2048] + - [12, 10616.0] + - - [3584, 1024, 1, 1280, 3584, 3584, 1280, 1280] + - [28, 10881.0] + - - [1856, 5056, 1, 256, 1856, 1856, 256, 256] + - [20, 11003.0] + - - [1024, 4288, 1, 256, 1024, 1024, 256, 256] + - [12, 10422.0] + - - [5888, 3584, 1, 3328, 5888, 5888, 3328, 3328] + - [47, 10764.0] + - - [5056, 3584, 1, 3328, 5056, 5056, 3328, 3328] + - [30, 10623.0] + - - [2368, 1408, 1, 1280, 2368, 2368, 1280, 1280] + - [28, 10292.0] + - - [5056, 2944, 1, 1280, 5056, 5056, 1280, 1280] + - [12, 8654.0] + - - [1024, 6784, 1, 256, 1024, 1024, 256, 256] + - [26, 9637.0] + - - [5124, 9124, 1, 2048, 5124, 5124, 2048, 2048] + - [49, 10617.0] + - - [2944, 1408, 1, 128, 2944, 2944, 128, 128] + - [9, 8625.0] + - - [3584, 1408, 1, 1280, 3584, 3584, 1280, 1280] + - [12, 11378.0] + - - [5056, 6784, 1, 3328, 5056, 5056, 3328, 3328] + - [12, 11370.0] + - - [3584, 4288, 1, 256, 3584, 3584, 256, 256] + - [26, 10047.0] + - - [1856, 6784, 1, 3328, 1856, 1856, 3328, 3328] + - [12, 9868.0] + - - [5888, 4288, 1, 256, 5888, 5888, 256, 256] + - [12, 11604.0] + - - [5056, 1408, 1, 256, 5056, 5056, 256, 256] + - [38, 10879.0] + - - [3584, 1024, 1, 256, 3584, 3584, 256, 256] + - [47, 10521.0] + - - [5888, 5888, 1, 256, 5888, 5888, 256, 256] + - [47, 11973.0] + - - [4288, 1024, 1, 1280, 4288, 4288, 1280, 1280] + - [14, 11226.0] + - - [448, 6784, 1, 3328, 448, 448, 3328, 3328] + - [29, 6201.0] + - - [2944, 1408, 1, 1280, 2944, 2944, 1280, 1280] + - [28, 11069.0] + - - [2944, 1856, 1, 3328, 2944, 2944, 3328, 3328] + - [20, 10071.0] + - - [3584, 5888, 1, 1280, 3584, 3584, 1280, 1280] + - [28, 9561.0] + - - [6784, 1856, 1, 1280, 6784, 6784, 1280, 1280] + - [12, 8561.0] + - - [2944, 5056, 1, 256, 2944, 2944, 256, 256] + - [28, 10569.0] + - - [5888, 256, 1, 3328, 5888, 5888, 3328, 3328] + - [15, 8010.0] + - - [2944, 4288, 1, 128, 2944, 2944, 128, 128] + - [36, 9753.0] + - - [3584, 1408, 1, 256, 3584, 3584, 256, 256] + - [28, 10615.0] + - - [704, 3584, 1, 3328, 704, 704, 3328, 3328] + - [47, 10070.0] + - - [4096, 3200, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 10335.0] + - - [5056, 448, 1, 1280, 5056, 5056, 1280, 1280] + - [30, 9561.0] + - - [3584, 1856, 1, 3328, 3584, 3584, 3328, 3328] + - [20, 9595.0] + - - [4288, 6784, 1, 1280, 4288, 4288, 1280, 1280] + - [49, 10529.0] + - - [2560, 7000, 1, 2560, 2560, 2560, 2560, 2560] + - [49, 9982.0] + - - [2944, 1024, 1, 256, 2944, 2944, 256, 256] + - [26, 8957.0] + - - [2368, 4288, 1, 3328, 2368, 2368, 3328, 3328] + - [12, 9729.0] + - - [1024, 1408, 1, 1280, 1024, 1024, 1280, 1280] + - [12, 9028.0] + - - [6784, 5056, 1, 256, 6784, 6784, 256, 256] + - [28, 11792.0] + - - [1856, 1856, 1, 128, 1856, 1856, 128, 128] + - [9, 8995.0] + - - [3584, 5056, 1, 3328, 3584, 3584, 3328, 3328] + - [47, 10698.0] + - - [448, 6784, 1, 128, 448, 448, 128, 128] + - [18, 6806.0] + - - [2944, 6784, 1, 256, 2944, 2944, 256, 256] + - [47, 11859.0] + - - [2944, 2944, 1, 128, 2944, 2944, 128, 128] + - [17, 10993.0] + - - [1856, 3584, 1, 1280, 1856, 1856, 1280, 1280] + - [47, 11539.0] + - - [4288, 448, 1, 128, 4288, 4288, 128, 128] + - [9, 7747.0] + - - [4608, 24000, 1, 1536, 4608, 4608, 1536, 1536] + - [49, 12421.0] + - - [1856, 1408, 1, 3328, 1856, 1856, 3328, 3328] + - [12, 10542.0] + - - [1024, 4288, 1, 3328, 1024, 1024, 3328, 3328] + - [14, 11728.0] + - - [5056, 448, 1, 256, 5056, 5056, 256, 256] + - [11, 9146.0] + - - [2944, 2368, 1, 3328, 2944, 2944, 3328, 3328] + - [12, 11761.0] + - - [704, 4288, 1, 3328, 704, 704, 3328, 3328] + - [12, 10064.0] + - - [1024, 1856, 1, 1280, 1024, 1024, 1280, 1280] + - [28, 10804.0] + - - [2048, 6400, 1, 2048, 2048, 2048, 2048, 2048] + - [14, 12291.0] + - - [512, 48000, 1, 2816, 512, 512, 2816, 2816] + - [49, 12290.0] + - - [5124, 9124, 1, 2560, 5124, 5124, 2560, 2560] + - [49, 11925.0] + - - [1024, 5888, 1, 256, 1024, 1024, 256, 256] + - [12, 11093.0] + - - [1408, 2368, 1, 256, 1408, 1408, 256, 256] + - [45, 10142.0] + - - [1408, 1408, 1, 256, 1408, 1408, 256, 256] + - [9, 9357.0] + - - [2368, 2368, 1, 128, 2368, 2368, 128, 128] + - [17, 9938.0] + - - [6784, 1408, 1, 128, 6784, 6784, 128, 128] + - [35, 11071.0] + - - [4288, 5888, 1, 256, 4288, 4288, 256, 256] + - [20, 11699.0] + - - [1408, 5056, 1, 256, 1408, 1408, 256, 256] + - [28, 11303.0] + - - [4288, 3584, 1, 128, 4288, 4288, 128, 128] + - [35, 11111.0] + - - [3584, 5056, 1, 1280, 3584, 3584, 1280, 1280] + - [14, 12132.0] + - - [1856, 1024, 1, 128, 1856, 1856, 128, 128] + - [9, 9593.0] + - - [1024, 24000, 1, 1536, 1024, 1024, 1536, 1536] + - [14, 12213.0] + - - [704, 4288, 1, 256, 704, 704, 256, 256] + - [9, 9333.0] + - - [5888, 2368, 1, 1280, 5888, 5888, 1280, 1280] + - [12, 11807.0] + - - [2368, 5888, 1, 1280, 2368, 2368, 1280, 1280] + - [49, 11944.0] + - - [5888, 256, 1, 1280, 5888, 5888, 1280, 1280] + - [12, 10347.0] + - - [2368, 1856, 1, 3328, 2368, 2368, 3328, 3328] + - [28, 10823.0] + - - [2944, 704, 1, 256, 2944, 2944, 256, 256] + - [45, 9498.0] + - - [2368, 1024, 1, 3328, 2368, 2368, 3328, 3328] + - [14, 11221.0] + - - [704, 3584, 1, 256, 704, 704, 256, 256] + - [9, 9375.0] + - - [704, 2944, 1, 3328, 704, 704, 3328, 3328] + - [12, 10582.0] + - - [6784, 1024, 1, 128, 6784, 6784, 128, 128] + - [26, 10802.0] + - - [2944, 1024, 1, 3328, 2944, 2944, 3328, 3328] + - [30, 11251.0] + - - [2944, 5056, 1, 128, 2944, 2944, 128, 128] + - [17, 11141.0] + - - [1408, 6784, 1, 256, 1408, 1408, 256, 256] + - [38, 11465.0] + - - [6784, 1408, 1, 3328, 6784, 6784, 3328, 3328] + - [12, 11891.0] + - - [4288, 6784, 1, 128, 4288, 4288, 128, 128] + - [35, 11439.0] + - - [6784, 2944, 1, 1280, 6784, 6784, 1280, 1280] + - [47, 12161.0] + - - [4288, 1856, 1, 128, 4288, 4288, 128, 128] + - [17, 10330.0] + - - [1856, 2944, 1, 128, 1856, 1856, 128, 128] + - [1, 10303.0] + - - [6784, 448, 1, 128, 6784, 6784, 128, 128] + - [19, 9119.0] + - - [448, 5056, 1, 1280, 448, 448, 1280, 1280] + - [14, 10313.0] + - - [2368, 1856, 1, 128, 2368, 2368, 128, 128] + - [35, 9689.0] + - - [4288, 704, 1, 256, 4288, 4288, 256, 256] + - [20, 9397.0] + - - [5888, 704, 1, 256, 5888, 5888, 256, 256] + - [12, 10176.0] + - - [3584, 1024, 1, 128, 3584, 3584, 128, 128] + - [28, 10172.0] + - - [256, 5888, 1, 3328, 256, 256, 3328, 3328] + - [12, 10608.0] + - - [1408, 4288, 1, 3328, 1408, 1408, 3328, 3328] + - [47, 11774.0] + - - [6784, 4288, 1, 256, 6784, 6784, 256, 256] + - [12, 11775.0] + - - [5888, 256, 1, 256, 5888, 5888, 256, 256] + - [26, 9402.0] + - - [6784, 1024, 1, 1280, 6784, 6784, 1280, 1280] + - [30, 11897.0] + - - [5888, 1024, 1, 128, 5888, 5888, 128, 128] + - [20, 10894.0] + - - [6784, 3584, 1, 1280, 6784, 6784, 1280, 1280] + - [30, 12192.0] + - - [1024, 6784, 1, 1280, 1024, 1024, 1280, 1280] + - [12, 11693.0] + - - [1408, 2944, 1, 1280, 1408, 1408, 1280, 1280] + - [28, 11665.0] + - - [2048, 800, 1, 2048, 2048, 2048, 2048, 2048] + - [12, 9649.0] + - - [1408, 2368, 1, 3328, 1408, 1408, 3328, 3328] + - [47, 11112.0] + - - [2944, 1856, 1, 128, 2944, 2944, 128, 128] + - [17, 10300.0] + - - [256, 6784, 1, 128, 256, 256, 128, 128] + - [24, 9103.0] + - - [5056, 6784, 1, 128, 5056, 5056, 128, 128] + - [12, 11325.0] + - - [4288, 5056, 1, 128, 4288, 4288, 128, 128] + - [44, 11151.0] + - - [1856, 5888, 1, 128, 1856, 1856, 128, 128] + - [18, 10678.0] + - - [2944, 5888, 1, 256, 2944, 2944, 256, 256] + - [38, 11947.0] + - - [3584, 1856, 1, 256, 3584, 3584, 256, 256] + - [12, 11053.0] + - - [4288, 3584, 1, 1280, 4288, 4288, 1280, 1280] + - [49, 12056.0] + - - [704, 5888, 1, 128, 704, 704, 128, 128] + - [9, 9588.0] + - - [6784, 3584, 1, 128, 6784, 6784, 128, 128] + - [35, 11667.0] + - - [4288, 5056, 1, 3328, 4288, 4288, 3328, 3328] + - [14, 12121.0] + - - [1408, 1408, 1, 128, 1408, 1408, 128, 128] + - [9, 9141.0] + - - [5056, 2368, 1, 256, 5056, 5056, 256, 256] + - [4, 11299.0] + - - [4288, 704, 1, 3328, 4288, 4288, 3328, 3328] + - [28, 10071.0] + - - [448, 3584, 1, 256, 448, 448, 256, 256] + - [28, 8779.0] + - - [2368, 1024, 1, 1280, 2368, 2368, 1280, 1280] + - [49, 10974.0] + - - [2944, 1408, 1, 3328, 2944, 2944, 3328, 3328] + - [28, 11728.0] + - - [1024, 1408, 1, 3328, 1024, 1024, 3328, 3328] + - [12, 10198.0] + - - [2944, 5888, 1, 1280, 2944, 2944, 1280, 1280] + - [28, 12207.0] + - - [5888, 3584, 1, 256, 5888, 5888, 256, 256] + - [20, 11924.0] + - - [2368, 5056, 1, 128, 2368, 2368, 128, 128] + - [17, 10849.0] + - - [1408, 1856, 1, 3328, 1408, 1408, 3328, 3328] + - [12, 10561.0] + - - [6784, 1408, 1, 1280, 6784, 6784, 1280, 1280] + - [12, 11849.0] + - - [4096, 7000, 1, 4096, 4096, 4096, 4096, 4096] + - [14, 11986.0] + - - [704, 2944, 1, 256, 704, 704, 256, 256] + - [26, 9605.0] + - - [6784, 5888, 1, 3328, 6784, 6784, 3328, 3328] + - [22, 12374.0] + - - [2368, 4288, 1, 128, 2368, 2368, 128, 128] + - [35, 10671.0] + - - [1024, 6784, 1, 128, 1024, 1024, 128, 128] + - [4, 10959.0] + - - [1408, 1408, 1, 1280, 1408, 1408, 1280, 1280] + - [28, 10164.0] + - - [16384, 400, 1, 4096, 16384, 16384, 4096, 4096] + - [49, 9366.0] + - - [448, 4288, 1, 3328, 448, 448, 3328, 3328] + - [12, 9787.0] + - - [2368, 1408, 1, 256, 2368, 2368, 256, 256] + - [4, 10239.0] + - - [5888, 5056, 1, 128, 5888, 5888, 128, 128] + - [4, 11406.0] + - - [704, 2368, 1, 256, 704, 704, 256, 256] + - [26, 8954.0] + - - [1024, 24000, 1, 2560, 1024, 1024, 2560, 2560] + - [14, 12278.0] + - - [5888, 2368, 1, 3328, 5888, 5888, 3328, 3328] + - [12, 11876.0] + - - [5124, 9124, 1, 1760, 5124, 5124, 1760, 1760] + - [17, 12187.0] + - - [4288, 448, 1, 1280, 4288, 4288, 1280, 1280] + - [28, 9581.0] + - - [5888, 704, 1, 3328, 5888, 5888, 3328, 3328] + - [30, 11113.0] + - - [5056, 256, 1, 128, 5056, 5056, 128, 128] + - [9, 9113.0] + - - [1408, 5888, 1, 128, 1408, 1408, 128, 128] + - [44, 11156.0] + - - [7680, 12000, 1, 2560, 7680, 7680, 2560, 2560] + - [6, 12454.0] + - - [1408, 1024, 1, 256, 1408, 1408, 256, 256] + - [9, 9154.0] + - - [8192, 400, 1, 2048, 8192, 8192, 2048, 2048] + - [12, 9215.0] + - - [1024, 1856, 1, 128, 1024, 1024, 128, 128] + - [7, 8859.0] + - - [5056, 6784, 1, 1280, 5056, 5056, 1280, 1280] + - [20, 12031.0] + - - [704, 5056, 1, 3328, 704, 704, 3328, 3328] + - [49, 11098.0] + - - [2368, 2944, 1, 3328, 2368, 2368, 3328, 3328] + - [47, 11704.0] + - - [2368, 3584, 1, 256, 2368, 2368, 256, 256] + - [12, 11053.0] + - - [5056, 3584, 1, 1280, 5056, 5056, 1280, 1280] + - [22, 12171.0] + - - [5124, 9124, 1, 4096, 5124, 5124, 4096, 4096] + - [30, 11993.0] + - - [7680, 48000, 1, 2560, 7680, 7680, 2560, 2560] + - [6, 12491.0] + - - [1856, 2944, 1, 1280, 1856, 1856, 1280, 1280] + - [47, 11161.0] + - - [1024, 48000, 1, 2816, 1024, 1024, 2816, 2816] + - [6, 12374.0] + - - [2944, 1408, 1, 256, 2944, 2944, 256, 256] + - [20, 10933.0] + - - [4288, 1408, 1, 3328, 4288, 4288, 3328, 3328] + - [47, 11761.0] + - - [5888, 2944, 1, 128, 5888, 5888, 128, 128] + - [20, 11484.0] + - - [2944, 1024, 1, 128, 2944, 2944, 128, 128] + - [18, 9997.0] + - - [4288, 5056, 1, 1280, 4288, 4288, 1280, 1280] + - [30, 12035.0] + - - [5888, 6784, 1, 1280, 5888, 5888, 1280, 1280] + - [47, 12238.0] + - - [6784, 5056, 1, 128, 6784, 6784, 128, 128] + - [4, 11509.0] + - - [1760, 1600, 1, 1760, 1760, 1760, 1760, 1760] + - [35, 10549.0] + - - [5888, 1408, 1, 3328, 5888, 5888, 3328, 3328] + - [12, 11896.0] + - - [2368, 1856, 1, 256, 2368, 2368, 256, 256] + - [20, 10062.0] + - - [256, 5056, 1, 256, 256, 256, 256, 256] + - [45, 9885.0] + - - [448, 3584, 1, 3328, 448, 448, 3328, 3328] + - [22, 9849.0] + - - [704, 2368, 1, 128, 704, 704, 128, 128] + - [26, 8481.0] + - - [5888, 256, 1, 128, 5888, 5888, 128, 128] + - [26, 9170.0] + - - [3584, 1856, 1, 128, 3584, 3584, 128, 128] + - [12, 10463.0] + - - [4288, 4288, 1, 128, 4288, 4288, 128, 128] + - [35, 11222.0] + - - [1856, 1024, 1, 3328, 1856, 1856, 3328, 3328] + - [14, 11611.0] + - - [1024, 5056, 1, 256, 1024, 1024, 256, 256] + - [20, 10867.0] + - - [5888, 5888, 1, 1280, 5888, 5888, 1280, 1280] + - [22, 12420.0] + - - [5056, 5888, 1, 128, 5056, 5056, 128, 128] + - [4, 11358.0] + - - [2368, 1408, 1, 3328, 2368, 2368, 3328, 3328] + - [28, 11089.0] + - - [1024, 48000, 1, 1536, 1024, 1024, 1536, 1536] + - [14, 12305.0] + - - [5888, 448, 1, 256, 5888, 5888, 256, 256] + - [11, 9052.0] + - - [2560, 3200, 1, 2560, 2560, 2560, 2560, 2560] + - [28, 12137.0] + - - [5888, 6784, 1, 128, 5888, 5888, 128, 128] + - [4, 11759.0] + - - [6144, 48000, 1, 2048, 6144, 6144, 2048, 2048] + - [14, 12451.0] + - - [6784, 5056, 1, 1280, 6784, 6784, 1280, 1280] + - [6, 12121.0] + - - [5056, 704, 1, 1280, 5056, 5056, 1280, 1280] + - [12, 10649.0] + - - [1024, 48000, 1, 2560, 1024, 1024, 2560, 2560] + - [14, 12343.0] + - - [1024, 2368, 1, 128, 1024, 1024, 128, 128] + - [26, 9150.0] + - - [16384, 800, 1, 4096, 16384, 16384, 4096, 4096] + - [47, 10457.0] + - - [5888, 5056, 1, 1280, 5888, 5888, 1280, 1280] + - [6, 12213.0] + - - [3072, 48000, 1, 1024, 3072, 3072, 1024, 1024] + - [30, 12326.0] + - - [6784, 1408, 1, 256, 6784, 6784, 256, 256] + - [38, 11477.0] + - - [3584, 5888, 1, 128, 3584, 3584, 128, 128] + - [38, 11553.0] + - - [5056, 5888, 1, 256, 5056, 5056, 256, 256] + - [20, 11743.0] + - - [2368, 1024, 1, 256, 2368, 2368, 256, 256] + - [26, 9763.0] + - - [2944, 1856, 1, 256, 2944, 2944, 256, 256] + - [20, 10742.0] + - - [1856, 6784, 1, 1280, 1856, 1856, 1280, 1280] + - [47, 11625.0] + - - [4288, 3584, 1, 256, 4288, 4288, 256, 256] + - [20, 11498.0] + - - [6784, 448, 1, 3328, 6784, 6784, 3328, 3328] + - [28, 10056.0] + - - [5056, 1856, 1, 1280, 5056, 5056, 1280, 1280] + - [28, 11495.0] + - - [1408, 1024, 1, 3328, 1408, 1408, 3328, 3328] + - [28, 10166.0] + - - [5888, 3584, 1, 1280, 5888, 5888, 1280, 1280] + - [14, 12155.0] + - - [1856, 3584, 1, 3328, 1856, 1856, 3328, 3328] + - [47, 11642.0] + - - [1024, 2944, 1, 256, 1024, 1024, 256, 256] + - [12, 10099.0] + - - [448, 6784, 1, 1280, 448, 448, 1280, 1280] + - [28, 9951.0] + - - [704, 5056, 1, 256, 704, 704, 256, 256] + - [26, 9896.0] + - - [3584, 1024, 1, 3328, 3584, 3584, 3328, 3328] + - [47, 11258.0] + - - [2944, 1856, 1, 1280, 2944, 2944, 1280, 1280] + - [12, 11156.0] + - - [5056, 256, 1, 256, 5056, 5056, 256, 256] + - [9, 10170.0] + - - [2944, 4288, 1, 3328, 2944, 2944, 3328, 3328] + - [30, 12017.0] + - - [2368, 3584, 1, 3328, 2368, 2368, 3328, 3328] + - [12, 11658.0] + - - [2944, 704, 1, 1280, 2944, 2944, 1280, 1280] + - [47, 10328.0] + - - [2944, 3584, 1, 1280, 2944, 2944, 1280, 1280] + - [12, 11931.0] + - - [1856, 5888, 1, 1280, 1856, 1856, 1280, 1280] + - [47, 11625.0] + - - [2048, 3200, 1, 2048, 2048, 2048, 2048, 2048] + - [12, 12093.0] + - - [4288, 1408, 1, 256, 4288, 4288, 256, 256] + - [28, 11137.0] + - - [5888, 1408, 1, 128, 5888, 5888, 128, 128] + - [38, 11156.0] + - - [4288, 2368, 1, 1280, 4288, 4288, 1280, 1280] + - [12, 11471.0] + - - [6784, 2368, 1, 256, 6784, 6784, 256, 256] + - [20, 11475.0] + - - [1024, 24000, 1, 2816, 1024, 1024, 2816, 2816] + - [6, 12322.0] + - - [7680, 5984, 1, 2560, 7680, 7680, 2560, 2560] + - [28, 12243.0] + - - [4288, 1856, 1, 256, 4288, 4288, 256, 256] + - [20, 10853.0] + - - [1856, 2944, 1, 256, 1856, 1856, 256, 256] + - [38, 10658.0] + - - [5056, 1024, 1, 128, 5056, 5056, 128, 128] + - [26, 10682.0] + - - [1760, 800, 1, 1760, 1760, 1760, 1760, 1760] + - [35, 10081.0] + - - [6784, 256, 1, 128, 6784, 6784, 128, 128] + - [18, 9156.0] + - - [5888, 704, 1, 128, 5888, 5888, 128, 128] + - [38, 9581.0] + - - [1408, 2368, 1, 128, 1408, 1408, 128, 128] + - [18, 9906.0] + - - [1024, 4288, 1, 1280, 1024, 1024, 1280, 1280] + - [14, 11457.0] + - - [2368, 5056, 1, 3328, 2368, 2368, 3328, 3328] + - [28, 11688.0] + - - [4288, 1024, 1, 3328, 4288, 4288, 3328, 3328] + - [28, 11538.0] + - - [6144, 48000, 1, 2560, 6144, 6144, 2560, 2560] + - [6, 12490.0] + - - [1024, 5056, 1, 3328, 1024, 1024, 3328, 3328] + - [30, 12098.0] + - - [1024, 1856, 1, 3328, 1024, 1024, 3328, 3328] + - [28, 11225.0] + - - [4288, 6784, 1, 256, 4288, 4288, 256, 256] + - [12, 11739.0] + - - [3584, 2944, 1, 3328, 3584, 3584, 3328, 3328] + - [12, 11967.0] + - - [5888, 2944, 1, 256, 5888, 5888, 256, 256] + - [20, 11917.0] + - - [448, 4288, 1, 1280, 448, 448, 1280, 1280] + - [47, 9628.0] + - - [1024, 4288, 1, 128, 1024, 1024, 128, 128] + - [20, 9785.0] + - - [5056, 4288, 1, 256, 5056, 5056, 256, 256] + - [12, 11560.0] + - - [1024, 3584, 1, 256, 1024, 1024, 256, 256] + - [12, 10483.0] + - - [6784, 6784, 1, 3328, 6784, 6784, 3328, 3328] + - [20, 12288.0] + - - [448, 5888, 1, 1280, 448, 448, 1280, 1280] + - [14, 9658.0] + - - [5056, 448, 1, 128, 5056, 5056, 128, 128] + - [43, 9117.0] + - - [4288, 704, 1, 1280, 4288, 4288, 1280, 1280] + - [12, 9997.0] + - - [3584, 2944, 1, 128, 3584, 3584, 128, 128] + - [28, 11189.0] + - - [6784, 256, 1, 1280, 6784, 6784, 1280, 1280] + - [28, 10239.0] + - - [2368, 5888, 1, 3328, 2368, 2368, 3328, 3328] + - [30, 12060.0] + - - [2368, 1856, 1, 1280, 2368, 2368, 1280, 1280] + - [12, 10715.0] + - - [448, 5056, 1, 3328, 448, 448, 3328, 3328] + - [30, 10526.0] + - - [3584, 4288, 1, 128, 3584, 3584, 128, 128] + - [28, 11072.0] + - - [5888, 4288, 1, 3328, 5888, 5888, 3328, 3328] + - [40, 12092.0] + - - [2368, 704, 1, 256, 2368, 2368, 256, 256] + - [26, 8810.0] + - - [3584, 1408, 1, 128, 3584, 3584, 128, 128] + - [20, 10687.0] + - - [1856, 5056, 1, 1280, 1856, 1856, 1280, 1280] + - [28, 11591.0] + - - [2944, 1024, 1, 1280, 2944, 2944, 1280, 1280] + - [30, 11122.0] + - - [3584, 5888, 1, 3328, 3584, 3584, 3328, 3328] + - [6, 12240.0] + - - [2368, 4288, 1, 256, 2368, 2368, 256, 256] + - [12, 11016.0] + - - [1024, 2368, 1, 3328, 1024, 1024, 3328, 3328] + - [14, 11226.0] + - - [1024, 3584, 1, 1280, 1024, 1024, 1280, 1280] + - [14, 11261.0] + - - [4288, 5888, 1, 3328, 4288, 4288, 3328, 3328] + - [49, 12077.0] + - - [1024, 2944, 1, 3328, 1024, 1024, 3328, 3328] + - [30, 11219.0] + - - [6784, 1856, 1, 256, 6784, 6784, 256, 256] + - [12, 11321.0] + - - [256, 6784, 1, 1280, 256, 256, 1280, 1280] + - [14, 10283.0] + - - [1856, 3584, 1, 256, 1856, 1856, 256, 256] + - [28, 11049.0] + - - [6784, 1856, 1, 128, 6784, 6784, 128, 128] + - [38, 10955.0] + - - [512, 24000, 1, 2816, 512, 512, 2816, 2816] + - [30, 12238.0] + - - [256, 5888, 1, 1280, 256, 256, 1280, 1280] + - [12, 10384.0] + - - [16384, 1600, 1, 4096, 16384, 16384, 4096, 4096] + - [47, 11342.0] + - - [2368, 1408, 1, 128, 2368, 2368, 128, 128] + - [18, 9664.0] + - - [1408, 1024, 1, 128, 1408, 1408, 128, 128] + - [36, 9038.0] + - - [6784, 3584, 1, 3328, 6784, 6784, 3328, 3328] + - [14, 12233.0] + - - [1760, 7000, 1, 1760, 1760, 1760, 1760, 1760] + - [17, 12105.0] + - - [2368, 5056, 1, 1280, 2368, 2368, 1280, 1280] + - [28, 11653.0] + - - [1408, 2368, 1, 1280, 1408, 1408, 1280, 1280] + - [28, 10961.0] + - - [704, 4288, 1, 128, 704, 704, 128, 128] + - [26, 8953.0] + - - [2944, 2944, 1, 256, 2944, 2944, 256, 256] + - [20, 11629.0] + - - [6784, 256, 1, 256, 6784, 6784, 256, 256] + - [47, 9632.0] + - - [256, 5056, 1, 3328, 256, 256, 3328, 3328] + - [30, 11519.0] + - - [5056, 1856, 1, 128, 5056, 5056, 128, 128] + - [35, 10529.0] + - - [5056, 1024, 1, 3328, 5056, 5056, 3328, 3328] + - [14, 11904.0] + - - [4288, 3584, 1, 3328, 4288, 4288, 3328, 3328] + - [49, 12126.0] + - - [1024, 2368, 1, 1280, 1024, 1024, 1280, 1280] + - [30, 10680.0] + - - [5888, 6784, 1, 3328, 5888, 5888, 3328, 3328] + - [12, 12294.0] + - - [704, 4288, 1, 1280, 704, 704, 1280, 1280] + - [47, 9908.0] + - - [1024, 48000, 1, 2048, 1024, 1024, 2048, 2048] + - [30, 12306.0] + - - [4288, 1024, 1, 128, 4288, 4288, 128, 128] + - [9, 10271.0] + - - [4096, 512, 1, 32, 4096, 4096, 32, 32] + - [43, 5897.0] + - - [2048, 1024, 1, 1664, 2048, 2048, 1664, 1664] + - [4, 10770.0] + - - [4096, 512, 1, 1408, 4096, 4096, 1408, 1408] + - [25, 10838.0] + - - [4096, 1024, 1, 1280, 4096, 4096, 1280, 1280] + - [28, 11774.0] + - - [2048, 1024, 1, 640, 2048, 2048, 640, 640] + - [25, 10622.0] + - - [4096, 1024, 1, 13312, 4096, 4096, 13312, 13312] + - [47, 11158.0] + - - [2048, 1024, 1, 13312, 2048, 2048, 13312, 13312] + - [47, 10319.0] + - - [2048, 1024, 1, 3584, 2048, 2048, 3584, 3584] + - [12, 10785.0] + - - [4096, 1024, 1, 1920, 4096, 4096, 1920, 1920] + - [20, 11855.0] + - - [4096, 1024, 1, 12288, 4096, 4096, 12288, 12288] + - [47, 10683.0] + - - [4096, 1024, 1, 8320, 4096, 4096, 8320, 8320] + - [20, 11898.0] + - - [4096, 1024, 1, 15360, 4096, 4096, 15360, 15360] + - [47, 10863.0] + - - [4096, 512, 1, 3072, 4096, 4096, 3072, 3072] + - [12, 10786.0] + - - [4096, 512, 1, 13312, 4096, 4096, 13312, 13312] + - [28, 9599.0] + - - [4096, 1024, 1, 3840, 4096, 4096, 3840, 3840] + - [28, 11832.0] + - - [2048, 1024, 1, 3200, 2048, 2048, 3200, 3200] + - [4, 10820.0] + - - [4096, 512, 1, 3840, 4096, 4096, 3840, 3840] + - [28, 10823.0] + - - [4096, 512, 1, 5632, 4096, 4096, 5632, 5632] + - [12, 10819.0] + - - [4096, 512, 1, 64, 4096, 4096, 64, 64] + - [24, 8125.0] + - - [2048, 1024, 1, 512, 2048, 2048, 512, 512] + - [12, 10417.0] + - - [4096, 512, 1, 8192, 4096, 4096, 8192, 8192] + - [50, 9040.0] + - - [4096, 512, 1, 2304, 4096, 4096, 2304, 2304] + - [28, 10757.0] + - - [4096, 512, 1, 2816, 4096, 4096, 2816, 2816] + - [12, 10812.0] + - - [2048, 1024, 1, 7680, 2048, 2048, 7680, 7680] + - [12, 10665.0] + - - [4096, 512, 1, 1920, 4096, 4096, 1920, 1920] + - [25, 10760.0] + - - [4096, 1024, 1, 32, 4096, 4096, 32, 32] + - [7, 6560.0] + - - [4096, 512, 1, 16640, 4096, 4096, 16640, 16640] + - [28, 10830.0] + - - [2048, 1024, 1, 1024, 2048, 2048, 1024, 1024] + - [28, 10495.0] + - - [4096, 512, 1, 1792, 4096, 4096, 1792, 1792] + - [47, 10756.0] + - - [4096, 1024, 1, 8192, 4096, 4096, 8192, 8192] + - [49, 9929.0] + - - [2048, 1024, 1, 4160, 2048, 2048, 4160, 4160] + - [17, 11107.0] + - - [4096, 512, 1, 10240, 4096, 4096, 10240, 10240] + - [47, 9967.0] + - - [4096, 512, 1, 512, 4096, 4096, 512, 512] + - [47, 10170.0] + - - [2048, 1024, 1, 6656, 2048, 2048, 6656, 6656] + - [12, 10849.0] + - - [2048, 1024, 1, 14336, 2048, 2048, 14336, 14336] + - [47, 10283.0] + - - [4096, 512, 1, 11264, 4096, 4096, 11264, 11264] + - [28, 9926.0] + - - [4096, 512, 1, 128, 4096, 4096, 128, 128] + - [18, 9218.0] + - - [4096, 512, 1, 768, 4096, 4096, 768, 768] + - [47, 10605.0] + - - [4096, 1024, 1, 11264, 4096, 4096, 11264, 11264] + - [47, 11132.0] + - - [4096, 1024, 1, 16640, 4096, 4096, 16640, 16640] + - [47, 11856.0] + - - [2048, 1024, 1, 5632, 2048, 2048, 5632, 5632] + - [12, 10825.0] + - - [4096, 512, 1, 12288, 4096, 4096, 12288, 12288] + - [28, 9385.0] + - - [4096, 1024, 1, 5632, 4096, 4096, 5632, 5632] + - [12, 11830.0] + - - [2048, 1024, 1, 10240, 2048, 2048, 10240, 10240] + - [12, 10496.0] + - - [4096, 1024, 1, 640, 4096, 4096, 640, 640] + - [25, 11607.0] + - - [2048, 1024, 1, 12288, 2048, 2048, 12288, 12288] + - [47, 9771.0] + - - [4096, 1024, 1, 10240, 4096, 4096, 10240, 10240] + - [47, 10926.0] + - - [2048, 1024, 1, 4608, 2048, 2048, 4608, 4608] + - [12, 10797.0] + - - [4096, 512, 1, 3584, 4096, 4096, 3584, 3584] + - [12, 10801.0] + - - [4096, 1024, 1, 4608, 4096, 4096, 4608, 4608] + - [12, 11805.0] + - - [4096, 1024, 1, 3328, 4096, 4096, 3328, 3328] + - [28, 11818.0] + - - [2048, 1024, 1, 9216, 2048, 2048, 9216, 9216] + - [12, 10621.0] + - - [2048, 1024, 1, 2304, 2048, 2048, 2304, 2304] + - [12, 10729.0] + - - [4096, 512, 1, 6144, 4096, 4096, 6144, 6144] + - [12, 10574.0] + - - [4096, 512, 1, 15360, 4096, 4096, 15360, 15360] + - [47, 9423.0] + - - [4096, 1024, 1, 7168, 4096, 4096, 7168, 7168] + - [12, 10752.0] + - - [4096, 1024, 1, 9216, 4096, 4096, 9216, 9216] + - [49, 10826.0] + - - [4096, 1024, 1, 7680, 4096, 4096, 7680, 7680] + - [47, 11503.0] + - - [2048, 1024, 1, 8192, 2048, 2048, 8192, 8192] + - [12, 9406.0] + - - [4096, 1024, 1, 64, 4096, 4096, 64, 64] + - [2, 9156.0] + - - [2048, 1024, 1, 1280, 2048, 2048, 1280, 1280] + - [12, 10684.0] + - - [2048, 1024, 1, 3328, 2048, 2048, 3328, 3328] + - [12, 10832.0] + - - [4096, 512, 1, 14336, 4096, 4096, 14336, 14336] + - [47, 10075.0] + - - [4096, 512, 1, 8320, 4096, 4096, 8320, 8320] + - [28, 10858.0] + - - [4096, 1024, 1, 6656, 4096, 4096, 6656, 6656] + - [12, 11813.0] + - - [2048, 1024, 1, 256, 2048, 2048, 256, 256] + - [9, 9761.0] + - - [4096, 512, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 10617.0] + - - [4096, 1024, 1, 1536, 4096, 4096, 1536, 1536] + - [28, 11797.0] + - - [2048, 1024, 1, 32, 2048, 2048, 32, 32] + - [43, 5678.0] + - - [4096, 512, 1, 640, 4096, 4096, 640, 640] + - [44, 10649.0] + - - [4096, 512, 1, 16384, 4096, 4096, 16384, 16384] + - [32, 9161.0] + - - [4096, 1024, 1, 512, 4096, 4096, 512, 512] + - [28, 11379.0] + - - [2048, 1024, 1, 1152, 2048, 2048, 1152, 1152] + - [20, 10715.0] + - - [4096, 1024, 1, 2080, 4096, 4096, 2080, 2080] + - [17, 12141.0] + - - [4096, 1024, 1, 768, 4096, 4096, 768, 768] + - [28, 11656.0] + - - [4096, 1024, 1, 2560, 4096, 4096, 2560, 2560] + - [12, 11798.0] + - - [2048, 1024, 1, 64, 2048, 2048, 64, 64] + - [0, 8410.0] + - - [4096, 1024, 1, 16384, 4096, 4096, 16384, 16384] + - [41, 9976.0] + - - [4096, 512, 1, 6656, 4096, 4096, 6656, 6656] + - [12, 10824.0] + - - [2048, 1024, 1, 128, 2048, 2048, 128, 128] + - [9, 9032.0] + - - [2048, 1024, 1, 2080, 2048, 2048, 2080, 2080] + - [17, 11102.0] + - - [2048, 1024, 1, 16640, 2048, 2048, 16640, 16640] + - [47, 10819.0] + - - [2048, 1024, 1, 3072, 2048, 2048, 3072, 3072] + - [12, 10746.0] + - - [4096, 1024, 1, 1408, 4096, 4096, 1408, 1408] + - [25, 11784.0] + - - [4096, 1024, 1, 2048, 4096, 4096, 2048, 2048] + - [12, 11781.0] + - - [2048, 1024, 1, 2560, 2048, 2048, 2560, 2560] + - [12, 10772.0] + - - [4096, 1024, 1, 128, 4096, 4096, 128, 128] + - [12, 10453.0] + - - [4096, 1024, 1, 14336, 4096, 4096, 14336, 14336] + - [47, 11165.0] + - - [4096, 512, 1, 9216, 4096, 4096, 9216, 9216] + - [28, 9872.0] + - - [2048, 1024, 1, 2048, 2048, 2048, 2048, 2048] + - [12, 10677.0] + - - [4096, 512, 1, 1536, 4096, 4096, 1536, 1536] + - [12, 10724.0] + - - [2048, 1024, 1, 16384, 2048, 2048, 16384, 16384] + - [50, 9196.0] + - - [4096, 1024, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 11611.0] + - - [4096, 1024, 1, 1664, 4096, 4096, 1664, 1664] + - [25, 11876.0] + - - [4096, 512, 1, 384, 4096, 4096, 384, 384] + - [44, 10388.0] + - - [4096, 512, 1, 3328, 4096, 4096, 3328, 3328] + - [47, 10799.0] + - - [4096, 1024, 1, 256, 4096, 4096, 256, 256] + - [28, 11215.0] + - - [2048, 1024, 1, 7168, 2048, 2048, 7168, 7168] + - [12, 10830.0] + - - [2048, 1024, 1, 1536, 2048, 2048, 1536, 1536] + - [12, 10616.0] + - - [4096, 512, 1, 7168, 4096, 4096, 7168, 7168] + - [12, 10479.0] + - - [4096, 1024, 1, 896, 4096, 4096, 896, 896] + - [25, 11784.0] + - - [4096, 1024, 1, 4096, 4096, 4096, 4096, 4096] + - [12, 11639.0] + - - [2048, 1024, 1, 6144, 2048, 2048, 6144, 6144] + - [12, 10841.0] + - - [4096, 512, 1, 4160, 4096, 4096, 4160, 4160] + - [35, 11070.0] + - - [4096, 512, 1, 2080, 4096, 4096, 2080, 2080] + - [35, 11065.0] + - - [4096, 1024, 1, 5120, 4096, 4096, 5120, 5120] + - [12, 11722.0] + - - [2048, 1024, 1, 1920, 2048, 2048, 1920, 1920] + - [20, 10723.0] + - - [2048, 1024, 1, 15360, 2048, 2048, 15360, 15360] + - [47, 9602.0] + - - [4096, 1024, 1, 2816, 4096, 4096, 2816, 2816] + - [12, 11828.0] + - - [4096, 512, 1, 256, 4096, 4096, 256, 256] + - [28, 9898.0] + - - [2048, 1024, 1, 5120, 2048, 2048, 5120, 5120] + - [12, 10822.0] + - - [2048, 1024, 1, 4096, 2048, 2048, 4096, 4096] + - [12, 10741.0] + - - [4096, 512, 1, 4608, 4096, 4096, 4608, 4608] + - [12, 10821.0] + - - [4096, 512, 1, 1664, 4096, 4096, 1664, 1664] + - [25, 10793.0] + - - [2048, 1024, 1, 896, 2048, 2048, 896, 896] + - [25, 10713.0] + - - [4096, 1024, 1, 4160, 4096, 4096, 4160, 4160] + - [17, 12131.0] + - - [2048, 1024, 1, 11264, 2048, 2048, 11264, 11264] + - [47, 10365.0] + - - [2048, 1024, 1, 384, 2048, 2048, 384, 384] + - [26, 10109.0] + - - [2048, 1024, 1, 3840, 2048, 2048, 3840, 3840] + - [28, 10830.0] + - - [4096, 512, 1, 1280, 4096, 4096, 1280, 1280] + - [47, 10707.0] + - - [4096, 1024, 1, 1152, 4096, 4096, 1152, 1152] + - [25, 11854.0] + - - [2048, 1024, 1, 1408, 2048, 2048, 1408, 1408] + - [20, 10728.0] + - - [4096, 512, 1, 896, 4096, 4096, 896, 896] + - [44, 10726.0] + - - [4096, 1024, 1, 3072, 4096, 4096, 3072, 3072] + - [12, 11811.0] + - - [2048, 1024, 1, 2816, 2048, 2048, 2816, 2816] + - [28, 10778.0] + - - [4096, 1024, 1, 1792, 4096, 4096, 1792, 1792] + - [28, 11805.0] + - - [4096, 512, 1, 1152, 4096, 4096, 1152, 1152] + - [44, 10808.0] + - - [4096, 512, 1, 7680, 4096, 4096, 7680, 7680] + - [28, 10556.0] + - - [4096, 1024, 1, 384, 4096, 4096, 384, 384] + - [25, 11323.0] + - - [2048, 1024, 1, 1792, 2048, 2048, 1792, 1792] + - [12, 10739.0] + - - [4096, 1024, 1, 3584, 4096, 4096, 3584, 3584] + - [12, 11835.0] + - - [2048, 1024, 1, 768, 2048, 2048, 768, 768] + - [28, 10435.0] + - - [2048, 1024, 1, 8320, 2048, 2048, 8320, 8320] + - [20, 10860.0] + - - [4096, 512, 1, 2048, 4096, 4096, 2048, 2048] + - [12, 10687.0] + - - [4096, 512, 1, 2560, 4096, 4096, 2560, 2560] + - [28, 10807.0] + - - [4096, 1024, 1, 2304, 4096, 4096, 2304, 2304] + - [12, 11812.0] + - - [4096, 512, 1, 5120, 4096, 4096, 5120, 5120] + - [12, 10714.0] + - - [4096, 1024, 1, 6144, 4096, 4096, 6144, 6144] + - [12, 11577.0] + - - [1024, 3392, 1, 4096, 1024, 1024, 4096, 4096] + - [30, 10841.0] + - - [1024, 3301, 1, 4096, 1024, 1024, 4096, 4096] + - [28, 10482.0] + - - [1024, 3443, 1, 4096, 1024, 1024, 4096, 4096] + - [30, 10984.0] + - - [132, 134, 480, 64, 132, 132, 64, 64] + - [7, 3878.0] + - - [162, 162, 400, 64, 162, 162, 64, 64] + - [34, 5451.0] + - - [4096, 3548, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 11961.0] + - - [4096, 2977, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 11610.0] + - - [132, 135, 480, 64, 132, 132, 64, 64] + - [16, 3906.0] + - - [1024, 2985, 1, 4096, 1024, 1024, 4096, 4096] + - [30, 11407.0] + - - [33708, 3681, 1, 1024, 33708, 33708, 1024, 1024] + - [47, 12127.0] + - - [4096, 3443, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 11917.0] + - - [1024, 3400, 1, 4096, 1024, 1024, 4096, 4096] + - [30, 10890.0] + - - [4096, 3995, 1, 1024, 4096, 4096, 1024, 1024] + - [14, 11790.0] + - - [4096, 3190, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 12067.0] + - - [4096, 3594, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 11678.0] + - - [159, 162, 400, 64, 159, 159, 64, 64] + - [16, 5395.0] + - - [1024, 3565, 1, 4096, 1024, 1024, 4096, 4096] + - [30, 11270.0] + - - [4096, 3422, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 11844.0] + - - [1024, 3214, 1, 4096, 1024, 1024, 4096, 4096] + - [28, 10235.0] + - - [33708, 3584, 1, 1024, 33708, 33708, 1024, 1024] + - [49, 12339.0] + - - [33708, 3640, 1, 1024, 33708, 33708, 1024, 1024] + - [47, 11988.0] + - - [4096, 3263, 1, 1024, 4096, 4096, 1024, 1024] + - [14, 11823.0] + - - [4096, 3296, 1, 1024, 4096, 4096, 1024, 1024] + - [14, 11950.0] + - - [1024, 3557, 1, 4096, 1024, 1024, 4096, 4096] + - [30, 11343.0] + - - [4096, 3463, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 11697.0] + - - [4096, 3528, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 11906.0] + - - [4096, 3226, 1, 1024, 4096, 4096, 1024, 1024] + - [14, 11675.0] + - - [4096, 3439, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 11913.0] + - - [1024, 3523, 1, 4096, 1024, 1024, 4096, 4096] + - [30, 11192.0] + - - [1024, 3098, 1, 4096, 1024, 1024, 4096, 4096] + - [12, 10701.0] + - - [4096, 3121, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 11796.0] + - - [33708, 3894, 1, 1024, 33708, 33708, 1024, 1024] + - [47, 12019.0] + - - [1024, 3548, 1, 4096, 1024, 1024, 4096, 4096] + - [14, 11325.0] + - - [1024, 3451, 1, 4096, 1024, 1024, 4096, 4096] + - [30, 11046.0] + - - [4096, 3353, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 11621.0] + - - [4096, 3402, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 11807.0] + - - [4096, 3939, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 11999.0] + - - [133, 133, 480, 64, 133, 133, 64, 64] + - [24, 3869.0] + - - [1024, 3559, 1, 4096, 1024, 1024, 4096, 4096] + - [30, 11294.0] + - - [1024, 2977, 1, 4096, 1024, 1024, 4096, 4096] + - [30, 11378.0] + - - [1024, 3478, 1, 4096, 1024, 1024, 4096, 4096] + - [30, 11115.0] + - - [134, 134, 480, 64, 134, 134, 64, 64] + - [34, 3942.0] + - - [1024, 3368, 1, 4096, 1024, 1024, 4096, 4096] + - [14, 10782.0] + - - [4096, 4012, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 11768.0] + - - [4096, 3486, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 11778.0] + - - [1024, 3479, 1, 4096, 1024, 1024, 4096, 4096] + - [30, 11124.0] + - - [1024, 3505, 1, 4096, 1024, 1024, 4096, 4096] + - [30, 11227.0] + - - [4096, 3381, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 11729.0] + - - [4096, 3430, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 11875.0] + - - [1024, 3554, 1, 4096, 1024, 1024, 4096, 4096] + - [14, 11250.0] + - - [4096, 3271, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 11832.0] + - - [1024, 3063, 1, 4096, 1024, 1024, 4096, 4096] + - [30, 11650.0] + - - [1024, 3209, 1, 4096, 1024, 1024, 4096, 4096] + - [12, 10308.0] + - - [4096, 3503, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 11822.0] + - - [4096, 3344, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 11587.0] + - - [1024, 3147, 1, 4096, 1024, 1024, 4096, 4096] + - [28, 11263.0] + - - [1024, 3322, 1, 4096, 1024, 1024, 4096, 4096] + - [28, 10495.0] + - - [1024, 3341, 1, 4096, 1024, 1024, 4096, 4096] + - [14, 10728.0] + - - [1024, 3516, 1, 4096, 1024, 1024, 4096, 4096] + - [30, 11254.0] + - - [1024, 3454, 1, 4096, 1024, 1024, 4096, 4096] + - [14, 11048.0] + - - [4096, 3969, 1, 1024, 4096, 4096, 1024, 1024] + - [14, 11705.0] + - - [4096, 3466, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 11703.0] + - - [1024, 3999, 1, 1024, 1024, 1024, 1024, 1024] + - [28, 11323.0] + - - [1024, 4032, 1, 1024, 1024, 1024, 1024, 1024] + - [28, 11451.0] + - - [1024, 3403, 1, 4096, 1024, 1024, 4096, 4096] + - [14, 10881.0] + - - [4096, 3361, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 11635.0] + - - [1024, 3527, 1, 4096, 1024, 1024, 4096, 4096] + - [30, 11264.0] + - - [1024, 3822, 1, 4096, 1024, 1024, 4096, 4096] + - [30, 12151.0] + - - [4096, 3315, 1, 1024, 4096, 4096, 1024, 1024] + - [14, 11987.0] + - - [232, 232, 272, 64, 232, 232, 64, 64] + - [19, 6643.0] + - - [1024, 3336, 1, 4096, 1024, 1024, 4096, 4096] + - [30, 10692.0] + - - [228, 232, 272, 64, 228, 228, 64, 64] + - [37, 6394.0] + - - [4096, 3547, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 11959.0] + - - [4096, 3340, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 11596.0] + - - [1024, 3906, 1, 1024, 1024, 1024, 1024, 1024] + - [28, 11082.0] + - - [1024, 3295, 1, 4096, 1024, 1024, 4096, 4096] + - [28, 10630.0] + - - [4096, 3294, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 10219.0] + - - [33708, 3968, 1, 1024, 33708, 33708, 1024, 1024] + - [47, 10909.0] + - - [1024, 3473, 1, 4096, 1024, 1024, 4096, 4096] + - [49, 8712.0] + - - [1024, 3072, 1, 4096, 1024, 1024, 4096, 4096] + - [30, 10754.0] + - - [4096, 3189, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 10324.0] + - - [4096, 3494, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 10132.0] + - - [1024, 3522, 1, 4096, 1024, 1024, 4096, 4096] + - [32, 8806.0] + - - [33708, 3944, 1, 1024, 33708, 33708, 1024, 1024] + - [47, 10803.0] + - - [135, 135, 480, 64, 135, 135, 64, 64] + - [16, 3384.0] + - - [4096, 3421, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 10304.0] + - - [4096, 3311, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 10519.0] + - - [1024, 3990, 1, 1024, 1024, 1024, 1024, 1024] + - [28, 10645.0] + - - [1024, 3290, 1, 4096, 1024, 1024, 4096, 4096] + - [30, 9175.0] + - - [4096, 3565, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 10065.0] + - - [1024, 3484, 1, 4096, 1024, 1024, 4096, 4096] + - [30, 8923.0] + - - [4096, 3384, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 10096.0] + - - [1024, 3422, 1, 4096, 1024, 1024, 4096, 4096] + - [32, 9307.0] + - - [4096, 3681, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 9923.0] + - - [1024, 3584, 1, 1024, 1024, 1024, 1024, 1024] + - [14, 10593.0] + - - [4096, 4050, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 8379.0] + - - [1024, 3996, 1, 4096, 1024, 1024, 4096, 4096] + - [50, 7961.0] + - - [4096, 3169, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 10353.0] + - - [4096, 3538, 1, 1024, 4096, 4096, 1024, 1024] + - [14, 9580.0] + - - [1024, 3495, 1, 4096, 1024, 1024, 4096, 4096] + - [49, 8777.0] + - - [4096, 3401, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 10233.0] + - - [1024, 3560, 1, 4096, 1024, 1024, 4096, 4096] + - [30, 8887.0] + - - [133, 135, 480, 64, 133, 133, 64, 64] + - [34, 3625.0] + - - [1024, 3263, 1, 4096, 1024, 1024, 4096, 4096] + - [47, 9387.0] + - - [1024, 3870, 1, 4096, 1024, 1024, 4096, 4096] + - [50, 8292.0] + - - [4096, 3555, 1, 1024, 4096, 4096, 1024, 1024] + - [14, 9595.0] + - - [4096, 3412, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 10138.0] + - - [1024, 3296, 1, 4096, 1024, 1024, 4096, 4096] + - [28, 9207.0] + - - [1024, 3379, 1, 4096, 1024, 1024, 4096, 4096] + - [49, 9379.0] + - - [4096, 3302, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 10179.0] + - - [1024, 3490, 1, 4096, 1024, 1024, 4096, 4096] + - [30, 8682.0] + - - [1024, 3428, 1, 4096, 1024, 1024, 4096, 4096] + - [50, 9104.0] + - - [1024, 3976, 1, 4096, 1024, 1024, 4096, 4096] + - [32, 8115.0] + - - [4096, 3485, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 10127.0] + - - [4096, 3534, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 10077.0] + - - [1024, 3064, 1, 4096, 1024, 1024, 4096, 4096] + - [49, 10325.0] + - - [4096, 3216, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 10136.0] + - - [1024, 3450, 1, 4096, 1024, 1024, 4096, 4096] + - [15, 9597.0] + - - [1024, 3533, 1, 4096, 1024, 1024, 4096, 4096] + - [29, 8719.0] + - - [1024, 4030, 1, 1024, 1024, 1024, 1024, 1024] + - [28, 10603.0] + - - [1024, 3311, 1, 4096, 1024, 1024, 4096, 4096] + - [32, 9232.0] + - - [1024, 3468, 1, 4096, 1024, 1024, 4096, 4096] + - [28, 8911.0] + - - [4096, 3359, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 10161.0] + - - [4096, 3392, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 10265.0] + - - [1024, 3925, 1, 1024, 1024, 1024, 1024, 1024] + - [28, 10508.0] + - - [4096, 3233, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 10147.0] + - - [4096, 3956, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 9022.0] + - - [1024, 3463, 1, 4096, 1024, 1024, 4096, 4096] + - [49, 8978.0] + - - [1024, 3126, 1, 4096, 1024, 1024, 4096, 4096] + - [47, 9524.0] + - - [1024, 3363, 1, 4096, 1024, 1024, 4096, 4096] + - [49, 9394.0] + - - [4096, 3465, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 10140.0] + - - [33708, 3996, 1, 1024, 33708, 33708, 1024, 1024] + - [49, 10792.0] + - - [1024, 3231, 1, 4096, 1024, 1024, 4096, 4096] + - [30, 9549.0] + - - [33708, 3978, 1, 1024, 33708, 33708, 1024, 1024] + - [49, 10827.0] + - - [4096, 3476, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 10134.0] + - - [4096, 3339, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 10096.0] + - - [4096, 3452, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 10428.0] + - - [1024, 3396, 1, 4096, 1024, 1024, 4096, 4096] + - [30, 9210.0] + - - [4096, 3293, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 10156.0] + - - [1024, 3432, 1, 4096, 1024, 1024, 4096, 4096] + - [32, 9139.0] + - - [4096, 3493, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 9985.0] + - - [4096, 3350, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 10153.0] + - - [1024, 3079, 1, 4096, 1024, 1024, 4096, 4096] + - [49, 9244.0] + - - [1024, 3101, 1, 4096, 1024, 1024, 4096, 4096] + - [32, 9484.0] + - - [33708, 3939, 1, 1024, 33708, 33708, 1024, 1024] + - [47, 12085.0] + - - [4096, 3256, 1, 1024, 4096, 4096, 1024, 1024] + - [14, 11785.0] + - - [1024, 3439, 1, 4096, 1024, 1024, 4096, 4096] + - [14, 11001.0] + - - [1024, 3510, 1, 4096, 1024, 1024, 4096, 4096] + - [30, 11222.0] + - - [4096, 3900, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 11878.0] + - - [1024, 3470, 1, 4096, 1024, 1024, 4096, 4096] + - [30, 11098.0] + - - [4096, 3456, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 11931.0] + - - [4096, 3014, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 11737.0] + - - [4096, 3367, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 11678.0] + - - [4096, 3432, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 11880.0] + - - [33708, 4026, 1, 1024, 33708, 33708, 1024, 1024] + - [30, 12132.0] + - - [4096, 3273, 1, 1024, 4096, 4096, 1024, 1024] + - [14, 11855.0] + - - [4096, 3130, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 11858.0] + - - [1024, 3496, 1, 4096, 1024, 1024, 4096, 4096] + - [30, 11112.0] + - - [1024, 3995, 1, 4096, 1024, 1024, 4096, 4096] + - [14, 11006.0] + - - [1024, 3939, 1, 4096, 1024, 1024, 4096, 4096] + - [14, 10843.0] + - - [1024, 3121, 1, 4096, 1024, 1024, 4096, 4096] + - [28, 10634.0] + - - [1024, 3232, 1, 4096, 1024, 1024, 4096, 4096] + - [12, 10255.0] + - - [4096, 3147, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 11909.0] + - - [4096, 3516, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 11891.0] + - - [1024, 3969, 1, 1024, 1024, 1024, 1024, 1024] + - [28, 11203.0] + - - [1024, 3364, 1, 4096, 1024, 1024, 4096, 4096] + - [30, 10771.0] + - - [4096, 3411, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 11829.0] + - - [147, 147, 432, 64, 147, 147, 64, 64] + - [16, 4660.0] + - - [4096, 3301, 1, 1024, 4096, 4096, 1024, 1024] + - [14, 11952.0] + - - [1024, 3513, 1, 4096, 1024, 1024, 4096, 4096] + - [14, 11234.0] + - - [1024, 3469, 1, 4096, 1024, 1024, 4096, 4096] + - [14, 11100.0] + - - [1024, 3095, 1, 4096, 1024, 1024, 4096, 4096] + - [12, 10987.0] + - - [4096, 3533, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 11904.0] + - - [4096, 3390, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 11735.0] + - - [4096, 3582, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 12054.0] + - - [1024, 3956, 1, 1024, 1024, 1024, 1024, 1024] + - [28, 11221.0] + - - [4096, 3585, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 11631.0] + - - [4096, 3231, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 11709.0] + - - [1024, 3205, 1, 4096, 1024, 1024, 4096, 4096] + - [28, 10602.0] + - - [4096, 3496, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 11785.0] + - - [1024, 3143, 1, 4096, 1024, 1024, 4096, 4096] + - [28, 10933.0] + - - [1024, 3318, 1, 4096, 1024, 1024, 4096, 4096] + - [49, 10566.0] + - - [1024, 3353, 1, 4096, 1024, 1024, 4096, 4096] + - [30, 10739.0] + - - [1024, 3464, 1, 4096, 1024, 1024, 4096, 4096] + - [14, 11079.0] + - - [4096, 2736, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 11574.0] + - - [1024, 3402, 1, 4096, 1024, 1024, 4096, 4096] + - [30, 10902.0] + - - [4096, 3138, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 11877.0] + - - [1024, 3860, 1, 4096, 1024, 1024, 4096, 4096] + - [14, 10598.0] + - - [148, 148, 432, 64, 148, 148, 64, 64] + - [16, 4710.0] + - - [1024, 3539, 1, 4096, 1024, 1024, 4096, 4096] + - [30, 11310.0] + - - [4096, 3211, 1, 1024, 4096, 4096, 1024, 1024] + - [14, 11647.0] + - - [1024, 3332, 1, 4096, 1024, 1024, 4096, 4096] + - [14, 10691.0] + - - [1024, 3466, 1, 4096, 1024, 1024, 4096, 4096] + - [30, 11102.0] + - - [4096, 3475, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 11723.0] + - - [4096, 3524, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 11886.0] + - - [4096, 2985, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 11632.0] + - - [4096, 3222, 1, 1024, 4096, 4096, 1024, 1024] + - [14, 11661.0] + - - [4096, 3451, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 11933.0] + - - [1024, 3181, 1, 4096, 1024, 1024, 4096, 4096] + - [28, 10886.0] + - - [1024, 3640, 1, 4096, 1024, 1024, 4096, 4096] + - [30, 11550.0] + - - [1024, 3375, 1, 4096, 1024, 1024, 4096, 4096] + - [28, 10878.0] + - - [1024, 3550, 1, 4096, 1024, 1024, 4096, 4096] + - [30, 11355.0] + - - [1024, 4020, 1, 1024, 1024, 1024, 1024, 1024] + - [28, 11338.0] + - - [4096, 3349, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 11618.0] + - - [4096, 3398, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 11753.0] + - - [33708, 3976, 1, 1024, 33708, 33708, 1024, 1024] + - [30, 11983.0] + - - [1024, 2917, 1, 4096, 1024, 1024, 4096, 4096] + - [30, 11133.0] + - - [33708, 3910, 1, 1024, 33708, 33708, 1024, 1024] + - [47, 12042.0] + - - [4096, 3860, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 11747.0] + - - [4096, 3304, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 11941.0] + - - [1024, 3286, 1, 4096, 1024, 1024, 4096, 4096] + - [28, 10534.0] + - - [1024, 3460, 1, 4096, 1024, 1024, 4096, 4096] + - [14, 11073.0] + - - [1024, 4026, 1, 4096, 1024, 1024, 4096, 4096] + - [14, 11081.0] + - - [4096, 3471, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 11730.0] + - - [193, 193, 320, 64, 193, 193, 64, 64] + - [34, 5326.0] + - - [1024, 3894, 1, 1024, 1024, 1024, 1024, 1024] + - [28, 11094.0] + - - [1024, 3506, 1, 4096, 1024, 1024, 4096, 4096] + - [14, 11211.0] + - - [1024, 4000, 1, 1024, 1024, 1024, 1024, 1024] + - [28, 11270.0] + - - [1024, 3900, 1, 4096, 1024, 1024, 4096, 4096] + - [14, 10740.0] + - - [1024, 3445, 1, 4096, 1024, 1024, 4096, 4096] + - [14, 11032.0] + - - [4096, 3442, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 11929.0] + - - [1024, 3358, 1, 4096, 1024, 1024, 4096, 4096] + - [14, 10769.0] + - - [1024, 3211, 1, 4096, 1024, 1024, 4096, 4096] + - [28, 10376.0] + - - [4096, 3515, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 11773.0] + - - [1024, 3564, 1, 4096, 1024, 1024, 4096, 4096] + - [30, 11320.0] + - - [4096, 3057, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 11862.0] + - - [1024, 3343, 1, 4096, 1024, 1024, 4096, 4096] + - [14, 10690.0] + - - [4096, 3262, 1, 1024, 4096, 4096, 1024, 1024] + - [14, 11791.0] + - - [1024, 3518, 1, 4096, 1024, 1024, 4096, 4096] + - [30, 11256.0] + - - [33708, 3876, 1, 1024, 33708, 33708, 1024, 1024] + - [47, 11966.0] + - - [4096, 3462, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 11700.0] + - - [1024, 3265, 1, 4096, 1024, 1024, 4096, 4096] + - [28, 10381.0] + - - [4096, 3389, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 11755.0] + - - [4096, 3438, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 11903.0] + - - [1024, 3955, 1, 1024, 1024, 1024, 1024, 1024] + - [28, 11198.0] + - - [1024, 3545, 1, 4096, 1024, 1024, 4096, 4096] + - [14, 11325.0] + - - [1024, 3144, 1, 4096, 1024, 1024, 4096, 4096] + - [28, 10894.0] + - - [1024, 3417, 1, 4096, 1024, 1024, 4096, 4096] + - [14, 10938.0] + - - [4096, 3543, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 11919.0] + - - [4096, 3352, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 11628.0] + - - [33708, 3975, 1, 1024, 33708, 33708, 1024, 1024] + - [30, 11986.0] + - - [148, 147, 432, 64, 148, 148, 64, 64] + - [16, 4665.0] + - - [4096, 3137, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 11868.0] + - - [4096, 3506, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 11852.0] + - - [1024, 3975, 1, 1024, 1024, 1024, 1024, 1024] + - [28, 11243.0] + - - [1024, 3859, 1, 4096, 1024, 1024, 4096, 4096] + - [14, 10645.0] + - - [4096, 3369, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 11684.0] + - - [1024, 3434, 1, 4096, 1024, 1024, 4096, 4096] + - [14, 11004.0] + - - [1024, 3292, 1, 4096, 1024, 1024, 4096, 4096] + - [47, 10372.0] + - - [4096, 3523, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 11891.0] + - - [4096, 3380, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 11725.0] + - - [1024, 3408, 1, 4096, 1024, 1024, 4096, 4096] + - [30, 10890.0] + - - [4096, 3221, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 11666.0] + - - [4096, 3270, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 11834.0] + - - [143, 143, 432, 64, 143, 143, 64, 64] + - [43, 4420.0] + - - [1024, 3303, 1, 4096, 1024, 1024, 4096, 4096] + - [12, 10404.0] + - - [4096, 3502, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 11833.0] + - - [1024, 3222, 1, 4096, 1024, 1024, 4096, 4096] + - [28, 10367.0] + - - [4096, 2505, 1, 1024, 4096, 4096, 1024, 1024] + - [14, 11821.0] + - - [4096, 3397, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 11785.0] + - - [4096, 3562, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 12037.0] + - - [4096, 3095, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 11728.0] + - - [1024, 3226, 1, 4096, 1024, 1024, 4096, 4096] + - [47, 10367.0] + - - [177, 177, 352, 64, 177, 177, 64, 64] + - [16, 5807.0] + - - [4096, 3360, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 11660.0] + - - [1024, 3942, 1, 1024, 1024, 1024, 1024, 1024] + - [28, 11163.0] + - - [1024, 3298, 1, 4096, 1024, 1024, 4096, 4096] + - [28, 10819.0] + - - [1024, 3381, 1, 4096, 1024, 1024, 4096, 4096] + - [30, 10783.0] + - - [4096, 3314, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 11970.0] + - - [1024, 3492, 1, 4096, 1024, 1024, 4096, 4096] + - [14, 11135.0] + - - [1024, 3430, 1, 4096, 1024, 1024, 4096, 4096] + - [14, 10983.0] + - - [4096, 3977, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 11683.0] + - - [4096, 3546, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 11962.0] + - - [4096, 3640, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 11817.0] + - - [4096, 3441, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 11896.0] + - - [33708, 4059, 1, 1024, 33708, 33708, 1024, 1024] + - [30, 12233.0] + - - [1024, 3978, 1, 1024, 1024, 1024, 1024, 1024] + - [28, 11196.0] + - - [1024, 3376, 1, 4096, 1024, 1024, 4096, 4096] + - [14, 10829.0] + - - [1024, 3482, 1, 4096, 1024, 1024, 4096, 4096] + - [49, 11033.0] + - - [1024, 3563, 1, 4096, 1024, 1024, 4096, 4096] + - [30, 11362.0] + - - [4096, 4020, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 11817.0] + - - [1024, 3271, 1, 4096, 1024, 1024, 4096, 4096] + - [28, 10620.0] + - - [1024, 3291, 1, 4096, 1024, 1024, 4096, 4096] + - [28, 10315.0] + - - [1024, 3431, 1, 4096, 1024, 1024, 4096, 4096] + - [14, 10982.0] + - - [1024, 3481, 1, 4096, 1024, 1024, 4096, 4096] + - [30, 11133.0] + - - [4096, 3461, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 11693.0] + - - [1024, 3574, 1, 4096, 1024, 1024, 4096, 4096] + - [30, 11418.0] + - - [1024, 4059, 1, 1024, 1024, 1024, 1024, 1024] + - [28, 11481.0] + - - [1024, 3421, 1, 4096, 1024, 1024, 4096, 4096] + - [14, 10957.0] + - - [4096, 3224, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 11672.0] + - - [4096, 3437, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 11912.0] + - - [4096, 3168, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 11957.0] + - - [33708, 3990, 1, 1024, 33708, 33708, 1024, 1024] + - [30, 12023.0] + - - [1024, 3349, 1, 4096, 1024, 1024, 4096, 4096] + - [30, 10687.0] + - - [4096, 3335, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 11575.0] + - - [4096, 3400, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 11780.0] + - - [160, 159, 400, 64, 160, 160, 64, 64] + - [34, 5453.0] + - - [1024, 3398, 1, 4096, 1024, 1024, 4096, 4096] + - [30, 10825.0] + - - [1024, 3780, 1, 4096, 1024, 1024, 4096, 4096] + - [30, 12029.0] + - - [4096, 3098, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 11694.0] + - - [1024, 4012, 1, 4096, 1024, 1024, 4096, 4096] + - [14, 11030.0] + - - [4096, 3505, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 11803.0] + - - [4096, 3554, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 11957.0] + - - [4096, 3063, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 11920.0] + - - [1024, 3503, 1, 4096, 1024, 1024, 4096, 4096] + - [30, 11159.0] + - - [1024, 3166, 1, 4096, 1024, 1024, 4096, 4096] + - [28, 11318.0] + - - [1024, 3425, 1, 4096, 1024, 1024, 4096, 4096] + - [14, 10967.0] + - - [1024, 3344, 1, 4096, 1024, 1024, 4096, 4096] + - [28, 10786.0] + - - [4096, 3484, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 11725.0] + - - [1024, 3681, 1, 1024, 1024, 1024, 1024, 1024] + - [30, 11266.0] + - - [1024, 4050, 1, 1024, 1024, 1024, 1024, 1024] + - [28, 11487.0] + - - [4096, 3379, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 11736.0] + - - [4096, 3428, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 11891.0] + - - [1024, 3304, 1, 4096, 1024, 1024, 4096, 4096] + - [28, 10554.0] + - - [1024, 3387, 1, 4096, 1024, 1024, 4096, 4096] + - [14, 10846.0] + - - [4096, 3126, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 11796.0] + - - [1024, 3498, 1, 4096, 1024, 1024, 4096, 4096] + - [30, 11182.0] + - - [1024, 3436, 1, 4096, 1024, 1024, 4096, 4096] + - [30, 11017.0] + - - [4096, 3501, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 11832.0] + - - [4096, 3358, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 11639.0] + - - [4096, 3232, 1, 1024, 4096, 4096, 1024, 1024] + - [14, 11709.0] + - - [1024, 3585, 1, 4096, 1024, 1024, 4096, 4096] + - [14, 11427.0] + - - [4096, 3143, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 11852.0] + - - [4096, 3464, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 11670.0] + - - [1024, 3366, 1, 4096, 1024, 1024, 4096, 4096] + - [28, 10773.0] + - - [4096, 3375, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 11695.0] + - - [4096, 2917, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 11909.0] + - - [4096, 4026, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 11825.0] + - - [1024, 3277, 1, 4096, 1024, 1024, 4096, 4096] + - [28, 10232.0] + - - [1024, 3103, 1, 4096, 1024, 1024, 4096, 4096] + - [28, 10461.0] + - - [33708, 3995, 1, 1024, 33708, 33708, 1024, 1024] + - [30, 12040.0] + - - [1024, 3297, 1, 4096, 1024, 1024, 4096, 4096] + - [28, 10529.0] + - - [4096, 3545, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 11932.0] + - - [1024, 3399, 1, 4096, 1024, 1024, 4096, 4096] + - [14, 10884.0] + - - [33708, 3796, 1, 1024, 33708, 33708, 1024, 1024] + - [49, 12201.0] + - - [4096, 3292, 1, 1024, 4096, 4096, 1024, 1024] + - [14, 11904.0] + - - [33708, 3859, 1, 1024, 33708, 33708, 1024, 1024] + - [47, 11921.0] + - - [4096, 3566, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 12031.0] + - - [4096, 3894, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 11863.0] + - - [4096, 3492, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 11787.0] + - - [1024, 3977, 1, 1024, 1024, 1024, 1024, 1024] + - [28, 11216.0] + - - [1024, 3272, 1, 4096, 1024, 1024, 4096, 4096] + - [12, 10737.0] + - - [135, 134, 480, 64, 135, 135, 64, 64] + - [34, 3983.0] + - - [1024, 3355, 1, 4096, 1024, 1024, 4096, 4096] + - [14, 10774.0] + - - [4096, 3419, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 11852.0] + - - [1024, 3404, 1, 4096, 1024, 1024, 4096, 4096] + - [14, 10881.0] + - - [4096, 3999, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 11748.0] + - - [4096, 3166, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 11989.0] + - - [33708, 3840, 1, 1024, 33708, 33708, 1024, 1024] + - [30, 12357.0] + - - [4096, 4032, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 11816.0] + - - [1024, 3573, 1, 4096, 1024, 1024, 4096, 4096] + - [49, 11191.0] + - - [4096, 3366, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 11654.0] + - - [1024, 3541, 1, 4096, 1024, 1024, 4096, 4096] + - [30, 11258.0] + - - [4096, 3207, 1, 1024, 4096, 4096, 1024, 1024] + - [14, 11609.0] + - - [4096, 3272, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 11835.0] + - - [1024, 3334, 1, 4096, 1024, 1024, 4096, 4096] + - [30, 10687.0] + - - [228, 228, 272, 64, 228, 228, 64, 64] + - [3, 6415.0] + - - [4096, 3183, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 12005.0] + - - [4096, 3536, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 11936.0] + - - [1024, 4005, 1, 1024, 1024, 1024, 1024, 1024] + - [28, 11299.0] + - - [1024, 3245, 1, 4096, 1024, 1024, 4096, 4096] + - [28, 10397.0] + - - [4096, 3447, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 11941.0] + - - [1024, 3183, 1, 4096, 1024, 1024, 4096, 4096] + - [47, 10832.0] + - - [1024, 3361, 1, 4096, 1024, 1024, 4096, 4096] + - [14, 10766.0] + - - [33708, 3870, 1, 1024, 33708, 33708, 1024, 1024] + - [28, 11911.0] + - - [1024, 3321, 1, 4096, 1024, 1024, 4096, 4096] + - [28, 10618.0] + - - [1024, 3486, 1, 4096, 1024, 1024, 4096, 4096] + - [14, 11148.0] + - - [4096, 4005, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 11772.0] + - - [4096, 3410, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 11837.0] + - - [1024, 3944, 1, 1024, 1024, 1024, 1024, 1024] + - [28, 11166.0] + - - [4096, 3300, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 11939.0] + - - [4096, 3579, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 12031.0] + - - [4096, 3483, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 11756.0] + - - [4096, 3532, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 11909.0] + - - [1024, 3140, 1, 4096, 1024, 1024, 4096, 4096] + - [12, 10788.0] + - - [1024, 3372, 1, 4096, 1024, 1024, 4096, 4096] + - [14, 10753.0] + - - [1024, 3224, 1, 4096, 1024, 1024, 4096, 4096] + - [28, 10375.0] + - - [4096, 3230, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 11702.0] + - - [4096, 3427, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 11887.0] + - - [1024, 3796, 1, 1024, 1024, 1024, 1024, 1024] + - [30, 11498.0] + - - [143, 148, 432, 64, 143, 143, 64, 64] + - [7, 4564.0] + - - [1024, 3616, 1, 4096, 1024, 1024, 4096, 4096] + - [30, 11402.0] + - - [1024, 3315, 1, 4096, 1024, 1024, 4096, 4096] + - [28, 10589.0] + - - [1024, 3476, 1, 4096, 1024, 1024, 4096, 4096] + - [30, 11133.0] + - - [1024, 3509, 1, 4096, 1024, 1024, 4096, 4096] + - [30, 11240.0] + - - [4096, 3357, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 11624.0] + - - [4096, 3406, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 11814.0] + - - [1024, 3558, 1, 4096, 1024, 1024, 4096, 4096] + - [30, 11357.0] + - - [4096, 3593, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 11660.0] + - - [4096, 3247, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 11745.0] + - - [4096, 3088, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 11704.0] + - - [1024, 3213, 1, 4096, 1024, 1024, 4096, 4096] + - [14, 10031.0] + - - [4096, 3511, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 11854.0] + - - [1024, 3365, 1, 4096, 1024, 1024, 4096, 4096] + - [12, 10915.0] + - - [1024, 3504, 1, 4096, 1024, 1024, 4096, 4096] + - [14, 11224.0] + - - [1024, 3442, 1, 4096, 1024, 1024, 4096, 4096] + - [30, 11050.0] + - - [4096, 3474, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 11717.0] + - - [4096, 2984, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 11637.0] + - - [1024, 3876, 1, 4096, 1024, 1024, 4096, 4096] + - [30, 10656.0] + - - [4096, 3337, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 11575.0] + - - [4096, 3450, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 11941.0] + - - [1024, 3547, 1, 4096, 1024, 1024, 4096, 4096] + - [30, 11276.0] + - - [4096, 3291, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 11912.0] + - - [1024, 3340, 1, 4096, 1024, 1024, 4096, 4096] + - [28, 10832.0] + - - [4096, 3491, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 11758.0] + - - [4096, 3348, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 11609.0] + - - [4096, 3906, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 11854.0] + - - [1024, 3477, 1, 4096, 1024, 1024, 4096, 4096] + - [30, 11087.0] + - - [1024, 3397, 1, 4096, 1024, 1024, 4096, 4096] + - [30, 10883.0] + - - [4096, 3165, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 11986.0] + - - [4096, 3470, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 11722.0] + - - [1024, 3526, 1, 4096, 1024, 1024, 4096, 4096] + - [30, 11263.0] + - - [4096, 3365, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 11679.0] + - - [4096, 3319, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 12000.0] + - - [1024, 3401, 1, 4096, 1024, 1024, 4096, 4096] + - [14, 10887.0] + - - [1024, 3294, 1, 4096, 1024, 1024, 4096, 4096] + - [28, 10691.0] + - - [159, 159, 400, 64, 159, 159, 64, 64] + - [34, 5322.0] + - - [1024, 3472, 1, 4096, 1024, 1024, 4096, 4096] + - [14, 11129.0] + - - [4096, 3328, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 12047.0] + - - [1024, 3861, 1, 1024, 1024, 1024, 1024, 1024] + - [28, 10950.0] + - - [1024, 3910, 1, 1024, 1024, 1024, 1024, 1024] + - [28, 11125.0] + - - [1024, 3410, 1, 4096, 1024, 1024, 4096, 4096] + - [14, 10909.0] + - - [1024, 3395, 1, 4096, 1024, 1024, 4096, 4096] + - [14, 10837.0] + - - [4096, 3282, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 11888.0] + - - [1024, 3751, 1, 1024, 1024, 1024, 1024, 1024] + - [30, 11460.0] + - - [4096, 3145, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 11882.0] + - - [4096, 3514, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 11865.0] + - - [4096, 3944, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 12011.0] + - - [1024, 3515, 1, 4096, 1024, 1024, 4096, 4096] + - [30, 11230.0] + - - [4096, 3409, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 11821.0] + - - [4096, 3564, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 12035.0] + - - [4096, 3299, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 11925.0] + - - [1024, 3057, 1, 4096, 1024, 1024, 4096, 4096] + - [30, 11636.0] + - - [4096, 3531, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 11925.0] + - - [4096, 3388, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 11740.0] + - - [1024, 3189, 1, 4096, 1024, 1024, 4096, 4096] + - [28, 10632.0] + - - [1024, 3300, 1, 4096, 1024, 1024, 4096, 4096] + - [28, 10679.0] + - - [1024, 3720, 1, 4096, 1024, 1024, 4096, 4096] + - [30, 11822.0] + - - [1024, 3383, 1, 4096, 1024, 1024, 4096, 4096] + - [30, 10802.0] + - - [1024, 3494, 1, 4096, 1024, 1024, 4096, 4096] + - [30, 11189.0] + - - [1024, 3448, 1, 4096, 1024, 1024, 4096, 4096] + - [30, 11050.0] + - - [4096, 3542, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 11934.0] + - - [1024, 3488, 1, 4096, 1024, 1024, 4096, 4096] + - [49, 11031.0] + - - [4096, 3405, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 11803.0] + - - [1024, 3262, 1, 4096, 1024, 1024, 4096, 4096] + - [28, 10298.0] + - - [33708, 4005, 1, 1024, 33708, 33708, 1024, 1024] + - [30, 12063.0] + - - [1024, 3594, 1, 4096, 1024, 1024, 4096, 4096] + - [30, 11462.0] + - - [4096, 3103, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 11741.0] + - - [4096, 3136, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 11858.0] + - - [1024, 3378, 1, 4096, 1024, 1024, 4096, 4096] + - [30, 10820.0] + - - [4096, 3559, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 12001.0] + - - [4096, 3368, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 11679.0] + - - [4096, 3209, 1, 1024, 4096, 4096, 1024, 1024] + - [14, 11641.0] + - - [4096, 3322, 1, 1024, 4096, 4096, 1024, 1024] + - [14, 12032.0] + - - [1024, 3483, 1, 4096, 1024, 1024, 4096, 4096] + - [30, 11088.0] + - - [4096, 3473, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 11714.0] + - - [4096, 3522, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 11901.0] + - - [1024, 3532, 1, 4096, 1024, 1024, 4096, 4096] + - [30, 11271.0] + - - [4096, 3449, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 11939.0] + - - [1024, 3351, 1, 4096, 1024, 1024, 4096, 4096] + - [30, 10640.0] + - - [1024, 3462, 1, 4096, 1024, 1024, 4096, 4096] + - [14, 11084.0] + - - [4096, 3396, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 11771.0] + - - [132, 132, 480, 64, 132, 132, 64, 64] + - [24, 3838.0] + - - [1024, 3416, 1, 4096, 1024, 1024, 4096, 4096] + - [14, 10960.0] + - - [4096, 3469, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 11708.0] + - - [1024, 3582, 1, 4096, 1024, 1024, 4096, 4096] + - [49, 11295.0] + - - [1024, 3230, 1, 4096, 1024, 1024, 4096, 4096] + - [12, 10382.0] + - - [1024, 3489, 1, 4096, 1024, 1024, 4096, 4096] + - [30, 11149.0] + - - [1024, 3427, 1, 4096, 1024, 1024, 4096, 4096] + - [14, 10957.0] + - - [1024, 3346, 1, 4096, 1024, 1024, 4096, 4096] + - [14, 10712.0] + - - [33708, 3977, 1, 1024, 33708, 33708, 1024, 1024] + - [30, 11989.0] + - - [4096, 3796, 1, 1024, 4096, 4096, 1024, 1024] + - [14, 12032.0] + - - [4096, 3176, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 12008.0] + - - [4096, 3990, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 11723.0] + - - [1024, 3257, 1, 4096, 1024, 1024, 4096, 4096] + - [30, 9378.0] + - - [4096, 3343, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 10038.0] + - - [4096, 3440, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 10266.0] + - - [33708, 4030, 1, 1024, 33708, 33708, 1024, 1024] + - [49, 10957.0] + - - [1024, 3190, 1, 4096, 1024, 1024, 4096, 4096] + - [14, 9434.0] + - - [1024, 3389, 1, 4096, 1024, 1024, 4096, 4096] + - [32, 9175.0] + - - [1024, 3500, 1, 4096, 1024, 1024, 4096, 4096] + - [49, 9066.0] + - - [1024, 3471, 1, 4096, 1024, 1024, 4096, 4096] + - [49, 8870.0] + - - [1024, 3438, 1, 4096, 1024, 1024, 4096, 4096] + - [32, 9293.0] + - - [4096, 3513, 1, 1024, 4096, 4096, 1024, 1024] + - [14, 9669.0] + - - [1024, 3562, 1, 4096, 1024, 1024, 4096, 4096] + - [50, 8701.0] + - - [4096, 3616, 1, 1024, 4096, 4096, 1024, 1024] + - [14, 9259.0] + - - [4096, 3955, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 8466.0] + - - [1024, 3441, 1, 4096, 1024, 1024, 4096, 4096] + - [32, 9269.0] + - - [1024, 3236, 1, 4096, 1024, 1024, 4096, 4096] + - [49, 9643.0] + - - [1024, 3524, 1, 4096, 1024, 1024, 4096, 4096] + - [50, 8686.0] + - - [4096, 3460, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 10050.0] + - - [1024, 3384, 1, 4096, 1024, 1024, 4096, 4096] + - [32, 9152.0] + - - [4096, 3387, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 10272.0] + - - [4096, 3436, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 10349.0] + - - [4096, 3277, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 11152.0] + - - [1024, 3457, 1, 4096, 1024, 1024, 4096, 4096] + - [30, 8850.0] + - - [1024, 3999, 1, 4096, 1024, 1024, 4096, 4096] + - [50, 8043.0] + - - [1024, 4032, 1, 4096, 1024, 1024, 4096, 4096] + - [47, 9291.0] + - - [4096, 3541, 1, 1024, 4096, 4096, 1024, 1024] + - [20, 10835.0] + - - [4096, 3334, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 10059.0] + - - [1024, 3393, 1, 4096, 1024, 1024, 4096, 4096] + - [32, 9170.0] + - - [1024, 3411, 1, 4096, 1024, 1024, 4096, 4096] + - [32, 9217.0] + - - [1024, 3822, 1, 1024, 1024, 1024, 1024, 1024] + - [30, 10866.0] + - - [1024, 3593, 1, 4096, 1024, 1024, 4096, 4096] + - [50, 8700.0] + - - [33708, 3822, 1, 1024, 33708, 33708, 1024, 1024] + - [49, 10876.0] + - - [4096, 3504, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 9508.0] + - - [1024, 3163, 1, 4096, 1024, 1024, 4096, 4096] + - [28, 9300.0] + - - [1024, 3357, 1, 4096, 1024, 1024, 4096, 4096] + - [30, 9396.0] + - - [1024, 3906, 1, 4096, 1024, 1024, 4096, 4096] + - [50, 8162.0] + - - [4096, 3415, 1, 1024, 4096, 4096, 1024, 1024] + - [13, 9620.0] + - - [1024, 3406, 1, 4096, 1024, 1024, 4096, 4096] + - [32, 9130.0] + - - [4096, 3321, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 10338.0] + - - [4096, 3584, 1, 1024, 4096, 4096, 1024, 1024] + - [14, 9667.0] + - - [1024, 2736, 1, 4096, 1024, 1024, 4096, 4096] + - [14, 10222.0] + - - [1024, 3110, 1, 4096, 1024, 1024, 4096, 4096] + - [32, 9221.0] + - - [33708, 3999, 1, 1024, 33708, 33708, 1024, 1024] + - [49, 10768.0] + - - [1024, 3093, 1, 4096, 1024, 1024, 4096, 4096] + - [32, 9131.0] + - - [4096, 3378, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 10126.0] + - - [1024, 3543, 1, 4096, 1024, 1024, 4096, 4096] + - [50, 8704.0] + - - [33708, 3925, 1, 1024, 33708, 33708, 1024, 1024] + - [47, 10749.0] + - - [1024, 3352, 1, 4096, 1024, 1024, 4096, 4096] + - [29, 9207.0] + - - [4096, 3780, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 8985.0] + - - [1024, 3990, 1, 4096, 1024, 1024, 4096, 4096] + - [50, 8048.0] + - - [4096, 3500, 1, 1024, 4096, 4096, 1024, 1024] + - [38, 10717.0] + - - [4096, 3996, 1, 1024, 4096, 4096, 1024, 1024] + - [10, 9348.0] + - - [1024, 3247, 1, 4096, 1024, 1024, 4096, 4096] + - [28, 9146.0] + - - [4096, 3395, 1, 1024, 4096, 4096, 1024, 1024] + - [14, 9505.0] + - - [1024, 3169, 1, 4096, 1024, 1024, 4096, 4096] + - [49, 9450.0] + - - [1024, 3088, 1, 4096, 1024, 1024, 4096, 4096] + - [50, 9274.0] + - - [1024, 3584, 1, 4096, 1024, 1024, 4096, 4096] + - [32, 8707.0] + - - [4096, 3093, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 10433.0] + - - [1024, 3538, 1, 4096, 1024, 1024, 4096, 4096] + - [32, 8769.0] + - - [1024, 3996, 1, 1024, 1024, 1024, 1024, 1024] + - [28, 10617.0] + - - [1024, 3581, 1, 4096, 1024, 1024, 4096, 4096] + - [32, 8653.0] + - - [4096, 3374, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 10114.0] + - - [33708, 3751, 1, 1024, 33708, 33708, 1024, 1024] + - [30, 12073.0] + - - [4096, 3215, 1, 1024, 4096, 4096, 1024, 1024] + - [14, 11643.0] + - - [4096, 3312, 1, 1024, 4096, 4096, 1024, 1024] + - [14, 11981.0] + - - [4096, 3581, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 12077.0] + - - [4096, 3479, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 11763.0] + - - [4096, 3544, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 11967.0] + - - [1024, 3870, 1, 1024, 1024, 1024, 1024, 1024] + - [28, 10968.0] + - - [1024, 3374, 1, 4096, 1024, 1024, 4096, 4096] + - [30, 10808.0] + - - [1024, 2967, 1, 4096, 1024, 1024, 4096, 4096] + - [14, 11332.0] + - - [4096, 3455, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 11982.0] + - - [4096, 3942, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 11985.0] + - - [1024, 3528, 1, 4096, 1024, 1024, 4096, 4096] + - [47, 10467.0] + - - [4096, 3186, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 11761.0] + - - [1024, 3976, 1, 1024, 1024, 1024, 1024, 1024] + - [28, 11209.0] + - - [1024, 3511, 1, 4096, 1024, 1024, 4096, 4096] + - [49, 11111.0] + - - [4096, 3573, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 12034.0] + - - [4096, 3561, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 12015.0] + - - [4096, 3418, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 11804.0] + - - [33708, 3906, 1, 1024, 33708, 33708, 1024, 1024] + - [47, 12057.0] + - - [4096, 3259, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 11806.0] + - - [4096, 3308, 1, 1024, 4096, 4096, 1024, 1024] + - [14, 11961.0] + - - [1024, 3419, 1, 4096, 1024, 1024, 4096, 4096] + - [14, 10943.0] + - - [1024, 3215, 1, 4096, 1024, 1024, 4096, 4096] + - [28, 10485.0] + - - [1024, 4030, 1, 4096, 1024, 1024, 4096, 4096] + - [14, 11101.0] + - - [4096, 3459, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 11693.0] + - - [1024, 3572, 1, 4096, 1024, 1024, 4096, 4096] + - [30, 11400.0] + - - [1024, 3137, 1, 4096, 1024, 1024, 4096, 4096] + - [13, 10697.0] + - - [1024, 3312, 1, 4096, 1024, 1024, 4096, 4096] + - [28, 10569.0] + - - [1024, 3925, 1, 4096, 1024, 1024, 4096, 4096] + - [14, 10799.0] + - - [1024, 3453, 1, 4096, 1024, 1024, 4096, 4096] + - [14, 11068.0] + - - [4096, 3435, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 11909.0] + - - [1024, 3176, 1, 4096, 1024, 1024, 4096, 4096] + - [13, 10595.0] + - - [1024, 3444, 1, 4096, 1024, 1024, 4096, 4096] + - [30, 11042.0] + - - [4096, 3975, 1, 1024, 4096, 4096, 1024, 1024] + - [14, 11725.0] + - - [4096, 3182, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 12051.0] + - - [1024, 3475, 1, 4096, 1024, 1024, 4096, 4096] + - [30, 11065.0] + - - [33708, 3955, 1, 1024, 33708, 33708, 1024, 1024] + - [47, 12189.0] + - - [4096, 3446, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 11927.0] + - - [1024, 3138, 1, 4096, 1024, 1024, 4096, 4096] + - [47, 10592.0] + - - [1024, 3549, 1, 4096, 1024, 1024, 4096, 4096] + - [14, 11353.0] + - - [4096, 3287, 1, 1024, 4096, 4096, 1024, 1024] + - [14, 11921.0] + - - [1024, 3342, 1, 4096, 1024, 1024, 4096, 4096] + - [28, 10890.0] + - - [4096, 3519, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 11897.0] + - - [4096, 3552, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 12004.0] + - - [4096, 3859, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 11767.0] + - - [33708, 3969, 1, 1024, 33708, 33708, 1024, 1024] + - [30, 11979.0] + - - [1024, 3369, 1, 4096, 1024, 1024, 4096, 4096] + - [30, 10808.0] + - - [4096, 3482, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 11782.0] + - - [1024, 3306, 1, 4096, 1024, 1024, 4096, 4096] + - [12, 10774.0] + - - [1024, 3474, 1, 4096, 1024, 1024, 4096, 4096] + - [30, 11127.0] + - - [4096, 3377, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 11704.0] + - - [4096, 3426, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 11877.0] + - - [4096, 2935, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 12009.0] + - - [4096, 3267, 1, 1024, 4096, 4096, 1024, 1024] + - [14, 11852.0] + - - [1024, 3299, 1, 4096, 1024, 1024, 4096, 4096] + - [28, 10490.0] + - - [1024, 3456, 1, 4096, 1024, 1024, 4096, 4096] + - [14, 11072.0] + - - [1024, 3280, 1, 4096, 1024, 1024, 4096, 4096] + - [12, 10835.0] + - - [1024, 3555, 1, 4096, 1024, 1024, 4096, 4096] + - [30, 11363.0] + - - [4096, 3499, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 11842.0] + - - [4096, 3356, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 11652.0] + - - [1024, 3412, 1, 4096, 1024, 1024, 4096, 4096] + - [14, 10935.0] + - - [1024, 2984, 1, 4096, 1024, 1024, 4096, 4096] + - [14, 11421.0] + - - [4096, 3141, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 11908.0] + - - [4096, 3510, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 11875.0] + - - [1024, 3995, 1, 1024, 1024, 1024, 1024, 1024] + - [28, 11335.0] + - - [1024, 3517, 1, 4096, 1024, 1024, 4096, 4096] + - [30, 11222.0] + - - [1024, 3455, 1, 4096, 1024, 1024, 4096, 4096] + - [14, 11069.0] + - - [1024, 3939, 1, 1024, 1024, 1024, 1024, 1024] + - [28, 11192.0] + - - [1024, 3447, 1, 4096, 1024, 1024, 4096, 4096] + - [14, 11054.0] + - - [1024, 3969, 1, 4096, 1024, 1024, 4096, 4096] + - [14, 10962.0] + - - [4096, 3527, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 11907.0] + - - [4096, 3336, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 11584.0] + - - [1024, 3191, 1, 4096, 1024, 1024, 4096, 4096] + - [28, 11093.0] + - - [1024, 3302, 1, 4096, 1024, 1024, 4096, 4096] + - [47, 10704.0] + - - [1024, 3337, 1, 4096, 1024, 1024, 4096, 4096] + - [14, 10730.0] + - - [4096, 3290, 1, 1024, 4096, 4096, 1024, 1024] + - [14, 11910.0] + - - [1024, 3512, 1, 4096, 1024, 1024, 4096, 4096] + - [14, 11253.0] + - - [1024, 3433, 1, 4096, 1024, 1024, 4096, 4096] + - [30, 10992.0] + - - [4096, 3876, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 11814.0] + - - [4096, 3490, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 11807.0] + - - [4096, 3064, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 11934.0] + - - [1024, 3508, 1, 4096, 1024, 1024, 4096, 4096] + - [30, 11222.0] + - - [1024, 3956, 1, 4096, 1024, 1024, 4096, 4096] + - [14, 10925.0] + - - [4096, 3417, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 11839.0] + - - [1024, 3248, 1, 4096, 1024, 1024, 4096, 4096] + - [12, 10334.0] + - - [1024, 2499, 1, 4096, 1024, 1024, 4096, 4096] + - [30, 11847.0] + - - [1024, 3186, 1, 4096, 1024, 1024, 4096, 4096] + - [12, 11103.0] + - - [1024, 3180, 1, 4096, 1024, 1024, 4096, 4096] + - [28, 11012.0] + - - [4096, 3364, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 11662.0] + - - [4096, 3976, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 11689.0] + - - [4096, 3205, 1, 1024, 4096, 4096, 1024, 1024] + - [14, 11623.0] + - - [4096, 3318, 1, 1024, 4096, 4096, 1024, 1024] + - [14, 12010.0] + - - [1024, 3377, 1, 4096, 1024, 1024, 4096, 4096] + - [28, 10914.0] + - - [1024, 3485, 1, 4096, 1024, 1024, 4096, 4096] + - [14, 11145.0] + - - [4096, 3181, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 12024.0] + - - [4096, 3550, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 11985.0] + - - [1024, 3534, 1, 4096, 1024, 1024, 4096, 4096] + - [30, 11178.0] + - - [1024, 3860, 1, 1024, 1024, 1024, 1024, 1024] + - [28, 10956.0] + - - [160, 160, 400, 64, 160, 160, 64, 64] + - [16, 5535.0] + - - [4096, 3445, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 11947.0] + - - [1024, 3391, 1, 4096, 1024, 1024, 4096, 4096] + - [14, 10868.0] + - - [1024, 3221, 1, 4096, 1024, 1024, 4096, 4096] + - [12, 10535.0] + - - [4096, 3079, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 11649.0] + - - [4096, 3144, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 11891.0] + - - [1024, 3270, 1, 4096, 1024, 1024, 4096, 4096] + - [28, 10544.0] + - - [1024, 3561, 1, 4096, 1024, 1024, 4096, 4096] + - [30, 11348.0] + - - [1024, 3480, 1, 4096, 1024, 1024, 4096, 4096] + - [30, 11132.0] + - - [4096, 3408, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 11817.0] + - - [1024, 3418, 1, 4096, 1024, 1024, 4096, 4096] + - [28, 10990.0] + - - [4096, 3298, 1, 1024, 4096, 4096, 1024, 1024] + - [14, 11936.0] + - - [1024, 3640, 1, 1024, 1024, 1024, 1024, 1024] + - [30, 11132.0] + - - [1024, 3449, 1, 4096, 1024, 1024, 4096, 4096] + - [14, 11058.0] + - - [1024, 4020, 1, 4096, 1024, 1024, 4096, 4096] + - [30, 11043.0] + - - [4096, 3481, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 11768.0] + - - [4096, 3530, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 11928.0] + - - [1024, 3216, 1, 4096, 1024, 1024, 4096, 4096] + - [14, 10035.0] + - - [1024, 3491, 1, 4096, 1024, 1024, 4096, 4096] + - [14, 11172.0] + - - [1024, 3154, 1, 4096, 1024, 1024, 4096, 4096] + - [12, 10906.0] + - - [4096, 3425, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 11866.0] + - - [1024, 3348, 1, 4096, 1024, 1024, 4096, 4096] + - [28, 10788.0] + - - [1024, 3415, 1, 4096, 1024, 1024, 4096, 4096] + - [14, 10935.0] + - - [1024, 4026, 1, 1024, 1024, 1024, 1024, 1024] + - [28, 11372.0] + - - [1024, 3367, 1, 4096, 1024, 1024, 4096, 4096] + - [14, 10801.0] + - - [1024, 3259, 1, 4096, 1024, 1024, 4096, 4096] + - [28, 10329.0] + - - [1024, 3894, 1, 4096, 1024, 1024, 4096, 4096] + - [30, 10713.0] + - - [4096, 3355, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 11626.0] + - - [4096, 3404, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 11805.0] + - - [1024, 3308, 1, 4096, 1024, 1024, 4096, 4096] + - [12, 10539.0] + - - [4096, 3245, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 11761.0] + - - [1024, 3502, 1, 4096, 1024, 1024, 4096, 4096] + - [14, 11146.0] + - - [33708, 4032, 1, 1024, 33708, 33708, 1024, 1024] + - [30, 12154.0] + - - [1024, 3424, 1, 4096, 1024, 1024, 4096, 4096] + - [30, 10937.0] + - - [4096, 3509, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 11861.0] + - - [4096, 3558, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 12030.0] + - - [1024, 3900, 1, 1024, 1024, 1024, 1024, 1024] + - [28, 11126.0] + - - [1024, 2505, 1, 4096, 1024, 1024, 4096, 4096] + - [30, 11856.0] + - - [4096, 3472, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 11749.0] + - - [1024, 3386, 1, 4096, 1024, 1024, 4096, 4096] + - [30, 10832.0] + - - [4096, 3383, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 11727.0] + - - [4096, 3448, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 11946.0] + - - [4096, 4030, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 11844.0] + - - [4096, 3289, 1, 1024, 4096, 4096, 1024, 1024] + - [14, 11923.0] + - - [1024, 3459, 1, 4096, 1024, 1024, 4096, 4096] + - [30, 11072.0] + - - [1024, 2918, 1, 4096, 1024, 1024, 4096, 4096] + - [30, 11167.0] + - - [4096, 3489, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 11800.0] + - - [4096, 3346, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 11607.0] + - - [4096, 3572, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 12071.0] + - - [1024, 3955, 1, 4096, 1024, 1024, 4096, 4096] + - [30, 10856.0] + - - [4096, 3236, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 11724.0] + - - [4096, 3163, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 11989.0] + - - [4096, 3468, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 11734.0] + - - [1024, 3165, 1, 4096, 1024, 1024, 4096, 4096] + - [12, 10746.0] + - - [1024, 3276, 1, 4096, 1024, 1024, 4096, 4096] + - [28, 10631.0] + - - [1024, 3359, 1, 4096, 1024, 1024, 4096, 4096] + - [14, 10784.0] + - - [4096, 3363, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 11645.0] + - - [1024, 3385, 1, 4096, 1024, 1024, 4096, 4096] + - [28, 10843.0] + - - [1024, 3207, 1, 4096, 1024, 1024, 4096, 4096] + - [28, 10187.0] + - - [1024, 3458, 1, 4096, 1024, 1024, 4096, 4096] + - [14, 11091.0] + - - [4096, 3110, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 11781.0] + - - [4096, 3925, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 11960.0] + - - [1024, 3975, 1, 4096, 1024, 1024, 4096, 4096] + - [30, 10926.0] + - - [4096, 3549, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 11986.0] + - - [4096, 3342, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 11582.0] + - - [1024, 3859, 1, 1024, 1024, 1024, 1024, 1024] + - [28, 10963.0] + - - [1024, 3497, 1, 4096, 1024, 1024, 4096, 4096] + - [14, 11192.0] + - - [4096, 3280, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 11881.0] + - - [1024, 3435, 1, 4096, 1024, 1024, 4096, 4096] + - [30, 10999.0] + - - [1024, 3354, 1, 4096, 1024, 1024, 4096, 4096] + - [30, 10765.0] + - - [4096, 3191, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 12106.0] + - - [4096, 3512, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 11884.0] + - - [1024, 3055, 1, 4096, 1024, 1024, 4096, 4096] + - [14, 11649.0] + - - [4096, 2499, 1, 1024, 4096, 4096, 1024, 1024] + - [14, 11847.0] + - - [1024, 3233, 1, 4096, 1024, 1024, 4096, 4096] + - [28, 10576.0] + - - [4096, 3423, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 11874.0] + - - [1024, 3319, 1, 4096, 1024, 1024, 4096, 4096] + - [28, 10663.0] + - - [4096, 3297, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 11927.0] + - - [4096, 3154, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 11953.0] + - - [1024, 3540, 1, 4096, 1024, 1024, 4096, 4096] + - [30, 11313.0] + - - [1024, 3289, 1, 4096, 1024, 1024, 4096, 4096] + - [28, 10490.0] + - - [4096, 3529, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 11932.0] + - - [4096, 3386, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 11754.0] + - - [4096, 3276, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 11868.0] + - - [1024, 3244, 1, 4096, 1024, 1024, 4096, 4096] + - [28, 10382.0] + - - [1024, 3182, 1, 4096, 1024, 1024, 4096, 4096] + - [12, 11056.0] + - - [4096, 3540, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 11969.0] + - - [1024, 3360, 1, 4096, 1024, 1024, 4096, 4096] + - [30, 10767.0] + - - [1024, 3942, 1, 4096, 1024, 1024, 4096, 4096] + - [30, 10817.0] + - - [4096, 3403, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 11808.0] + - - [4096, 3101, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 11764.0] + - - [4096, 2918, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 11964.0] + - - [1024, 3465, 1, 4096, 1024, 1024, 4096, 4096] + - [14, 11099.0] + - - [33708, 3780, 1, 1024, 33708, 33708, 1024, 1024] + - [30, 12156.0] + - - [4096, 3557, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 12008.0] + - - [4096, 3414, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 11835.0] + - - [1024, 3948, 1, 1024, 1024, 1024, 1024, 1024] + - [28, 11210.0] + - - [4096, 3320, 1, 1024, 4096, 4096, 1024, 1024] + - [14, 12021.0] + - - [4096, 2765, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 11691.0] + - - [1024, 3978, 1, 4096, 1024, 1024, 4096, 4096] + - [14, 10893.0] + - - [4096, 3487, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 11776.0] + - - [4096, 3520, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 11900.0] + - - [1024, 3139, 1, 4096, 1024, 1024, 4096, 4096] + - [28, 10671.0] + - - [1024, 3314, 1, 4096, 1024, 1024, 4096, 4096] + - [28, 10678.0] + - - [4096, 3431, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 11900.0] + - - [1024, 3446, 1, 4096, 1024, 1024, 4096, 4096] + - [14, 11031.0] + - - [1024, 4059, 1, 4096, 1024, 1024, 4096, 4096] + - [30, 11118.0] + - - [4096, 3345, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 11610.0] + - - [4096, 3394, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 11785.0] + - - [1024, 3927, 1, 1024, 1024, 1024, 1024, 1024] + - [28, 11160.0] + - - [4096, 3235, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 11724.0] + - - [1024, 3328, 1, 4096, 1024, 1024, 4096, 4096] + - [28, 10949.0] + - - [33708, 3956, 1, 1024, 33708, 33708, 1024, 1024] + - [47, 12213.0] + - - [4096, 3467, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 11714.0] + - - [1024, 3287, 1, 4096, 1024, 1024, 4096, 4096] + - [28, 10607.0] + - - [4096, 3214, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 11653.0] + - - [4096, 3910, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 11909.0] + - - [1024, 3780, 1, 1024, 1024, 1024, 1024, 1024] + - [30, 11476.0] + - - [1024, 3371, 1, 4096, 1024, 1024, 4096, 4096] + - [28, 10866.0] + - - [4096, 3478, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 11748.0] + - - [1024, 3546, 1, 4096, 1024, 1024, 4096, 4096] + - [14, 11323.0] + - - [1024, 4012, 1, 1024, 1024, 1024, 1024, 1024] + - [28, 11298.0] + - - [4096, 3341, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 11594.0] + - - [4096, 3454, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 11971.0] + - - [4096, 3295, 1, 1024, 4096, 4096, 1024, 1024] + - [14, 11929.0] + - - [4096, 3072, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 12000.0] + - - [1024, 3282, 1, 4096, 1024, 1024, 4096, 4096] + - [12, 10529.0] + - - [33708, 3720, 1, 1024, 33708, 33708, 1024, 1024] + - [30, 11964.0] + - - [1024, 3681, 1, 4096, 1024, 1024, 4096, 4096] + - [30, 11691.0] + - - [1024, 4050, 1, 4096, 1024, 1024, 4096, 4096] + - [14, 11126.0] + - - [4096, 3495, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 11715.0] + - - [4096, 3560, 1, 1024, 4096, 4096, 1024, 1024] + - [14, 11890.0] + - - [4096, 3751, 1, 1024, 4096, 4096, 1024, 1024] + - [14, 11899.0] + - - [1024, 3414, 1, 4096, 1024, 1024, 4096, 4096] + - [14, 10920.0] + - - [33708, 3860, 1, 1024, 33708, 33708, 1024, 1024] + - [47, 11815.0] + - - [1024, 3325, 1, 4096, 1024, 1024, 4096, 4096] + - [47, 10490.0] + - - [4096, 3458, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 11588.0] + - - [4096, 2967, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 11522.0] + - - [1024, 3519, 1, 4096, 1024, 1024, 4096, 4096] + - [30, 11263.0] + - - [4096, 3385, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 11736.0] + - - [4096, 3434, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 11905.0] + - - [1024, 3552, 1, 4096, 1024, 1024, 4096, 4096] + - [30, 11343.0] + - - [4096, 3822, 1, 1024, 4096, 4096, 1024, 1024] + - [14, 12104.0] + - - [1024, 3544, 1, 4096, 1024, 1024, 4096, 4096] + - [30, 11266.0] + - - [4096, 3539, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 11868.0] + - - [4096, 3332, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 11571.0] + - - [1024, 3145, 1, 4096, 1024, 1024, 4096, 4096] + - [47, 10797.0] + - - [1024, 3535, 1, 4096, 1024, 1024, 4096, 4096] + - [30, 11285.0] + - - [1024, 3320, 1, 4096, 1024, 1024, 4096, 4096] + - [28, 10798.0] + - - [33708, 4012, 1, 1024, 33708, 33708, 1024, 1024] + - [30, 12101.0] + - - [4096, 3286, 1, 1024, 4096, 4096, 1024, 1024] + - [14, 11866.0] + - - [1024, 3514, 1, 4096, 1024, 1024, 4096, 4096] + - [14, 11213.0] + - - [1024, 2765, 1, 4096, 1024, 1024, 4096, 4096] + - [12, 10716.0] + - - [1024, 3452, 1, 4096, 1024, 1024, 4096, 4096] + - [14, 11057.0] + - - [4096, 3518, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 11902.0] + - - [1024, 3529, 1, 4096, 1024, 1024, 4096, 4096] + - [14, 11277.0] + - - [4096, 3413, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 11838.0] + - - [33708, 4050, 1, 1024, 33708, 33708, 1024, 1024] + - [30, 12193.0] + - - [1024, 3525, 1, 4096, 1024, 1024, 4096, 4096] + - [30, 11249.0] + - - [4096, 3303, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 11947.0] + - - [1024, 3382, 1, 4096, 1024, 1024, 4096, 4096] + - [30, 10826.0] + - - [1024, 3390, 1, 4096, 1024, 1024, 4096, 4096] + - [30, 10855.0] + - - [1024, 3977, 1, 4096, 1024, 1024, 4096, 4096] + - [30, 10858.0] + - - [1024, 3184, 1, 4096, 1024, 1024, 4096, 4096] + - [28, 10705.0] + - - [4096, 3535, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 11939.0] + - - [4096, 3376, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 11696.0] + - - [4096, 3978, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 11683.0] + - - [1024, 3136, 1, 4096, 1024, 1024, 4096, 4096] + - [47, 10835.0] + - - [1024, 3293, 1, 4096, 1024, 1024, 4096, 4096] + - [28, 10568.0] + - - [4096, 3266, 1, 1024, 4096, 4096, 1024, 1024] + - [14, 11825.0] + - - [1024, 3487, 1, 4096, 1024, 1024, 4096, 4096] + - [14, 11158.0] + - - [1024, 3409, 1, 4096, 1024, 1024, 4096, 4096] + - [14, 10910.0] + - - [4096, 3498, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 11825.0] + - - [1024, 3520, 1, 4096, 1024, 1024, 4096, 4096] + - [30, 11196.0] + - - [1024, 3530, 1, 4096, 1024, 1024, 4096, 4096] + - [30, 11157.0] + - - [4096, 3393, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 11751.0] + - - [4096, 3140, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 11901.0] + - - [1024, 3536, 1, 4096, 1024, 1024, 4096, 4096] + - [30, 11269.0] + - - [1024, 3288, 1, 4096, 1024, 1024, 4096, 4096] + - [28, 10572.0] + - - [1024, 4005, 1, 4096, 1024, 1024, 4096, 4096] + - [14, 11023.0] + - - [1024, 3579, 1, 4096, 1024, 1024, 4096, 4096] + - [30, 11397.0] + - - [4096, 3372, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 11695.0] + - - [1024, 3440, 1, 4096, 1024, 1024, 4096, 4096] + - [30, 11011.0] + - - [4096, 3213, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 11651.0] + - - [4096, 3477, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 11752.0] + - - [4096, 3526, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 11911.0] + - - [1024, 3493, 1, 4096, 1024, 1024, 4096, 4096] + - [14, 11175.0] + - - [1024, 3944, 1, 4096, 1024, 1024, 4096, 4096] + - [14, 10878.0] + - - [4096, 3453, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 11949.0] + - - [1024, 3350, 1, 4096, 1024, 1024, 4096, 4096] + - [30, 10752.0] + - - [4096, 3184, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 12068.0] + - - [1024, 3423, 1, 4096, 1024, 1024, 4096, 4096] + - [30, 10964.0] + - - [4096, 3351, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 11606.0] + - - [4096, 3416, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 11829.0] + - - [1024, 3796, 1, 4096, 1024, 1024, 4096, 4096] + - [14, 11970.0] + - - [4096, 3257, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 11793.0] + - - [4096, 3306, 1, 1024, 4096, 4096, 1024, 1024] + - [14, 11963.0] + - - [33708, 4020, 1, 1024, 33708, 33708, 1024, 1024] + - [30, 12121.0] + - - [1024, 3426, 1, 4096, 1024, 1024, 4096, 4096] + - [30, 10959.0] + - - [4096, 3457, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 11668.0] + - - [1024, 2935, 1, 4096, 1024, 1024, 4096, 4096] + - [14, 11199.0] + - - [1024, 3046, 1, 4096, 1024, 1024, 4096, 4096] + - [30, 11636.0] + - - [4096, 3433, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 11891.0] + - - [1024, 3256, 1, 4096, 1024, 1024, 4096, 4096] + - [47, 10295.0] + - - [1024, 3531, 1, 4096, 1024, 1024, 4096, 4096] + - [30, 11139.0] + - - [4096, 3180, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 12013.0] + - - [1024, 3388, 1, 4096, 1024, 1024, 4096, 4096] + - [14, 10837.0] + - - [4096, 3444, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 11936.0] + - - [1024, 3501, 1, 4096, 1024, 1024, 4096, 4096] + - [14, 11169.0] + - - [1024, 3266, 1, 4096, 1024, 1024, 4096, 4096] + - [12, 10692.0] + - - [1024, 3267, 1, 4096, 1024, 1024, 4096, 4096] + - [49, 10431.0] + - - [1024, 3461, 1, 4096, 1024, 1024, 4096, 4096] + - [14, 11069.0] + - - [4096, 3870, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 11786.0] + - - [4096, 3517, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 11887.0] + - - [1024, 3566, 1, 4096, 1024, 1024, 4096, 4096] + - [30, 11333.0] + - - [4096, 3574, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 12040.0] + - - [1024, 3876, 1, 1024, 1024, 1024, 1024, 1024] + - [28, 11045.0] + - - [4096, 3720, 1, 1024, 4096, 4096, 1024, 1024] + - [14, 11783.0] + - - [4096, 3248, 1, 1024, 4096, 4096, 1024, 1024] + - [14, 11765.0] + - - [4096, 4059, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 11915.0] + - - [1024, 3380, 1, 4096, 1024, 1024, 4096, 4096] + - [30, 10818.0] + - - [4096, 3480, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 11765.0] + - - [1024, 3335, 1, 4096, 1024, 1024, 4096, 4096] + - [14, 10700.0] + - - [1024, 3345, 1, 4096, 1024, 1024, 4096, 4096] + - [30, 10743.0] + - - [4096, 3391, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 11761.0] + - - [4096, 3424, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 11880.0] + - - [1024, 3394, 1, 4096, 1024, 1024, 4096, 4096] + - [14, 10857.0] + - - [4096, 3265, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 11822.0] + - - [1024, 3014, 1, 4096, 1024, 1024, 4096, 4096] + - [30, 11520.0] + - - [4096, 3497, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 11814.0] + - - [4096, 3354, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 11626.0] + - - [4096, 3055, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 11911.0] + - - [1024, 3499, 1, 4096, 1024, 1024, 4096, 4096] + - [30, 11164.0] + - - [1024, 3162, 1, 4096, 1024, 1024, 4096, 4096] + - [13, 10539.0] + - - [4096, 3244, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 11748.0] + - - [1024, 3437, 1, 4096, 1024, 1024, 4096, 4096] + - [28, 11064.0] + - - [1024, 3356, 1, 4096, 1024, 1024, 4096, 4096] + - [14, 10762.0] + - - [4096, 3139, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 11867.0] + - - [4096, 3508, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 11856.0] + - - [1024, 3235, 1, 4096, 1024, 1024, 4096, 4096] + - [28, 10580.0] + - - [1024, 3910, 1, 4096, 1024, 1024, 4096, 4096] + - [30, 10697.0] + - - [4096, 3371, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 11694.0] + - - [1024, 3751, 1, 4096, 1024, 1024, 4096, 4096] + - [30, 11832.0] + - - [4096, 3325, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 12015.0] + - - [1024, 3413, 1, 4096, 1024, 1024, 4096, 4096] + - [14, 10924.0] + - - [1024, 3542, 1, 4096, 1024, 1024, 4096, 4096] + - [30, 11256.0] + - - [33708, 3900, 1, 1024, 33708, 33708, 1024, 1024] + - [47, 12042.0] + - - [4096, 3525, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 11895.0] + - - [4096, 3382, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 11724.0] + - - [1024, 3339, 1, 4096, 1024, 1024, 4096, 4096] + - [14, 10704.0] + - - [4096, 3288, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 11894.0] + - - [1024, 3141, 1, 4096, 1024, 1024, 4096, 4096] + - [12, 10716.0] + - - [1024, 3168, 1, 4096, 1024, 1024, 4096, 4096] + - [28, 10812.0] + - - [4096, 3488, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 11778.0] + - - [4096, 3046, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 11880.0] + - - [1024, 3362, 1, 4096, 1024, 1024, 4096, 4096] + - [30, 10768.0] + - - [33708, 3942, 1, 1024, 33708, 33708, 1024, 1024] + - [47, 12159.0] + - - [4096, 3399, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 11780.0] + - - [1024, 3720, 1, 1024, 1024, 1024, 1024, 1024] + - [30, 11321.0] + - - [4096, 3563, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 12005.0] + - - [1024, 3273, 1, 4096, 1024, 1024, 4096, 4096] + - [28, 10365.0] + - - [4096, 3162, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 11968.0] + - - [1024, 3467, 1, 4096, 1024, 1024, 4096, 4096] + - [14, 11061.0] + - - [1024, 3130, 1, 4096, 1024, 1024, 4096, 4096] + - [47, 10763.0] + - - [1024, 3405, 1, 4096, 1024, 1024, 4096, 4096] + - [14, 10899.0] + - - [4096, 3362, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 11636.0] + - - [1024, 3960, 1, 1024, 1024, 1024, 1024, 1024] + - [28, 11207.0] + - - [1024, 3712, 1, 36548, 1024, 1024, 36548, 36548] + - [22, 11982.0] + - - [1024, 3712, 1, 1024, 1024, 1024, 1024, 1024] + - [12, 11403.0] + - - [4032, 384, 1, 64, 4032, 4032, 64, 64] + - [2, 8230.0] + - - [1024, 2048, 1, 49, 1024, 1024, 49, 49] + - [0, 7126.0] + - - [4608, 512, 1, 49, 4608, 4608, 49, 49] + - [18, 7697.0] + - - [9216, 512, 1, 4096, 9216, 9216, 4096, 4096] + - [49, 10648.0] + - - [3456, 384, 1, 289, 3456, 3456, 289, 289] + - [34, 9296.0] + - - [3456, 384, 1, 169, 3456, 3456, 169, 169] + - [34, 9066.0] + - - [4096, 512, 1, 1001, 4096, 4096, 1001, 1001] + - [35, 10904.0] + - - [384, 448, 49, 512, 384, 384, 512, 512] + - [47, 10156.0] + - - [384, 448, 64, 256, 384, 384, 256, 256] + - [28, 9972.0] + - - [384, 448, 36, 256, 384, 384, 256, 256] + - [38, 9794.0] + - - [384, 448, 49, 256, 384, 384, 256, 256] + - [12, 9942.0] + - - [384, 448, 64, 512, 384, 384, 512, 512] + - [28, 10228.0] + - - [384, 448, 36, 512, 384, 384, 512, 512] + - [47, 10151.0] + - - [1024, 6400, 1, 65, 1024, 1024, 65, 65] + - [0, 9437.0] + - - [4096, 6400, 1, 256, 4096, 4096, 256, 256] + - [12, 11978.0] + - - [512, 3194, 1, 2048, 512, 512, 2048, 2048] + - [47, 11107.0] + - - [512, 3222, 1, 2048, 512, 512, 2048, 2048] + - [49, 10015.0] + - - [512, 3234, 1, 2048, 512, 512, 2048, 2048] + - [49, 10054.0] + - - [512, 3242, 1, 2048, 512, 512, 2048, 2048] + - [30, 10112.0] + - - [512, 3257, 1, 2048, 512, 512, 2048, 2048] + - [30, 10131.0] + - - [512, 3332, 1, 2048, 512, 512, 2048, 2048] + - [14, 10355.0] + - - [512, 3336, 1, 2048, 512, 512, 2048, 2048] + - [49, 10385.0] + - - [512, 3378, 1, 2048, 512, 512, 2048, 2048] + - [49, 10501.0] + - - [512, 3396, 1, 2048, 512, 512, 2048, 2048] + - [49, 10550.0] + - - [512, 3399, 1, 2048, 512, 512, 2048, 2048] + - [30, 10567.0] + - - [512, 3451, 1, 2048, 512, 512, 2048, 2048] + - [49, 10731.0] + - - [512, 3456, 1, 2048, 512, 512, 2048, 2048] + - [30, 10732.0] + - - [512, 3458, 1, 2048, 512, 512, 2048, 2048] + - [14, 10707.0] + - - [512, 3467, 1, 2048, 512, 512, 2048, 2048] + - [30, 10763.0] + - - [512, 3468, 1, 2048, 512, 512, 2048, 2048] + - [49, 10746.0] + - - [512, 3470, 1, 2048, 512, 512, 2048, 2048] + - [30, 10787.0] + - - [512, 3477, 1, 2048, 512, 512, 2048, 2048] + - [30, 10778.0] + - - [512, 3478, 1, 2048, 512, 512, 2048, 2048] + - [30, 10810.0] + - - [512, 3495, 1, 2048, 512, 512, 2048, 2048] + - [49, 10846.0] + - - [512, 3507, 1, 2048, 512, 512, 2048, 2048] + - [30, 10886.0] + - - [512, 3515, 1, 2048, 512, 512, 2048, 2048] + - [30, 10913.0] + - - [512, 3517, 1, 2048, 512, 512, 2048, 2048] + - [49, 10889.0] + - - [2048, 2864, 1, 512, 2048, 2048, 512, 512] + - [12, 11094.0] + - - [2048, 3287, 1, 512, 2048, 2048, 512, 512] + - [12, 11540.0] + - - [2048, 3412, 1, 512, 2048, 2048, 512, 512] + - [12, 11496.0] + - - [2048, 3456, 1, 512, 2048, 2048, 512, 512] + - [12, 11719.0] + - - [2048, 3466, 1, 512, 2048, 2048, 512, 512] + - [12, 11225.0] + - - [2048, 3476, 1, 512, 2048, 2048, 512, 512] + - [28, 11266.0] + - - [2048, 3999, 1, 512, 2048, 2048, 512, 512] + - [12, 11470.0] + - - [33708, 189, 1, 512, 33708, 33708, 512, 512] + - [27, 9636.0] + - - [33708, 2496, 1, 512, 33708, 33708, 512, 512] + - [12, 11886.0] + - - [33708, 3864, 1, 512, 33708, 33708, 512, 512] + - [12, 11906.0] + - - [33708, 3969, 1, 512, 33708, 33708, 512, 512] + - [6, 11871.0] + - - [33708, 3995, 1, 512, 33708, 33708, 512, 512] + - [14, 11922.0] + - - [134, 134, 240, 64, 134, 134, 64, 64] + - [7, 3800.0] + - - [135, 134, 240, 64, 135, 135, 64, 64] + - [24, 3894.0] + - - [135, 135, 240, 64, 135, 135, 64, 64] + - [34, 3935.0] + - - [512, 2790, 1, 2048, 512, 512, 2048, 2048] + - [28, 9959.0] + - - [512, 2864, 1, 2048, 512, 512, 2048, 2048] + - [12, 10210.0] + - - [512, 3092, 1, 2048, 512, 512, 2048, 2048] + - [12, 10772.0] + - - [512, 3113, 1, 2048, 512, 512, 2048, 2048] + - [12, 11036.0] + - - [512, 3137, 1, 2048, 512, 512, 2048, 2048] + - [28, 11111.0] + - - [512, 3165, 1, 2048, 512, 512, 2048, 2048] + - [12, 11213.0] + - - [512, 3166, 1, 2048, 512, 512, 2048, 2048] + - [47, 11214.0] + - - [512, 3219, 1, 2048, 512, 512, 2048, 2048] + - [49, 10009.0] + - - [512, 3237, 1, 2048, 512, 512, 2048, 2048] + - [49, 10056.0] + - - [512, 3246, 1, 2048, 512, 512, 2048, 2048] + - [30, 10080.0] + - - [512, 3249, 1, 2048, 512, 512, 2048, 2048] + - [49, 10132.0] + - - [512, 3251, 1, 2048, 512, 512, 2048, 2048] + - [49, 10131.0] + - - [512, 3262, 1, 2048, 512, 512, 2048, 2048] + - [49, 10173.0] + - - [512, 3268, 1, 2048, 512, 512, 2048, 2048] + - [49, 10178.0] + - - [512, 3282, 1, 2048, 512, 512, 2048, 2048] + - [49, 10203.0] + - - [512, 3286, 1, 2048, 512, 512, 2048, 2048] + - [49, 10200.0] + - - [512, 3287, 1, 2048, 512, 512, 2048, 2048] + - [30, 10231.0] + - - [512, 3293, 1, 2048, 512, 512, 2048, 2048] + - [49, 10242.0] + - - [512, 3297, 1, 2048, 512, 512, 2048, 2048] + - [49, 10237.0] + - - [512, 3307, 1, 2048, 512, 512, 2048, 2048] + - [30, 10274.0] + - - [512, 3314, 1, 2048, 512, 512, 2048, 2048] + - [49, 10279.0] + - - [512, 3315, 1, 2048, 512, 512, 2048, 2048] + - [49, 10285.0] + - - [512, 3319, 1, 2048, 512, 512, 2048, 2048] + - [49, 10308.0] + - - [512, 3322, 1, 2048, 512, 512, 2048, 2048] + - [14, 10308.0] + - - [512, 3323, 1, 2048, 512, 512, 2048, 2048] + - [14, 10333.0] + - - [512, 3324, 1, 2048, 512, 512, 2048, 2048] + - [30, 10308.0] + - - [512, 3325, 1, 2048, 512, 512, 2048, 2048] + - [14, 10329.0] + - - [512, 3327, 1, 2048, 512, 512, 2048, 2048] + - [30, 10321.0] + - - [512, 3329, 1, 2048, 512, 512, 2048, 2048] + - [49, 10370.0] + - - [512, 3339, 1, 2048, 512, 512, 2048, 2048] + - [49, 10400.0] + - - [512, 3342, 1, 2048, 512, 512, 2048, 2048] + - [30, 10438.0] + - - [512, 3344, 1, 2048, 512, 512, 2048, 2048] + - [49, 10399.0] + - - [512, 3358, 1, 2048, 512, 512, 2048, 2048] + - [30, 10456.0] + - - [512, 3360, 1, 2048, 512, 512, 2048, 2048] + - [30, 10466.0] + - - [512, 3364, 1, 2048, 512, 512, 2048, 2048] + - [30, 10478.0] + - - [512, 3365, 1, 2048, 512, 512, 2048, 2048] + - [30, 10465.0] + - - [512, 3369, 1, 2048, 512, 512, 2048, 2048] + - [49, 10483.0] + - - [512, 3371, 1, 2048, 512, 512, 2048, 2048] + - [30, 10494.0] + - - [512, 3374, 1, 2048, 512, 512, 2048, 2048] + - [30, 10506.0] + - - [512, 3376, 1, 2048, 512, 512, 2048, 2048] + - [14, 10486.0] + - - [512, 3377, 1, 2048, 512, 512, 2048, 2048] + - [49, 10498.0] + - - [512, 3381, 1, 2048, 512, 512, 2048, 2048] + - [49, 10503.0] + - - [512, 3382, 1, 2048, 512, 512, 2048, 2048] + - [30, 10519.0] + - - [512, 3383, 1, 2048, 512, 512, 2048, 2048] + - [49, 10529.0] + - - [512, 3384, 1, 2048, 512, 512, 2048, 2048] + - [14, 10519.0] + - - [512, 3385, 1, 2048, 512, 512, 2048, 2048] + - [14, 10524.0] + - - [512, 3386, 1, 2048, 512, 512, 2048, 2048] + - [14, 10541.0] + - - [512, 3388, 1, 2048, 512, 512, 2048, 2048] + - [30, 10529.0] + - - [512, 3390, 1, 2048, 512, 512, 2048, 2048] + - [49, 10546.0] + - - [512, 3391, 1, 2048, 512, 512, 2048, 2048] + - [30, 10533.0] + - - [512, 3402, 1, 2048, 512, 512, 2048, 2048] + - [49, 10583.0] + - - [512, 3410, 1, 2048, 512, 512, 2048, 2048] + - [49, 10588.0] + - - [512, 3412, 1, 2048, 512, 512, 2048, 2048] + - [30, 10605.0] + - - [512, 3414, 1, 2048, 512, 512, 2048, 2048] + - [14, 10621.0] + - - [512, 3415, 1, 2048, 512, 512, 2048, 2048] + - [30, 10601.0] + - - [512, 3418, 1, 2048, 512, 512, 2048, 2048] + - [49, 10617.0] + - - [512, 3420, 1, 2048, 512, 512, 2048, 2048] + - [30, 10646.0] + - - [512, 3422, 1, 2048, 512, 512, 2048, 2048] + - [30, 10641.0] + - - [512, 3425, 1, 2048, 512, 512, 2048, 2048] + - [30, 10649.0] + - - [512, 3426, 1, 2048, 512, 512, 2048, 2048] + - [49, 10670.0] + - - [512, 3427, 1, 2048, 512, 512, 2048, 2048] + - [14, 10649.0] + - - [512, 3428, 1, 2048, 512, 512, 2048, 2048] + - [49, 10660.0] + - - [512, 3430, 1, 2048, 512, 512, 2048, 2048] + - [30, 10671.0] + - - [512, 3431, 1, 2048, 512, 512, 2048, 2048] + - [14, 10660.0] + - - [512, 3432, 1, 2048, 512, 512, 2048, 2048] + - [30, 10687.0] + - - [512, 3438, 1, 2048, 512, 512, 2048, 2048] + - [30, 10663.0] + - - [512, 3439, 1, 2048, 512, 512, 2048, 2048] + - [30, 10685.0] + - - [512, 3440, 1, 2048, 512, 512, 2048, 2048] + - [30, 10700.0] + - - [512, 3443, 1, 2048, 512, 512, 2048, 2048] + - [30, 10722.0] + - - [512, 3445, 1, 2048, 512, 512, 2048, 2048] + - [14, 10694.0] + - - [512, 3447, 1, 2048, 512, 512, 2048, 2048] + - [14, 10711.0] + - - [512, 3448, 1, 2048, 512, 512, 2048, 2048] + - [49, 10715.0] + - - [512, 3450, 1, 2048, 512, 512, 2048, 2048] + - [14, 10719.0] + - - [512, 3452, 1, 2048, 512, 512, 2048, 2048] + - [49, 10712.0] + - - [512, 3453, 1, 2048, 512, 512, 2048, 2048] + - [30, 10727.0] + - - [512, 3455, 1, 2048, 512, 512, 2048, 2048] + - [30, 10712.0] + - - [512, 3457, 1, 2048, 512, 512, 2048, 2048] + - [49, 10741.0] + - - [512, 3459, 1, 2048, 512, 512, 2048, 2048] + - [30, 10740.0] + - - [512, 3460, 1, 2048, 512, 512, 2048, 2048] + - [49, 10733.0] + - - [512, 3461, 1, 2048, 512, 512, 2048, 2048] + - [30, 10752.0] + - - [512, 3462, 1, 2048, 512, 512, 2048, 2048] + - [49, 10770.0] + - - [512, 3466, 1, 2048, 512, 512, 2048, 2048] + - [14, 10747.0] + - - [512, 3471, 1, 2048, 512, 512, 2048, 2048] + - [30, 10785.0] + - - [512, 3472, 1, 2048, 512, 512, 2048, 2048] + - [14, 10765.0] + - - [512, 3475, 1, 2048, 512, 512, 2048, 2048] + - [49, 10790.0] + - - [512, 3476, 1, 2048, 512, 512, 2048, 2048] + - [30, 10797.0] + - - [512, 3479, 1, 2048, 512, 512, 2048, 2048] + - [30, 10778.0] + - - [512, 3480, 1, 2048, 512, 512, 2048, 2048] + - [14, 10800.0] + - - [512, 3481, 1, 2048, 512, 512, 2048, 2048] + - [49, 10836.0] + - - [512, 3483, 1, 2048, 512, 512, 2048, 2048] + - [14, 10823.0] + - - [512, 3484, 1, 2048, 512, 512, 2048, 2048] + - [14, 10815.0] + - - [512, 3487, 1, 2048, 512, 512, 2048, 2048] + - [30, 10807.0] + - - [512, 3489, 1, 2048, 512, 512, 2048, 2048] + - [49, 10855.0] + - - [512, 3490, 1, 2048, 512, 512, 2048, 2048] + - [49, 10790.0] + - - [512, 3491, 1, 2048, 512, 512, 2048, 2048] + - [30, 10839.0] + - - [512, 3493, 1, 2048, 512, 512, 2048, 2048] + - [30, 10840.0] + - - [512, 3494, 1, 2048, 512, 512, 2048, 2048] + - [49, 10853.0] + - - [512, 3497, 1, 2048, 512, 512, 2048, 2048] + - [30, 10858.0] + - - [512, 3498, 1, 2048, 512, 512, 2048, 2048] + - [49, 10849.0] + - - [512, 3499, 1, 2048, 512, 512, 2048, 2048] + - [49, 10856.0] + - - [512, 3501, 1, 2048, 512, 512, 2048, 2048] + - [30, 10839.0] + - - [512, 3503, 1, 2048, 512, 512, 2048, 2048] + - [30, 10889.0] + - - [512, 3508, 1, 2048, 512, 512, 2048, 2048] + - [49, 10897.0] + - - [512, 3509, 1, 2048, 512, 512, 2048, 2048] + - [30, 10896.0] + - - [512, 3511, 1, 2048, 512, 512, 2048, 2048] + - [49, 10871.0] + - - [512, 3514, 1, 2048, 512, 512, 2048, 2048] + - [30, 10891.0] + - - [512, 3518, 1, 2048, 512, 512, 2048, 2048] + - [49, 10894.0] + - - [512, 3519, 1, 2048, 512, 512, 2048, 2048] + - [49, 10916.0] + - - [512, 3520, 1, 2048, 512, 512, 2048, 2048] + - [30, 10914.0] + - - [512, 3523, 1, 2048, 512, 512, 2048, 2048] + - [30, 10884.0] + - - [512, 3528, 1, 2048, 512, 512, 2048, 2048] + - [49, 10831.0] + - - [512, 3529, 1, 2048, 512, 512, 2048, 2048] + - [30, 10932.0] + - - [512, 3530, 1, 2048, 512, 512, 2048, 2048] + - [49, 10928.0] + - - [512, 3532, 1, 2048, 512, 512, 2048, 2048] + - [30, 10922.0] + - - [512, 3533, 1, 2048, 512, 512, 2048, 2048] + - [49, 10918.0] + - - [512, 3534, 1, 2048, 512, 512, 2048, 2048] + - [49, 10947.0] + - - [512, 3538, 1, 2048, 512, 512, 2048, 2048] + - [49, 10973.0] + - - [512, 3539, 1, 2048, 512, 512, 2048, 2048] + - [49, 10980.0] + - - [512, 3541, 1, 2048, 512, 512, 2048, 2048] + - [49, 10985.0] + - - [512, 3547, 1, 2048, 512, 512, 2048, 2048] + - [49, 10966.0] + - - [512, 3548, 1, 2048, 512, 512, 2048, 2048] + - [30, 11000.0] + - - [512, 3552, 1, 2048, 512, 512, 2048, 2048] + - [49, 10998.0] + - - [512, 3564, 1, 2048, 512, 512, 2048, 2048] + - [30, 11026.0] + - - [512, 3575, 1, 2048, 512, 512, 2048, 2048] + - [30, 11077.0] + - - [512, 3598, 1, 2048, 512, 512, 2048, 2048] + - [30, 11123.0] + - - [512, 3599, 1, 2048, 512, 512, 2048, 2048] + - [49, 11139.0] + - - [512, 3608, 1, 2048, 512, 512, 2048, 2048] + - [14, 11171.0] + - - [512, 3780, 1, 512, 512, 512, 512, 512] + - [28, 10582.0] + - - [512, 3780, 1, 2048, 512, 512, 2048, 2048] + - [30, 11647.0] + - - [512, 3796, 1, 512, 512, 512, 512, 512] + - [30, 10661.0] + - - [512, 3796, 1, 2048, 512, 512, 2048, 2048] + - [14, 11705.0] + - - [512, 3822, 1, 512, 512, 512, 512, 512] + - [28, 10693.0] + - - [512, 3822, 1, 2048, 512, 512, 2048, 2048] + - [49, 11775.0] + - - [512, 3840, 1, 512, 512, 512, 512, 512] + - [12, 11143.0] + - - [512, 3840, 1, 2048, 512, 512, 2048, 2048] + - [14, 11911.0] + - - [512, 3859, 1, 512, 512, 512, 512, 512] + - [28, 9682.0] + - - [512, 3859, 1, 2048, 512, 512, 2048, 2048] + - [28, 10112.0] + - - [512, 3870, 1, 512, 512, 512, 512, 512] + - [25, 9755.0] + - - [512, 3870, 1, 2048, 512, 512, 2048, 2048] + - [28, 10180.0] + - - [512, 3876, 1, 512, 512, 512, 512, 512] + - [44, 9804.0] + - - [512, 3876, 1, 2048, 512, 512, 2048, 2048] + - [47, 10161.0] + - - [512, 3906, 1, 512, 512, 512, 512, 512] + - [25, 9834.0] + - - [512, 3906, 1, 2048, 512, 512, 2048, 2048] + - [28, 10230.0] + - - [512, 3910, 1, 512, 512, 512, 512, 512] + - [25, 9911.0] + - - [512, 3910, 1, 2048, 512, 512, 2048, 2048] + - [47, 10239.0] + - - [512, 3925, 1, 512, 512, 512, 512, 512] + - [47, 9854.0] + - - [512, 3925, 1, 2048, 512, 512, 2048, 2048] + - [28, 10284.0] + - - [512, 3927, 1, 512, 512, 512, 512, 512] + - [25, 9866.0] + - - [512, 3942, 1, 512, 512, 512, 512, 512] + - [47, 9893.0] + - - [512, 3942, 1, 2048, 512, 512, 2048, 2048] + - [47, 10360.0] + - - [512, 3944, 1, 512, 512, 512, 512, 512] + - [25, 9966.0] + - - [512, 3944, 1, 2048, 512, 512, 2048, 2048] + - [47, 10350.0] + - - [512, 3955, 1, 512, 512, 512, 512, 512] + - [47, 9887.0] + - - [512, 3955, 1, 2048, 512, 512, 2048, 2048] + - [12, 10348.0] + - - [512, 3968, 1, 512, 512, 512, 512, 512] + - [47, 10140.0] + - - [512, 3968, 1, 2048, 512, 512, 2048, 2048] + - [28, 10470.0] + - - [512, 3969, 1, 512, 512, 512, 512, 512] + - [28, 9958.0] + - - [512, 3969, 1, 2048, 512, 512, 2048, 2048] + - [47, 10420.0] + - - [512, 3976, 1, 512, 512, 512, 512, 512] + - [47, 9968.0] + - - [512, 3976, 1, 2048, 512, 512, 2048, 2048] + - [28, 10372.0] + - - [512, 3977, 1, 512, 512, 512, 512, 512] + - [47, 9973.0] + - - [512, 3977, 1, 2048, 512, 512, 2048, 2048] + - [28, 10397.0] + - - [512, 3978, 1, 512, 512, 512, 512, 512] + - [47, 10000.0] + - - [512, 3978, 1, 2048, 512, 512, 2048, 2048] + - [12, 10404.0] + - - [512, 3990, 1, 512, 512, 512, 512, 512] + - [28, 10032.0] + - - [512, 3990, 1, 2048, 512, 512, 2048, 2048] + - [28, 10428.0] + - - [512, 3995, 1, 512, 512, 512, 512, 512] + - [28, 10020.0] + - - [512, 3995, 1, 2048, 512, 512, 2048, 2048] + - [47, 10467.0] + - - [512, 3996, 1, 512, 512, 512, 512, 512] + - [47, 10121.0] + - - [512, 3996, 1, 2048, 512, 512, 2048, 2048] + - [12, 10447.0] + - - [512, 3999, 1, 512, 512, 512, 512, 512] + - [28, 9988.0] + - - [512, 3999, 1, 2048, 512, 512, 2048, 2048] + - [12, 10455.0] + - - [512, 4005, 1, 512, 512, 512, 512, 512] + - [47, 10068.0] + - - [512, 4005, 1, 2048, 512, 512, 2048, 2048] + - [28, 10467.0] + - - [512, 4012, 1, 512, 512, 512, 512, 512] + - [47, 10066.0] + - - [512, 4012, 1, 2048, 512, 512, 2048, 2048] + - [47, 10571.0] + - - [512, 4020, 1, 512, 512, 512, 512, 512] + - [25, 10046.0] + - - [512, 4020, 1, 2048, 512, 512, 2048, 2048] + - [12, 10512.0] + - - [512, 4026, 1, 512, 512, 512, 512, 512] + - [28, 10078.0] + - - [512, 4026, 1, 2048, 512, 512, 2048, 2048] + - [12, 10516.0] + - - [512, 4030, 1, 512, 512, 512, 512, 512] + - [47, 10162.0] + - - [512, 4030, 1, 2048, 512, 512, 2048, 2048] + - [12, 10532.0] + - - [512, 4032, 1, 512, 512, 512, 512, 512] + - [28, 10095.0] + - - [512, 4032, 1, 2048, 512, 512, 2048, 2048] + - [47, 10573.0] + - - [512, 4050, 1, 512, 512, 512, 512, 512] + - [25, 10212.0] + - - [512, 4059, 1, 512, 512, 512, 512, 512] + - [47, 10204.0] + - - [2048, 2790, 1, 512, 2048, 2048, 512, 512] + - [12, 11388.0] + - - [2048, 3092, 1, 512, 2048, 2048, 512, 512] + - [12, 11400.0] + - - [2048, 3113, 1, 512, 2048, 2048, 512, 512] + - [12, 11460.0] + - - [2048, 3137, 1, 512, 2048, 2048, 512, 512] + - [12, 11542.0] + - - [2048, 3165, 1, 512, 2048, 2048, 512, 512] + - [12, 11599.0] + - - [2048, 3166, 1, 512, 2048, 2048, 512, 512] + - [12, 11638.0] + - - [2048, 3194, 1, 512, 2048, 2048, 512, 512] + - [12, 11705.0] + - - [2048, 3219, 1, 512, 2048, 2048, 512, 512] + - [12, 11322.0] + - - [2048, 3222, 1, 512, 2048, 2048, 512, 512] + - [12, 11349.0] + - - [2048, 3234, 1, 512, 2048, 2048, 512, 512] + - [12, 11376.0] + - - [2048, 3237, 1, 512, 2048, 2048, 512, 512] + - [12, 11395.0] + - - [2048, 3242, 1, 512, 2048, 2048, 512, 512] + - [12, 11399.0] + - - [2048, 3246, 1, 512, 2048, 2048, 512, 512] + - [12, 11419.0] + - - [2048, 3249, 1, 512, 2048, 2048, 512, 512] + - [12, 11409.0] + - - [2048, 3251, 1, 512, 2048, 2048, 512, 512] + - [12, 11420.0] + - - [2048, 3257, 1, 512, 2048, 2048, 512, 512] + - [12, 11460.0] + - - [2048, 3262, 1, 512, 2048, 2048, 512, 512] + - [12, 11467.0] + - - [2048, 3268, 1, 512, 2048, 2048, 512, 512] + - [12, 11498.0] + - - [2048, 3282, 1, 512, 2048, 2048, 512, 512] + - [12, 11547.0] + - - [2048, 3286, 1, 512, 2048, 2048, 512, 512] + - [12, 11536.0] + - - [2048, 3293, 1, 512, 2048, 2048, 512, 512] + - [12, 11575.0] + - - [2048, 3297, 1, 512, 2048, 2048, 512, 512] + - [28, 11590.0] + - - [2048, 3307, 1, 512, 2048, 2048, 512, 512] + - [12, 11603.0] + - - [2048, 3314, 1, 512, 2048, 2048, 512, 512] + - [12, 11637.0] + - - [2048, 3315, 1, 512, 2048, 2048, 512, 512] + - [12, 11619.0] + - - [2048, 3319, 1, 512, 2048, 2048, 512, 512] + - [28, 11636.0] + - - [2048, 3322, 1, 512, 2048, 2048, 512, 512] + - [28, 11634.0] + - - [2048, 3323, 1, 512, 2048, 2048, 512, 512] + - [28, 11648.0] + - - [2048, 3324, 1, 512, 2048, 2048, 512, 512] + - [12, 11654.0] + - - [2048, 3325, 1, 512, 2048, 2048, 512, 512] + - [28, 11661.0] + - - [2048, 3327, 1, 512, 2048, 2048, 512, 512] + - [12, 11650.0] + - - [2048, 3329, 1, 512, 2048, 2048, 512, 512] + - [12, 11198.0] + - - [2048, 3332, 1, 512, 2048, 2048, 512, 512] + - [12, 11238.0] + - - [2048, 3336, 1, 512, 2048, 2048, 512, 512] + - [12, 11258.0] + - - [2048, 3339, 1, 512, 2048, 2048, 512, 512] + - [12, 11254.0] + - - [2048, 3342, 1, 512, 2048, 2048, 512, 512] + - [12, 11287.0] + - - [2048, 3344, 1, 512, 2048, 2048, 512, 512] + - [12, 11289.0] + - - [2048, 3358, 1, 512, 2048, 2048, 512, 512] + - [12, 11334.0] + - - [2048, 3360, 1, 512, 2048, 2048, 512, 512] + - [12, 11348.0] + - - [2048, 3364, 1, 512, 2048, 2048, 512, 512] + - [28, 11371.0] + - - [2048, 3365, 1, 512, 2048, 2048, 512, 512] + - [12, 11354.0] + - - [2048, 3369, 1, 512, 2048, 2048, 512, 512] + - [12, 11346.0] + - - [2048, 3371, 1, 512, 2048, 2048, 512, 512] + - [12, 11389.0] + - - [2048, 3374, 1, 512, 2048, 2048, 512, 512] + - [12, 11380.0] + - - [2048, 3376, 1, 512, 2048, 2048, 512, 512] + - [12, 11378.0] + - - [2048, 3377, 1, 512, 2048, 2048, 512, 512] + - [12, 11396.0] + - - [2048, 3378, 1, 512, 2048, 2048, 512, 512] + - [28, 11378.0] + - - [2048, 3381, 1, 512, 2048, 2048, 512, 512] + - [12, 11407.0] + - - [2048, 3382, 1, 512, 2048, 2048, 512, 512] + - [12, 11398.0] + - - [2048, 3383, 1, 512, 2048, 2048, 512, 512] + - [12, 11394.0] + - - [2048, 3384, 1, 512, 2048, 2048, 512, 512] + - [12, 11444.0] + - - [2048, 3385, 1, 512, 2048, 2048, 512, 512] + - [12, 11436.0] + - - [2048, 3386, 1, 512, 2048, 2048, 512, 512] + - [12, 11408.0] + - - [2048, 3388, 1, 512, 2048, 2048, 512, 512] + - [12, 11416.0] + - - [2048, 3390, 1, 512, 2048, 2048, 512, 512] + - [12, 11436.0] + - - [2048, 3391, 1, 512, 2048, 2048, 512, 512] + - [28, 11405.0] + - - [2048, 3396, 1, 512, 2048, 2048, 512, 512] + - [28, 11423.0] + - - [2048, 3399, 1, 512, 2048, 2048, 512, 512] + - [12, 11448.0] + - - [2048, 3402, 1, 512, 2048, 2048, 512, 512] + - [12, 11475.0] + - - [2048, 3410, 1, 512, 2048, 2048, 512, 512] + - [12, 11485.0] + - - [2048, 3414, 1, 512, 2048, 2048, 512, 512] + - [12, 11508.0] + - - [2048, 3415, 1, 512, 2048, 2048, 512, 512] + - [12, 11513.0] + - - [2048, 3418, 1, 512, 2048, 2048, 512, 512] + - [12, 11510.0] + - - [2048, 3420, 1, 512, 2048, 2048, 512, 512] + - [12, 11542.0] + - - [2048, 3422, 1, 512, 2048, 2048, 512, 512] + - [12, 11522.0] + - - [2048, 3425, 1, 512, 2048, 2048, 512, 512] + - [28, 11549.0] + - - [2048, 3426, 1, 512, 2048, 2048, 512, 512] + - [12, 11525.0] + - - [2048, 3427, 1, 512, 2048, 2048, 512, 512] + - [12, 11511.0] + - - [2048, 3428, 1, 512, 2048, 2048, 512, 512] + - [28, 11532.0] + - - [2048, 3430, 1, 512, 2048, 2048, 512, 512] + - [28, 11544.0] + - - [2048, 3431, 1, 512, 2048, 2048, 512, 512] + - [12, 11547.0] + - - [2048, 3432, 1, 512, 2048, 2048, 512, 512] + - [12, 11567.0] + - - [2048, 3438, 1, 512, 2048, 2048, 512, 512] + - [12, 11557.0] + - - [2048, 3439, 1, 512, 2048, 2048, 512, 512] + - [12, 11556.0] + - - [2048, 3440, 1, 512, 2048, 2048, 512, 512] + - [12, 11577.0] + - - [2048, 3443, 1, 512, 2048, 2048, 512, 512] + - [12, 11576.0] + - - [2048, 3445, 1, 512, 2048, 2048, 512, 512] + - [28, 11576.0] + - - [2048, 3447, 1, 512, 2048, 2048, 512, 512] + - [12, 11608.0] + - - [2048, 3448, 1, 512, 2048, 2048, 512, 512] + - [12, 11601.0] + - - [2048, 3450, 1, 512, 2048, 2048, 512, 512] + - [12, 11603.0] + - - [2048, 3451, 1, 512, 2048, 2048, 512, 512] + - [12, 11600.0] + - - [2048, 3452, 1, 512, 2048, 2048, 512, 512] + - [12, 11577.0] + - - [2048, 3453, 1, 512, 2048, 2048, 512, 512] + - [12, 11642.0] + - - [2048, 3455, 1, 512, 2048, 2048, 512, 512] + - [12, 11628.0] + - - [2048, 3457, 1, 512, 2048, 2048, 512, 512] + - [28, 11223.0] + - - [2048, 3458, 1, 512, 2048, 2048, 512, 512] + - [12, 11200.0] + - - [2048, 3459, 1, 512, 2048, 2048, 512, 512] + - [28, 11196.0] + - - [2048, 3460, 1, 512, 2048, 2048, 512, 512] + - [28, 11237.0] + - - [2048, 3461, 1, 512, 2048, 2048, 512, 512] + - [12, 11232.0] + - - [2048, 3462, 1, 512, 2048, 2048, 512, 512] + - [28, 11204.0] + - - [2048, 3467, 1, 512, 2048, 2048, 512, 512] + - [28, 11214.0] + - - [2048, 3468, 1, 512, 2048, 2048, 512, 512] + - [28, 11256.0] + - - [2048, 3470, 1, 512, 2048, 2048, 512, 512] + - [12, 11228.0] + - - [2048, 3471, 1, 512, 2048, 2048, 512, 512] + - [28, 11235.0] + - - [2048, 3472, 1, 512, 2048, 2048, 512, 512] + - [12, 11244.0] + - - [2048, 3475, 1, 512, 2048, 2048, 512, 512] + - [12, 11273.0] + - - [2048, 3477, 1, 512, 2048, 2048, 512, 512] + - [12, 11284.0] + - - [2048, 3478, 1, 512, 2048, 2048, 512, 512] + - [12, 11264.0] + - - [2048, 3479, 1, 512, 2048, 2048, 512, 512] + - [12, 11252.0] + - - [2048, 3480, 1, 512, 2048, 2048, 512, 512] + - [12, 11294.0] + - - [2048, 3481, 1, 512, 2048, 2048, 512, 512] + - [28, 11195.0] + - - [2048, 3483, 1, 512, 2048, 2048, 512, 512] + - [12, 11300.0] + - - [2048, 3484, 1, 512, 2048, 2048, 512, 512] + - [12, 11264.0] + - - [2048, 3487, 1, 512, 2048, 2048, 512, 512] + - [12, 11287.0] + - - [2048, 3489, 1, 512, 2048, 2048, 512, 512] + - [28, 11244.0] + - - [2048, 3490, 1, 512, 2048, 2048, 512, 512] + - [28, 11326.0] + - - [2048, 3491, 1, 512, 2048, 2048, 512, 512] + - [12, 11272.0] + - - [2048, 3493, 1, 512, 2048, 2048, 512, 512] + - [28, 11327.0] + - - [2048, 3494, 1, 512, 2048, 2048, 512, 512] + - [12, 11312.0] + - - [2048, 3495, 1, 512, 2048, 2048, 512, 512] + - [12, 11320.0] + - - [2048, 3497, 1, 512, 2048, 2048, 512, 512] + - [12, 11326.0] + - - [2048, 3498, 1, 512, 2048, 2048, 512, 512] + - [12, 11331.0] + - - [2048, 3499, 1, 512, 2048, 2048, 512, 512] + - [12, 11334.0] + - - [2048, 3501, 1, 512, 2048, 2048, 512, 512] + - [12, 11338.0] + - - [2048, 3503, 1, 512, 2048, 2048, 512, 512] + - [28, 11336.0] + - - [2048, 3507, 1, 512, 2048, 2048, 512, 512] + - [12, 11348.0] + - - [2048, 3508, 1, 512, 2048, 2048, 512, 512] + - [12, 11347.0] + - - [2048, 3509, 1, 512, 2048, 2048, 512, 512] + - [12, 11365.0] + - - [2048, 3511, 1, 512, 2048, 2048, 512, 512] + - [12, 11367.0] + - - [2048, 3514, 1, 512, 2048, 2048, 512, 512] + - [12, 11394.0] + - - [2048, 3515, 1, 512, 2048, 2048, 512, 512] + - [12, 11383.0] + - - [2048, 3517, 1, 512, 2048, 2048, 512, 512] + - [28, 11383.0] + - - [2048, 3518, 1, 512, 2048, 2048, 512, 512] + - [12, 11383.0] + - - [2048, 3519, 1, 512, 2048, 2048, 512, 512] + - [12, 11411.0] + - - [2048, 3520, 1, 512, 2048, 2048, 512, 512] + - [12, 11381.0] + - - [2048, 3523, 1, 512, 2048, 2048, 512, 512] + - [12, 11402.0] + - - [2048, 3528, 1, 512, 2048, 2048, 512, 512] + - [28, 11463.0] + - - [2048, 3529, 1, 512, 2048, 2048, 512, 512] + - [12, 11422.0] + - - [2048, 3530, 1, 512, 2048, 2048, 512, 512] + - [12, 11449.0] + - - [2048, 3532, 1, 512, 2048, 2048, 512, 512] + - [28, 11446.0] + - - [2048, 3533, 1, 512, 2048, 2048, 512, 512] + - [12, 11434.0] + - - [2048, 3534, 1, 512, 2048, 2048, 512, 512] + - [12, 11439.0] + - - [2048, 3538, 1, 512, 2048, 2048, 512, 512] + - [12, 11444.0] + - - [2048, 3539, 1, 512, 2048, 2048, 512, 512] + - [28, 11487.0] + - - [2048, 3541, 1, 512, 2048, 2048, 512, 512] + - [12, 11468.0] + - - [2048, 3547, 1, 512, 2048, 2048, 512, 512] + - [12, 11467.0] + - - [2048, 3548, 1, 512, 2048, 2048, 512, 512] + - [12, 11475.0] + - - [2048, 3552, 1, 512, 2048, 2048, 512, 512] + - [12, 11480.0] + - - [2048, 3564, 1, 512, 2048, 2048, 512, 512] + - [12, 11507.0] + - - [2048, 3575, 1, 512, 2048, 2048, 512, 512] + - [12, 11582.0] + - - [2048, 3598, 1, 512, 2048, 2048, 512, 512] + - [12, 11237.0] + - - [2048, 3599, 1, 512, 2048, 2048, 512, 512] + - [12, 11228.0] + - - [2048, 3608, 1, 512, 2048, 2048, 512, 512] + - [12, 11238.0] + - - [2048, 3780, 1, 512, 2048, 2048, 512, 512] + - [12, 11678.0] + - - [2048, 3796, 1, 512, 2048, 2048, 512, 512] + - [12, 11708.0] + - - [2048, 3822, 1, 512, 2048, 2048, 512, 512] + - [12, 11778.0] + - - [2048, 3840, 1, 512, 2048, 2048, 512, 512] + - [12, 11916.0] + - - [2048, 3859, 1, 512, 2048, 2048, 512, 512] + - [28, 11466.0] + - - [2048, 3870, 1, 512, 2048, 2048, 512, 512] + - [12, 11527.0] + - - [2048, 3876, 1, 512, 2048, 2048, 512, 512] + - [12, 11543.0] + - - [2048, 3906, 1, 512, 2048, 2048, 512, 512] + - [12, 11609.0] + - - [2048, 3910, 1, 512, 2048, 2048, 512, 512] + - [12, 11644.0] + - - [2048, 3925, 1, 512, 2048, 2048, 512, 512] + - [12, 11667.0] + - - [2048, 3942, 1, 512, 2048, 2048, 512, 512] + - [12, 11700.0] + - - [2048, 3944, 1, 512, 2048, 2048, 512, 512] + - [12, 11708.0] + - - [2048, 3955, 1, 512, 2048, 2048, 512, 512] + - [12, 11725.0] + - - [2048, 3968, 1, 512, 2048, 2048, 512, 512] + - [12, 11853.0] + - - [2048, 3969, 1, 512, 2048, 2048, 512, 512] + - [12, 11408.0] + - - [2048, 3976, 1, 512, 2048, 2048, 512, 512] + - [28, 11437.0] + - - [2048, 3977, 1, 512, 2048, 2048, 512, 512] + - [12, 11411.0] + - - [2048, 3978, 1, 512, 2048, 2048, 512, 512] + - [12, 11422.0] + - - [2048, 3990, 1, 512, 2048, 2048, 512, 512] + - [12, 11454.0] + - - [2048, 3995, 1, 512, 2048, 2048, 512, 512] + - [12, 11431.0] + - - [2048, 3996, 1, 512, 2048, 2048, 512, 512] + - [12, 11456.0] + - - [2048, 4005, 1, 512, 2048, 2048, 512, 512] + - [12, 11489.0] + - - [2048, 4012, 1, 512, 2048, 2048, 512, 512] + - [12, 11519.0] + - - [2048, 4020, 1, 512, 2048, 2048, 512, 512] + - [12, 11520.0] + - - [2048, 4026, 1, 512, 2048, 2048, 512, 512] + - [12, 11549.0] + - - [2048, 4030, 1, 512, 2048, 2048, 512, 512] + - [12, 11559.0] + - - [2048, 4032, 1, 512, 2048, 2048, 512, 512] + - [12, 11578.0] + - - [33708, 184, 1, 512, 33708, 33708, 512, 512] + - [46, 9386.0] + - - [33708, 208, 1, 512, 33708, 33708, 512, 512] + - [12, 9435.0] + - - [33708, 246, 1, 512, 33708, 33708, 512, 512] + - [28, 11089.0] + - - [33708, 264, 1, 512, 33708, 33708, 512, 512] + - [27, 8256.0] + - - [33708, 465, 1, 512, 33708, 33708, 512, 512] + - [12, 10867.0] + - - [33708, 468, 1, 512, 33708, 33708, 512, 512] + - [12, 10941.0] + - - [33708, 493, 1, 512, 33708, 33708, 512, 512] + - [28, 11502.0] + - - [33708, 540, 1, 512, 33708, 33708, 512, 512] + - [12, 10179.0] + - - [33708, 550, 1, 512, 33708, 33708, 512, 512] + - [28, 10380.0] + - - [33708, 560, 1, 512, 33708, 33708, 512, 512] + - [28, 10566.0] + - - [33708, 644, 1, 512, 33708, 33708, 512, 512] + - [12, 10107.0] + - - [33708, 714, 1, 512, 33708, 33708, 512, 512] + - [47, 11176.0] + - - [33708, 720, 1, 512, 33708, 33708, 512, 512] + - [12, 11275.0] + - - [33708, 781, 1, 512, 33708, 33708, 512, 512] + - [12, 10542.0] + - - [33708, 936, 1, 512, 33708, 33708, 512, 512] + - [12, 11080.0] + - - [33708, 980, 1, 512, 33708, 33708, 512, 512] + - [12, 11583.0] + - - [33708, 1232, 1, 512, 33708, 33708, 512, 512] + - [12, 11677.0] + - - [33708, 1290, 1, 512, 33708, 33708, 512, 512] + - [12, 11136.0] + - - [33708, 1350, 1, 512, 33708, 33708, 512, 512] + - [12, 11635.0] + - - [33708, 1424, 1, 512, 33708, 33708, 512, 512] + - [12, 11284.0] + - - [33708, 1458, 1, 512, 33708, 33708, 512, 512] + - [12, 11541.0] + - - [33708, 1462, 1, 512, 33708, 33708, 512, 512] + - [12, 11582.0] + - - [33708, 1520, 1, 512, 33708, 33708, 512, 512] + - [12, 12025.0] + - - [33708, 1596, 1, 512, 33708, 33708, 512, 512] + - [28, 11679.0] + - - [33708, 1599, 1, 512, 33708, 33708, 512, 512] + - [12, 11703.0] + - - [33708, 1615, 1, 512, 33708, 33708, 512, 512] + - [12, 11818.0] + - - [33708, 1680, 1, 512, 33708, 33708, 512, 512] + - [12, 11426.0] + - - [33708, 1917, 1, 512, 33708, 33708, 512, 512] + - [12, 12142.0] + - - [33708, 2205, 1, 512, 33708, 33708, 512, 512] + - [28, 11666.0] + - - [33708, 2418, 1, 512, 33708, 33708, 512, 512] + - [12, 12124.0] + - - [33708, 3776, 1, 512, 33708, 33708, 512, 512] + - [14, 12019.0] + - - [33708, 3780, 1, 512, 33708, 33708, 512, 512] + - [12, 12032.0] + - - [33708, 3796, 1, 512, 33708, 33708, 512, 512] + - [12, 12080.0] + - - [33708, 3822, 1, 512, 33708, 33708, 512, 512] + - [12, 12161.0] + - - [33708, 3835, 1, 512, 33708, 33708, 512, 512] + - [12, 12201.0] + - - [33708, 3840, 1, 512, 33708, 33708, 512, 512] + - [6, 12224.0] + - - [33708, 3859, 1, 512, 33708, 33708, 512, 512] + - [12, 11887.0] + - - [33708, 3870, 1, 512, 33708, 33708, 512, 512] + - [12, 11927.0] + - - [33708, 3876, 1, 512, 33708, 33708, 512, 512] + - [12, 11950.0] + - - [33708, 3906, 1, 512, 33708, 33708, 512, 512] + - [12, 12027.0] + - - [33708, 3910, 1, 512, 33708, 33708, 512, 512] + - [12, 12035.0] + - - [33708, 3925, 1, 512, 33708, 33708, 512, 512] + - [47, 12083.0] + - - [33708, 3942, 1, 512, 33708, 33708, 512, 512] + - [28, 12133.0] + - - [33708, 3944, 1, 512, 33708, 33708, 512, 512] + - [12, 12144.0] + - - [33708, 3955, 1, 512, 33708, 33708, 512, 512] + - [12, 12175.0] + - - [33708, 3968, 1, 512, 33708, 33708, 512, 512] + - [47, 12215.0] + - - [33708, 3976, 1, 512, 33708, 33708, 512, 512] + - [14, 11877.0] + - - [33708, 3977, 1, 512, 33708, 33708, 512, 512] + - [12, 11868.0] + - - [33708, 3978, 1, 512, 33708, 33708, 512, 512] + - [49, 11885.0] + - - [33708, 3990, 1, 512, 33708, 33708, 512, 512] + - [6, 11938.0] + - - [33708, 3996, 1, 512, 33708, 33708, 512, 512] + - [6, 11957.0] + - - [33708, 3999, 1, 512, 33708, 33708, 512, 512] + - [14, 11964.0] + - - [33708, 4005, 1, 512, 33708, 33708, 512, 512] + - [6, 11965.0] + - - [33708, 4012, 1, 512, 33708, 33708, 512, 512] + - [28, 9923.0] + - - [33708, 4020, 1, 512, 33708, 33708, 512, 512] + - [12, 9721.0] + - - [33708, 4026, 1, 512, 33708, 33708, 512, 512] + - [12, 9802.0] + - - [33708, 4030, 1, 512, 33708, 33708, 512, 512] + - [9, 10796.0] + - - [33708, 4032, 1, 512, 33708, 33708, 512, 512] + - [14, 10072.0] + - - [3072, 512, 1, 3072, 3072, 3072, 3072, 3072] + - [28, 10567.0] + - - [511, 8192, 1, 8192, 511, 511, 8192, 8192] + - [32, 6989.0] + - - [4096, 4096, 1, 4096, 4096, 4096, 4096, 4096] + - [38, 6821.0] + - - [8192, 8193, 1, 8192, 8192, 8192, 8192, 8192] + - [14, 10593.0] + - - [3072, 3072, 1, 3071, 3072, 3072, 3071, 3071] + - [22, 10584.0] + - - [8192, 8192, 1, 8193, 8192, 8192, 8193, 8193] + - [22, 12186.0] + - - [7681, 8192, 1, 8192, 7681, 7681, 8192, 8192] + - [14, 10328.0] + - - [7680, 8192, 1, 8193, 7680, 7680, 8193, 8193] + - [22, 12096.0] + - - [513, 4096, 1, 4096, 513, 513, 4096, 4096] + - [49, 8065.0] + - - [3073, 512, 1, 3072, 3073, 3073, 3072, 3072] + - [12, 10233.0] + - - [7680, 8192, 1, 8192, 7680, 7680, 8192, 8192] + - [14, 10699.0] + - - [4096, 4096, 1, 4097, 4096, 4096, 4097, 4097] + - [22, 11467.0] + - - [8192, 8191, 1, 8192, 8192, 8192, 8192, 8192] + - [14, 10787.0] + - - [8192, 512, 1, 8193, 8192, 8192, 8193, 8193] + - [40, 8611.0] + - - [2880, 3071, 1, 3072, 2880, 2880, 3072, 3072] + - [50, 8369.0] + - - [2880, 3072, 1, 3072, 2880, 2880, 3072, 3072] + - [50, 8278.0] + - - [4096, 511, 1, 4096, 4096, 4096, 4096, 4096] + - [28, 8280.0] + - - [512, 3072, 1, 3072, 512, 512, 3072, 3072] + - [28, 10548.0] + - - [512, 8191, 1, 8192, 512, 512, 8192, 8192] + - [50, 7350.0] + - - [4096, 4095, 1, 4096, 4096, 4096, 4096, 4096] + - [47, 7183.0] + - - [8192, 511, 1, 8192, 8192, 8192, 8192, 8192] + - [51, 6168.0] + - - [8192, 512, 1, 8192, 8192, 8192, 8192, 8192] + - [33, 6463.0] + - - [511, 3072, 1, 3072, 511, 511, 3072, 3072] + - [47, 10244.0] + - - [7680, 8193, 1, 8192, 7680, 7680, 8192, 8192] + - [14, 10466.0] + - - [2048, 2048, 1, 2048, 2048, 2048, 2048, 2048] + - [12, 11395.0] + - - [3072, 512, 1, 3073, 3072, 3072, 3073, 3073] + - [17, 11015.0] + - - [513, 8192, 1, 8192, 513, 513, 8192, 8192] + - [47, 6723.0] + - - [7679, 8192, 1, 8192, 7679, 7679, 8192, 8192] + - [49, 10647.0] + - - [3840, 4096, 1, 4097, 3840, 3840, 4097, 4097] + - [22, 11527.0] + - - [512, 3072, 1, 3071, 512, 512, 3071, 3071] + - [17, 10861.0] + - - [7680, 8192, 1, 8191, 7680, 7680, 8191, 8191] + - [22, 12149.0] + - - [3072, 511, 1, 3072, 3072, 3072, 3072, 3072] + - [28, 10379.0] + - - [8193, 8192, 1, 8192, 8193, 8193, 8192, 8192] + - [14, 10248.0] + - - [512, 4096, 1, 4095, 512, 512, 4095, 4095] + - [20, 10029.0] + - - [512, 3071, 1, 3072, 512, 512, 3072, 3072] + - [47, 10441.0] + - - [3073, 3072, 1, 3072, 3073, 3073, 3072, 3072] + - [50, 7909.0] + - - [512, 3073, 1, 3072, 512, 512, 3072, 3072] + - [28, 10044.0] + - - [4096, 4096, 1, 4095, 4096, 4096, 4095, 4095] + - [6, 11468.0] + - - [1920, 2048, 1, 2047, 1920, 1920, 2047, 2047] + - [17, 11962.0] + - - [1920, 2049, 1, 2048, 1920, 1920, 2048, 2048] + - [12, 10736.0] + - - [512, 8192, 1, 8191, 512, 512, 8191, 8191] + - [6, 8701.0] + - - [3840, 4096, 1, 4096, 3840, 3840, 4096, 4096] + - [50, 8974.0] + - - [8191, 512, 1, 8192, 8191, 8191, 8192, 8192] + - [51, 6298.0] + - - [2881, 3072, 1, 3072, 2881, 2881, 3072, 3072] + - [50, 8574.0] + - - [512, 4096, 1, 4096, 512, 512, 4096, 4096] + - [48, 8795.0] + - - [3841, 4096, 1, 4096, 3841, 3841, 4096, 4096] + - [38, 10285.0] + - - [2880, 3072, 1, 3073, 2880, 2880, 3073, 3073] + - [14, 10322.0] + - - [4095, 512, 1, 4096, 4095, 4095, 4096, 4096] + - [14, 8458.0] + - - [1919, 2048, 1, 2048, 1919, 1919, 2048, 2048] + - [12, 11552.0] + - - [1920, 2048, 1, 2048, 1920, 1920, 2048, 2048] + - [47, 11782.0] + - - [8192, 8192, 1, 8192, 8192, 8192, 8192, 8192] + - [14, 10839.0] + - - [511, 4096, 1, 4096, 511, 511, 4096, 4096] + - [29, 8354.0] + - - [8192, 513, 1, 8192, 8192, 8192, 8192, 8192] + - [41, 5886.0] + - - [513, 3072, 1, 3072, 513, 513, 3072, 3072] + - [30, 9180.0] + - - [7680, 8191, 1, 8192, 7680, 7680, 8192, 8192] + - [14, 10353.0] + - - [512, 4097, 1, 4096, 512, 512, 4096, 4096] + - [49, 8530.0] + - - [2047, 2048, 1, 2048, 2047, 2047, 2048, 2048] + - [12, 11353.0] + - - [2049, 2048, 1, 2048, 2049, 2049, 2048, 2048] + - [49, 11003.0] + - - [3840, 4095, 1, 4096, 3840, 3840, 4096, 4096] + - [47, 8442.0] + - - [2880, 3072, 1, 3071, 2880, 2880, 3071, 3071] + - [22, 10761.0] + - - [3072, 3072, 1, 3073, 3072, 3072, 3073, 3073] + - [4, 11942.0] + - - [2880, 3073, 1, 3072, 2880, 2880, 3072, 3072] + - [50, 7996.0] + - - [4096, 513, 1, 4096, 4096, 4096, 4096, 4096] + - [41, 7444.0] + - - [4097, 512, 1, 4096, 4097, 4097, 4096, 4096] + - [32, 8064.0] + - - [8192, 512, 1, 8191, 8192, 8192, 8191, 8191] + - [22, 8543.0] + - - [1921, 2048, 1, 2048, 1921, 1921, 2048, 2048] + - [12, 10687.0] + - - [512, 3072, 1, 3073, 512, 512, 3073, 3073] + - [17, 11037.0] + - - [2048, 2049, 1, 2048, 2048, 2048, 2048, 2048] + - [28, 10772.0] + - - [3072, 512, 1, 3071, 3072, 3072, 3071, 3071] + - [35, 11061.0] + - - [3071, 3072, 1, 3072, 3071, 3071, 3072, 3072] + - [32, 8181.0] + - - [3840, 4097, 1, 4096, 3840, 3840, 4096, 4096] + - [50, 8773.0] + - - [2048, 2047, 1, 2048, 2048, 2048, 2048, 2048] + - [28, 11240.0] + - - [2879, 3072, 1, 3072, 2879, 2879, 3072, 3072] + - [50, 8579.0] + - - [3072, 513, 1, 3072, 3072, 3072, 3072, 3072] + - [47, 8954.0] + - - [512, 4095, 1, 4096, 512, 512, 4096, 4096] + - [29, 8561.0] + - - [3071, 512, 1, 3072, 3071, 3071, 3072, 3072] + - [28, 10425.0] + - - [4096, 512, 1, 4096, 4096, 4096, 4096, 4096] + - [28, 8244.0] + - - [4097, 4096, 1, 4096, 4097, 4097, 4096, 4096] + - [47, 7921.0] + - - [2048, 2048, 1, 2047, 2048, 2048, 2047, 2047] + - [25, 11615.0] + - - [3839, 4096, 1, 4096, 3839, 3839, 4096, 4096] + - [50, 8854.0] + - - [512, 4096, 1, 4097, 512, 512, 4097, 4097] + - [38, 10051.0] + - - [3072, 3073, 1, 3072, 3072, 3072, 3072, 3072] + - [50, 7542.0] + - - [2048, 2048, 1, 2049, 2048, 2048, 2049, 2049] + - [35, 11496.0] + - - [8191, 8192, 1, 8192, 8191, 8191, 8192, 8192] + - [14, 10800.0] + - - [3072, 3071, 1, 3072, 3072, 3072, 3072, 3072] + - [50, 7785.0] + - - [4096, 512, 1, 4097, 4096, 4096, 4097, 4097] + - [20, 9982.0] + - - [3840, 4096, 1, 4095, 3840, 3840, 4095, 4095] + - [22, 11544.0] + - - [1920, 2047, 1, 2048, 1920, 1920, 2048, 2048] + - [47, 11504.0] + - - [8192, 8192, 1, 8191, 8192, 8192, 8191, 8191] + - [30, 12168.0] + - - [3072, 3072, 1, 3072, 3072, 3072, 3072, 3072] + - [32, 7855.0] + - - [512, 8193, 1, 8192, 512, 512, 8192, 8192] + - [50, 7539.0] + - - [4096, 512, 1, 4095, 4096, 4096, 4095, 4095] + - [20, 9947.0] + - - [8193, 512, 1, 8192, 8193, 8193, 8192, 8192] + - [33, 6444.0] + - - [4095, 4096, 1, 4096, 4095, 4095, 4096, 4096] + - [4, 6938.0] + - - [4096, 4097, 1, 4096, 4096, 4096, 4096, 4096] + - [29, 7267.0] + - - [512, 8192, 1, 8192, 512, 512, 8192, 8192] + - [32, 7354.0] + - - [512, 8192, 1, 8193, 512, 512, 8193, 8193] + - [6, 8537.0] + - - [1920, 2048, 1, 2049, 1920, 1920, 2049, 2049] + - [35, 11814.0] + - - [479, 3072, 1, 3072, 479, 479, 3072, 3072] + - [28, 9985.0] + - - [479, 4096, 1, 4096, 479, 479, 4096, 4096] + - [48, 8766.0] + - - [479, 8192, 1, 8192, 479, 479, 8192, 8192] + - [32, 7074.0] + - - [480, 3072, 1, 3071, 480, 480, 3071, 3071] + - [17, 10183.0] + - - [480, 3072, 1, 3073, 480, 480, 3073, 3073] + - [1, 10376.0] + - - [480, 3073, 1, 3072, 480, 480, 3072, 3072] + - [28, 10027.0] + - - [480, 4095, 1, 4096, 480, 480, 4096, 4096] + - [47, 8626.0] + - - [480, 4096, 1, 4095, 480, 480, 4095, 4095] + - [20, 9776.0] + - - [480, 4096, 1, 4097, 480, 480, 4097, 4097] + - [38, 9768.0] + - - [480, 4097, 1, 4096, 480, 480, 4096, 4096] + - [29, 8816.0] + - - [480, 8191, 1, 8192, 480, 480, 8192, 8192] + - [33, 6693.0] + - - [480, 8192, 1, 8191, 480, 480, 8191, 8191] + - [14, 8040.0] + - - [480, 8192, 1, 8193, 480, 480, 8193, 8193] + - [6, 8065.0] + - - [480, 8193, 1, 8192, 480, 480, 8192, 8192] + - [32, 6928.0] + - - [481, 3072, 1, 3072, 481, 481, 3072, 3072] + - [28, 9778.0] + - - [481, 4096, 1, 4096, 481, 481, 4096, 4096] + - [48, 8839.0] + - - [481, 8192, 1, 8192, 481, 481, 8192, 8192] + - [50, 6932.0] + - - [3072, 479, 1, 3072, 3072, 3072, 3072, 3072] + - [28, 9609.0] + - - [3072, 480, 1, 3071, 3072, 3072, 3071, 3071] + - [17, 10387.0] + - - [3072, 480, 1, 3073, 3072, 3072, 3073, 3073] + - [25, 10334.0] + - - [3072, 481, 1, 3072, 3072, 3072, 3072, 3072] + - [47, 10093.0] + - - [3073, 480, 1, 3072, 3073, 3073, 3072, 3072] + - [47, 10036.0] + - - [480, 3072, 1, 3072, 480, 480, 3072, 3072] + - [28, 10026.0] + - - [480, 4096, 1, 4096, 480, 480, 4096, 4096] + - [47, 8801.0] + - - [480, 8192, 1, 8192, 480, 480, 8192, 8192] + - [32, 6804.0] + - - [3072, 480, 1, 3072, 3072, 3072, 3072, 3072] + - [28, 9581.0] + - - [4096, 480, 1, 4096, 4096, 4096, 4096, 4096] + - [32, 7823.0] + - - [8192, 480, 1, 8192, 8192, 8192, 8192, 8192] + - [33, 6310.0] + - - [1024, 3840, 1, 1024, 1024, 1024, 1024, 1024] + - [14, 11050.0] + - - [1024, 3840, 1, 4096, 1024, 1024, 4096, 4096] + - [50, 8725.0] + - - [1024, 3968, 1, 1024, 1024, 1024, 1024, 1024] + - [12, 10486.0] + - - [1024, 3968, 1, 4096, 1024, 1024, 4096, 4096] + - [50, 8335.0] + - - [1024, 7200, 1, 1024, 1024, 1024, 1024, 1024] + - [12, 10795.0] + - - [1024, 7200, 1, 4096, 1024, 1024, 4096, 4096] + - [32, 7641.0] + - - [1024, 8160, 1, 1024, 1024, 1024, 1024, 1024] + - [28, 11058.0] + - - [1024, 8160, 1, 4096, 1024, 1024, 4096, 4096] + - [28, 8182.0] + - - [1024, 9520, 1, 1024, 1024, 1024, 1024, 1024] + - [12, 10407.0] + - - [1024, 9520, 1, 4096, 1024, 1024, 4096, 4096] + - [28, 8243.0] + - - [1024, 10200, 1, 1024, 1024, 1024, 1024, 1024] + - [12, 9389.0] + - - [1024, 10200, 1, 4096, 1024, 1024, 4096, 4096] + - [28, 8675.0] + - - [4096, 3840, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 8698.0] + - - [4096, 3968, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 8624.0] + - - [4096, 7200, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 8963.0] + - - [4096, 8160, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 9687.0] + - - [4096, 9520, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 9772.0] + - - [4096, 10200, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 9714.0] + - - [42720, 3968, 1, 1024, 42720, 42720, 1024, 1024] + - [28, 12248.0] + - - [42720, 7200, 1, 1024, 42720, 42720, 1024, 1024] + - [47, 12116.0] + - - [42720, 9520, 1, 1024, 42720, 42720, 1024, 1024] + - [47, 12197.0] + - - [2048, 960, 1, 2048, 2048, 2048, 2048, 2048] + - [12, 9980.0] + - - [2048, 960, 1, 74, 2048, 2048, 74, 74] + - [0, 8390.0] + - - [1600, 1024, 1, 960, 1600, 1600, 960, 960] + - [35, 9849.0] + - - [2048, 2048, 1, 960, 2048, 2048, 960, 960] + - [17, 11981.0] + - - [4096, 1024, 1, 257, 4096, 4096, 257, 257] + - [17, 11060.0] + - - [10240, 8976, 1, 256, 10240, 10240, 256, 256] + - [12, 11986.0] + - - [1024, 1600, 1, 1024, 1024, 1024, 1024, 1024] + - [11, 9635.0] + - - [1024, 1600, 1, 560, 1024, 1024, 560, 560] + - [34, 10548.0] + - - [10496, 8976, 1, 256, 10496, 10496, 256, 256] + - [12, 11990.0] + - - [11264, 8976, 1, 256, 11264, 11264, 256, 256] + - [12, 11999.0] + - - [11776, 8976, 1, 256, 11776, 11776, 256, 256] + - [20, 11998.0] + - - [12544, 8976, 1, 256, 12544, 12544, 256, 256] + - [12, 12007.0] + - - [1280, 8976, 1, 256, 1280, 1280, 256, 256] + - [20, 11429.0] + - - [13312, 8976, 1, 256, 13312, 13312, 256, 256] + - [12, 12009.0] + - - [13568, 8976, 1, 256, 13568, 13568, 256, 256] + - [12, 12012.0] + - - [13824, 8976, 1, 256, 13824, 13824, 256, 256] + - [12, 12018.0] + - - [15104, 8976, 1, 256, 15104, 15104, 256, 256] + - [20, 12024.0] + - - [15360, 8976, 1, 256, 15360, 15360, 256, 256] + - [12, 12027.0] + - - [15872, 8976, 1, 256, 15872, 15872, 256, 256] + - [20, 12023.0] + - - [16128, 8976, 1, 256, 16128, 16128, 256, 256] + - [12, 12026.0] + - - [17152, 8976, 1, 256, 17152, 17152, 256, 256] + - [20, 12028.0] + - - [1792, 8976, 1, 256, 1792, 1792, 256, 256] + - [20, 11667.0] + - - [18176, 8976, 1, 256, 18176, 18176, 256, 256] + - [20, 12031.0] + - - [18688, 8976, 1, 256, 18688, 18688, 256, 256] + - [20, 12034.0] + - - [18944, 8976, 1, 256, 18944, 18944, 256, 256] + - [20, 12033.0] + - - [19712, 8976, 1, 256, 19712, 19712, 256, 256] + - [20, 12031.0] + - - [19968, 8976, 1, 256, 19968, 19968, 256, 256] + - [12, 12033.0] + - - [20480, 8976, 1, 256, 20480, 20480, 256, 256] + - [12, 12027.0] + - - [2048, 1536, 1, 512, 2048, 2048, 512, 512] + - [12, 10982.0] + - - [2048, 1536, 1, 768, 2048, 2048, 768, 768] + - [30, 11308.0] + - - [2048, 684, 1, 512, 2048, 2048, 512, 512] + - [28, 9188.0] + - - [2048, 684, 1, 768, 2048, 2048, 768, 768] + - [28, 9427.0] + - - [2048, 8976, 1, 256, 2048, 2048, 256, 256] + - [20, 11643.0] + - - [20992, 8976, 1, 256, 20992, 20992, 256, 256] + - [12, 12035.0] + - - [21248, 8976, 1, 256, 21248, 21248, 256, 256] + - [12, 12035.0] + - - [2304, 8976, 1, 256, 2304, 2304, 256, 256] + - [28, 11778.0] + - - [23552, 8976, 1, 256, 23552, 23552, 256, 256] + - [12, 12039.0] + - - [2560, 8976, 1, 256, 2560, 2560, 256, 256] + - [28, 11791.0] + - - [256, 10496, 1, 1024, 256, 256, 1024, 1024] + - [28, 10674.0] + - - [256, 11264, 1, 1024, 256, 256, 1024, 1024] + - [12, 11473.0] + - - [256, 11520, 1, 1024, 256, 256, 1024, 1024] + - [12, 11714.0] + - - [256, 11776, 1, 1024, 256, 256, 1024, 1024] + - [49, 11029.0] + - - [256, 12544, 1, 1024, 256, 256, 1024, 1024] + - [49, 11672.0] + - - [256, 13312, 1, 1024, 256, 256, 1024, 1024] + - [47, 11186.0] + - - [256, 14336, 1, 1024, 256, 256, 1024, 1024] + - [49, 11267.0] + - - [256, 14592, 1, 1024, 256, 256, 1024, 1024] + - [49, 11413.0] + - - [256, 14848, 1, 1024, 256, 256, 1024, 1024] + - [30, 11579.0] + - - [256, 15104, 1, 1024, 256, 256, 1024, 1024] + - [49, 11753.0] + - - [256, 16128, 1, 1024, 256, 256, 1024, 1024] + - [12, 11545.0] + - - [256, 18176, 1, 1024, 256, 256, 1024, 1024] + - [47, 11327.0] + - - [256, 18944, 1, 1024, 256, 256, 1024, 1024] + - [47, 11725.0] + - - [256, 19200, 1, 1024, 256, 256, 1024, 1024] + - [47, 11865.0] + - - [256, 20480, 1, 1024, 256, 256, 1024, 1024] + - [49, 11875.0] + - - [256, 20992, 1, 1024, 256, 256, 1024, 1024] + - [47, 11519.0] + - - [256, 21248, 1, 1024, 256, 256, 1024, 1024] + - [28, 11605.0] + - - [256, 21504, 1, 1024, 256, 256, 1024, 1024] + - [47, 11711.0] + - - [256, 22016, 1, 1024, 256, 256, 1024, 1024] + - [49, 11550.0] + - - [256, 22344, 1, 1024, 256, 256, 1024, 1024] + - [49, 11589.0] + - - [256, 23296, 1, 1024, 256, 256, 1024, 1024] + - [47, 11524.0] + - - [256, 23552, 1, 1024, 256, 256, 1024, 1024] + - [47, 11611.0] + - - [256, 31488, 1, 1024, 256, 256, 1024, 1024] + - [47, 11809.0] + - - [256, 33536, 1, 1024, 256, 256, 1024, 1024] + - [47, 11729.0] + - - [256, 44505, 1, 1024, 256, 256, 1024, 1024] + - [47, 11919.0] + - - [256, 4608, 1, 1024, 256, 256, 1024, 1024] + - [30, 10268.0] + - - [256, 4864, 1, 1024, 256, 256, 1024, 1024] + - [49, 10977.0] + - - [256, 5376, 1, 1024, 256, 256, 1024, 1024] + - [47, 9607.0] + - - [256, 5888, 1, 1024, 256, 256, 1024, 1024] + - [28, 10453.0] + - - [256, 6144, 1, 1024, 256, 256, 1024, 1024] + - [47, 10871.0] + - - [256, 6400, 1, 1024, 256, 256, 1024, 1024] + - [47, 11260.0] + - - [256, 6656, 1, 1024, 256, 256, 1024, 1024] + - [30, 10224.0] + - - [256, 7168, 1, 1024, 256, 256, 1024, 1024] + - [30, 10948.0] + - - [256, 7424, 1, 1024, 256, 256, 1024, 1024] + - [49, 11236.0] + - - [256, 7936, 1, 1024, 256, 256, 1024, 1024] + - [47, 10348.0] + - - [256, 8192, 1, 1024, 256, 256, 1024, 1024] + - [28, 10641.0] + - - [256, 8448, 1, 1024, 256, 256, 1024, 1024] + - [47, 10963.0] + - - [256, 8960, 1, 1024, 256, 256, 1024, 1024] + - [47, 11561.0] + - - [256, 9984, 1, 1024, 256, 256, 1024, 1024] + - [49, 11497.0] + - - [2816, 8976, 1, 256, 2816, 2816, 256, 256] + - [28, 11755.0] + - - [28672, 8976, 1, 256, 28672, 28672, 256, 256] + - [12, 12036.0] + - - [3072, 8976, 1, 256, 3072, 3072, 256, 256] + - [12, 11771.0] + - - [31488, 8976, 1, 256, 31488, 31488, 256, 256] + - [20, 12039.0] + - - [3328, 8976, 1, 256, 3328, 3328, 256, 256] + - [12, 11817.0] + - - [33536, 8976, 1, 256, 33536, 33536, 256, 256] + - [12, 12041.0] + - - [3840, 8976, 1, 256, 3840, 3840, 256, 256] + - [12, 11867.0] + - - [4096, 8976, 1, 256, 4096, 4096, 256, 256] + - [12, 11859.0] + - - [4352, 8976, 1, 256, 4352, 4352, 256, 256] + - [12, 11896.0] + - - [44505, 8976, 1, 256, 44505, 44505, 256, 256] + - [20, 12013.0] + - - [4608, 8976, 1, 256, 4608, 4608, 256, 256] + - [12, 11910.0] + - - [4864, 8976, 1, 256, 4864, 4864, 256, 256] + - [12, 11911.0] + - - [5120, 8976, 1, 256, 5120, 5120, 256, 256] + - [4, 11824.0] + - - [5376, 8976, 1, 256, 5376, 5376, 256, 256] + - [12, 11919.0] + - - [5632, 8976, 1, 256, 5632, 5632, 256, 256] + - [12, 11918.0] + - - [5888, 8976, 1, 256, 5888, 5888, 256, 256] + - [12, 11928.0] + - - [6144, 8976, 1, 256, 6144, 6144, 256, 256] + - [12, 11936.0] + - - [6400, 8976, 1, 256, 6400, 6400, 256, 256] + - [12, 11941.0] + - - [684, 8976, 1, 256, 684, 684, 256, 256] + - [20, 9814.0] + - - [7168, 8976, 1, 256, 7168, 7168, 256, 256] + - [12, 11965.0] + - - [7936, 8976, 1, 256, 7936, 7936, 256, 256] + - [12, 11971.0] + - - [8192, 8976, 1, 256, 8192, 8192, 256, 256] + - [12, 11950.0] + - - [8448, 8976, 1, 256, 8448, 8448, 256, 256] + - [12, 11976.0] + - - [8960, 8976, 1, 256, 8960, 8960, 256, 256] + - [12, 11982.0] + - - [9472, 8976, 1, 256, 9472, 9472, 256, 256] + - [12, 11987.0] + - - [9728, 8976, 1, 256, 9728, 9728, 256, 256] + - [12, 11993.0] + - - [9984, 8976, 1, 256, 9984, 9984, 256, 256] + - [12, 11987.0] + - - [512, 32768, 1, 13, 512, 512, 13, 13] + - [1, 3272.0] + - - [256, 32768, 1, 512, 256, 256, 512, 512] + - [47, 11757.0] + - - [128, 32768, 1, 512, 128, 128, 512, 512] + - [28, 11450.0] + - - [1024, 32768, 1, 479, 1024, 1024, 479, 479] + - [25, 12199.0] + - - [1024, 32768, 1, 1024, 1024, 1024, 1024, 1024] + - [12, 12151.0] + - - [512, 32768, 1, 1024, 512, 512, 1024, 1024] + - [30, 12077.0] + - - [1023, 2048, 1, 4096, 1023, 1023, 4096, 4096] + - [12, 10562.0] + - - [1025, 2048, 1, 4096, 1025, 1025, 4096, 4096] + - [14, 9835.0] + - - [1024, 2047, 1, 4096, 1024, 1024, 4096, 4096] + - [28, 10485.0] + - - [1024, 2049, 1, 4096, 1024, 1024, 4096, 4096] + - [12, 10429.0] + - - [1024, 2048, 1, 4095, 1024, 1024, 4095, 4095] + - [17, 11107.0] + - - [1024, 2048, 1, 4097, 1024, 1024, 4097, 4097] + - [17, 11096.0] + - - [1023, 3072, 1, 1024, 1023, 1023, 1024, 1024] + - [14, 11287.0] + - - [1025, 3072, 1, 1024, 1025, 1025, 1024, 1024] + - [47, 10249.0] + - - [1024, 3071, 1, 1024, 1024, 1024, 1024, 1024] + - [30, 11226.0] + - - [1024, 3073, 1, 1024, 1024, 1024, 1024, 1024] + - [28, 11138.0] + - - [1024, 3072, 1, 1023, 1024, 1024, 1023, 1023] + - [1, 11544.0] + - - [1024, 3072, 1, 1025, 1024, 1024, 1025, 1025] + - [17, 11503.0] + - - [3071, 512, 1, 1024, 3071, 3071, 1024, 1024] + - [47, 10575.0] + - - [3073, 512, 1, 1024, 3073, 3073, 1024, 1024] + - [47, 10521.0] + - - [3072, 511, 1, 1024, 3072, 3072, 1024, 1024] + - [28, 10526.0] + - - [3072, 513, 1, 1024, 3072, 3072, 1024, 1024] + - [12, 9012.0] + - - [3072, 512, 1, 1023, 3072, 3072, 1023, 1023] + - [35, 11152.0] + - - [3072, 512, 1, 1025, 3072, 3072, 1025, 1025] + - [35, 11039.0] + - - [128, 32768, 1, 256, 128, 128, 256, 256] + - [12, 11272.0] + - - [1024, 4096, 1, 480, 1024, 1024, 480, 480] + - [1, 11848.0] + - - [512, 4096, 1, 1024, 512, 512, 1024, 1024] + - [28, 10609.0] + - - [512, 55296, 1, 13, 512, 512, 13, 13] + - [0, 3555.0] + - - [256, 55296, 1, 512, 256, 256, 512, 512] + - [47, 11877.0] + - - [128, 55296, 1, 256, 128, 128, 256, 256] + - [28, 11390.0] + - - [1024, 6912, 1, 480, 1024, 1024, 480, 480] + - [1, 12010.0] + - - [1024, 6912, 1, 1024, 1024, 1024, 1024, 1024] + - [14, 11858.0] + - - [512, 6912, 1, 1024, 512, 512, 1024, 1024] + - [28, 11548.0] + - - [256, 6912, 1, 512, 256, 256, 512, 512] + - [30, 10198.0] + - - [1151, 1152, 1, 1152, 1151, 1151, 1152, 1152] + - [20, 9185.0] + - - [1153, 1152, 1, 1152, 1153, 1153, 1152, 1152] + - [44, 9309.0] + - - [1152, 1151, 1, 1152, 1152, 1152, 1152, 1152] + - [4, 9213.0] + - - [1152, 1153, 1, 1152, 1152, 1152, 1152, 1152] + - [44, 9199.0] + - - [1152, 1152, 1, 1151, 1152, 1152, 1151, 1151] + - [16, 9698.0] + - - [1152, 1152, 1, 1153, 1152, 1152, 1153, 1153] + - [16, 9572.0] + - - [1535, 1536, 1, 1536, 1535, 1535, 1536, 1536] + - [14, 10768.0] + - - [1537, 1536, 1, 1536, 1537, 1537, 1536, 1536] + - [49, 10799.0] + - - [1536, 1535, 1, 1536, 1536, 1536, 1536, 1536] + - [30, 10748.0] + - - [1536, 1537, 1, 1536, 1536, 1536, 1536, 1536] + - [28, 10533.0] + - - [1536, 1536, 1, 1535, 1536, 1536, 1535, 1535] + - [17, 10917.0] + - - [1536, 1536, 1, 1537, 1536, 1536, 1537, 1537] + - [22, 10925.0] + - - [1919, 1920, 1, 1920, 1919, 1919, 1920, 1920] + - [25, 11288.0] + - - [1921, 1920, 1, 1920, 1921, 1921, 1920, 1920] + - [38, 11186.0] + - - [1920, 1919, 1, 1920, 1920, 1920, 1920, 1920] + - [38, 11215.0] + - - [1920, 1921, 1, 1920, 1920, 1920, 1920, 1920] + - [25, 11242.0] + - - [1920, 1920, 1, 1919, 1920, 1920, 1919, 1919] + - [17, 11551.0] + - - [1920, 1920, 1, 1921, 1920, 1920, 1921, 1921] + - [35, 11529.0] + - - [2303, 2304, 1, 2304, 2303, 2303, 2304, 2304] + - [28, 11532.0] + - - [2305, 2304, 1, 2304, 2305, 2305, 2304, 2304] + - [30, 11133.0] + - - [2304, 2303, 1, 2304, 2304, 2304, 2304, 2304] + - [12, 11532.0] + - - [2304, 2305, 1, 2304, 2304, 2304, 2304, 2304] + - [30, 11122.0] + - - [2304, 2304, 1, 2303, 2304, 2304, 2303, 2303] + - [35, 11818.0] + - - [2304, 2304, 1, 2305, 2304, 2304, 2305, 2305] + - [35, 11781.0] + - - [2687, 2688, 1, 2688, 2687, 2687, 2688, 2688] + - [38, 11695.0] + - - [2689, 2688, 1, 2688, 2689, 2689, 2688, 2688] + - [1, 11244.0] + - - [2688, 2687, 1, 2688, 2688, 2688, 2688, 2688] + - [4, 11709.0] + - - [2688, 2689, 1, 2688, 2688, 2688, 2688, 2688] + - [22, 11453.0] + - - [2688, 2688, 1, 2687, 2688, 2688, 2687, 2687] + - [35, 11965.0] + - - [2688, 2688, 1, 2689, 2688, 2688, 2689, 2689] + - [17, 11908.0] + - - [3455, 3456, 1, 3456, 3455, 3455, 3456, 3456] + - [20, 12068.0] + - - [3457, 3456, 1, 3456, 3457, 3457, 3456, 3456] + - [20, 11738.0] + - - [3456, 3455, 1, 3456, 3456, 3456, 3456, 3456] + - [4, 12069.0] + - - [3456, 3457, 1, 3456, 3456, 3456, 3456, 3456] + - [22, 11975.0] + - - [3456, 3456, 1, 3455, 3456, 3456, 3455, 3455] + - [17, 12335.0] + - - [3456, 3456, 1, 3457, 3456, 3456, 3457, 3457] + - [17, 12314.0] + - - [3839, 3840, 1, 3840, 3839, 3839, 3840, 3840] + - [28, 12209.0] + - - [3841, 3840, 1, 3840, 3841, 3841, 3840, 3840] + - [28, 11738.0] + - - [3840, 3839, 1, 3840, 3840, 3840, 3840, 3840] + - [28, 12225.0] + - - [3840, 3841, 1, 3840, 3840, 3840, 3840, 3840] + - [12, 11753.0] + - - [3840, 3840, 1, 3839, 3840, 3840, 3839, 3839] + - [38, 12221.0] + - - [3840, 3840, 1, 3841, 3840, 3840, 3841, 3841] + - [20, 12215.0] + - - [4223, 4224, 1, 4224, 4223, 4223, 4224, 4224] + - [38, 12171.0] + - - [4225, 4224, 1, 4224, 4225, 4225, 4224, 4224] + - [28, 11768.0] + - - [4224, 4223, 1, 4224, 4224, 4224, 4224, 4224] + - [20, 12176.0] + - - [4224, 4225, 1, 4224, 4224, 4224, 4224, 4224] + - [6, 11809.0] + - - [4224, 4224, 1, 4223, 4224, 4224, 4223, 4223] + - [17, 12393.0] + - - [4224, 4224, 1, 4225, 4224, 4224, 4225, 4225] + - [17, 12350.0] + - - [4607, 4608, 1, 4608, 4607, 4607, 4608, 4608] + - [28, 12243.0] + - - [4609, 4608, 1, 4608, 4609, 4609, 4608, 4608] + - [49, 11939.0] + - - [4608, 4607, 1, 4608, 4608, 4608, 4608, 4608] + - [47, 12253.0] + - - [4608, 4609, 1, 4608, 4608, 4608, 4608, 4608] + - [28, 11915.0] + - - [4608, 4608, 1, 4607, 4608, 4608, 4607, 4607] + - [20, 12250.0] + - - [4608, 4608, 1, 4609, 4608, 4608, 4609, 4609] + - [22, 12252.0] + - - [4991, 4992, 1, 4992, 4991, 4991, 4992, 4992] + - [20, 12198.0] + - - [4993, 4992, 1, 4992, 4993, 4993, 4992, 4992] + - [20, 12014.0] + - - [4992, 4991, 1, 4992, 4992, 4992, 4992, 4992] + - [12, 12191.0] + - - [4992, 4993, 1, 4992, 4992, 4992, 4992, 4992] + - [40, 12040.0] + - - [4992, 4992, 1, 4991, 4992, 4992, 4991, 4991] + - [35, 12369.0] + - - [4992, 4992, 1, 4993, 4992, 4992, 4993, 4993] + - [35, 12350.0] + - - [5375, 5376, 1, 5376, 5375, 5375, 5376, 5376] + - [40, 12293.0] + - - [5377, 5376, 1, 5376, 5377, 5377, 5376, 5376] + - [40, 12036.0] + - - [5376, 5375, 1, 5376, 5376, 5376, 5376, 5376] + - [22, 12304.0] + - - [5376, 5377, 1, 5376, 5376, 5376, 5376, 5376] + - [12, 11954.0] + - - [5376, 5376, 1, 5375, 5376, 5376, 5375, 5375] + - [6, 12298.0] + - - [5376, 5376, 1, 5377, 5376, 5376, 5377, 5377] + - [40, 12297.0] + - - [5759, 5760, 1, 5760, 5759, 5759, 5760, 5760] + - [38, 12246.0] + - - [5761, 5760, 1, 5760, 5761, 5761, 5760, 5760] + - [28, 11994.0] + - - [5760, 5759, 1, 5760, 5760, 5760, 5760, 5760] + - [20, 12249.0] + - - [5760, 5761, 1, 5760, 5760, 5760, 5760, 5760] + - [30, 11396.0] + - - [5760, 5760, 1, 5759, 5760, 5760, 5759, 5759] + - [22, 11704.0] + - - [5760, 5760, 1, 5761, 5760, 5760, 5761, 5761] + - [22, 11668.0] + - - [6143, 6144, 1, 6144, 6143, 6143, 6144, 6144] + - [28, 10904.0] + - - [6145, 6144, 1, 6144, 6145, 6145, 6144, 6144] + - [14, 10638.0] + - - [6144, 6143, 1, 6144, 6144, 6144, 6144, 6144] + - [14, 10705.0] + - - [6144, 6145, 1, 6144, 6144, 6144, 6144, 6144] + - [14, 10296.0] + - - [6144, 6144, 1, 6143, 6144, 6144, 6143, 6143] + - [40, 11840.0] + - - [6144, 6144, 1, 6145, 6144, 6144, 6145, 6145] + - [22, 11844.0] + - - [6527, 6528, 1, 6528, 6527, 6527, 6528, 6528] + - [38, 11638.0] + - - [6529, 6528, 1, 6528, 6529, 6529, 6528, 6528] + - [38, 11491.0] + - - [6528, 6527, 1, 6528, 6528, 6528, 6528, 6528] + - [38, 11737.0] + - - [6528, 6529, 1, 6528, 6528, 6528, 6528, 6528] + - [49, 11617.0] + - - [6528, 6528, 1, 6527, 6528, 6528, 6527, 6527] + - [38, 11724.0] + - - [6528, 6528, 1, 6529, 6528, 6528, 6529, 6529] + - [38, 11764.0] + - - [6911, 6912, 1, 6912, 6911, 6911, 6912, 6912] + - [49, 11830.0] + - - [6913, 6912, 1, 6912, 6913, 6913, 6912, 6912] + - [47, 11617.0] + - - [6912, 6911, 1, 6912, 6912, 6912, 6912, 6912] + - [30, 11825.0] + - - [6912, 6913, 1, 6912, 6912, 6912, 6912, 6912] + - [47, 11606.0] + - - [6912, 6912, 1, 6911, 6912, 6912, 6911, 6911] + - [40, 12032.0] + - - [6912, 6912, 1, 6913, 6912, 6912, 6913, 6913] + - [22, 12122.0] + - - [7295, 7296, 1, 7296, 7295, 7295, 7296, 7296] + - [38, 11877.0] + - - [7297, 7296, 1, 7296, 7297, 7297, 7296, 7296] + - [47, 11726.0] + - - [7296, 7295, 1, 7296, 7296, 7296, 7296, 7296] + - [38, 11935.0] + - - [7296, 7297, 1, 7296, 7296, 7296, 7296, 7296] + - [49, 11787.0] + - - [7296, 7296, 1, 7295, 7296, 7296, 7295, 7295] + - [38, 11930.0] + - - [7296, 7296, 1, 7297, 7296, 7296, 7297, 7297] + - [47, 11932.0] + - - [7679, 7680, 1, 7680, 7679, 7679, 7680, 7680] + - [49, 11565.0] + - - [7681, 7680, 1, 7680, 7681, 7681, 7680, 7680] + - [49, 11627.0] + - - [7680, 7679, 1, 7680, 7680, 7680, 7680, 7680] + - [49, 11754.0] + - - [7680, 7681, 1, 7680, 7680, 7680, 7680, 7680] + - [49, 11086.0] + - - [7680, 7680, 1, 7679, 7680, 7680, 7679, 7679] + - [40, 12111.0] + - - [7680, 7680, 1, 7681, 7680, 7680, 7681, 7681] + - [40, 12105.0] + - - [1152, 1152, 1, 1152, 1152, 1152, 1152, 1152] + - [9, 8746.0] + - - [1536, 1536, 1, 1536, 1536, 1536, 1536, 1536] + - [30, 10583.0] + - - [1920, 1920, 1, 1920, 1920, 1920, 1920, 1920] + - [38, 11106.0] + - - [2304, 2304, 1, 2304, 2304, 2304, 2304, 2304] + - [28, 11413.0] + - - [2688, 2688, 1, 2688, 2688, 2688, 2688, 2688] + - [38, 11211.0] + - - [3456, 3456, 1, 3456, 3456, 3456, 3456, 3456] + - [20, 10746.0] + - - [3840, 3840, 1, 3840, 3840, 3840, 3840, 3840] + - [47, 10557.0] + - - [4224, 4224, 1, 4224, 4224, 4224, 4224, 4224] + - [20, 11382.0] + - - [4608, 4608, 1, 4608, 4608, 4608, 4608, 4608] + - [28, 10952.0] + - - [4992, 4992, 1, 4992, 4992, 4992, 4992, 4992] + - [47, 11448.0] + - - [5376, 5376, 1, 5376, 5376, 5376, 5376, 5376] + - [49, 11433.0] + - - [5760, 5760, 1, 5760, 5760, 5760, 5760, 5760] + - [47, 11548.0] + - - [6144, 6144, 1, 6144, 6144, 6144, 6144, 6144] + - [14, 10534.0] + - - [6528, 6528, 1, 6528, 6528, 6528, 6528, 6528] + - [28, 11685.0] + - - [6912, 6912, 1, 6912, 6912, 6912, 6912, 6912] + - [49, 11818.0] + - - [7296, 7296, 1, 7296, 7296, 7296, 7296, 7296] + - [47, 11905.0] + - - [7680, 7680, 1, 7680, 7680, 7680, 7680, 7680] + - [49, 11790.0] + - - [256, 128, 49, 1152, 256, 256, 1152, 1152] + - [15, 4540.0] + - - [256, 128, 121, 120, 256, 256, 120, 120] + - [18, 6789.0] + - - [256, 128, 169, 120, 256, 256, 120, 120] + - [39, 8315.0] + - - [256, 128, 36, 120, 256, 256, 120, 120] + - [2, 6548.0] + - - [256, 128, 49, 120, 256, 256, 120, 120] + - [16, 7084.0] + - - [256, 128, 64, 120, 256, 256, 120, 120] + - [18, 7432.0] + - - [256, 128, 36, 12000, 256, 256, 12000, 12000] + - [28, 9721.0] + - - [256, 128, 49, 1216, 256, 256, 1216, 1216] + - [4, 10804.0] + - - [256, 128, 121, 18, 256, 256, 18, 18] + - [0, 2659.0] + - - [256, 128, 169, 18, 256, 256, 18, 18] + - [1, 3067.0] + - - [256, 128, 36, 18, 256, 256, 18, 18] + - [13, 2628.0] + - - [256, 128, 49, 18, 256, 256, 18, 18] + - [7, 2294.0] + - - [256, 128, 64, 18, 256, 256, 18, 18] + - [3, 2692.0] + - - [256, 128, 36, 1800, 256, 256, 1800, 1800] + - [38, 9913.0] + - - [256, 128, 121, 19, 256, 256, 19, 19] + - [0, 2954.0] + - - [256, 128, 169, 19, 256, 256, 19, 19] + - [1, 2954.0] + - - [256, 128, 36, 19, 256, 256, 19, 19] + - [0, 2410.0] + - - [256, 128, 49, 19, 256, 256, 19, 19] + - [0, 2354.0] + - - [256, 128, 64, 19, 256, 256, 19, 19] + - [3, 2689.0] + - - [256, 128, 36, 1900, 256, 256, 1900, 1900] + - [38, 9923.0] + - - [256, 128, 49, 480, 256, 256, 480, 480] + - [2, 10117.0] + - - [256, 128, 81, 480, 256, 256, 480, 480] + - [28, 9566.0] + - - [256, 128, 64, 5880, 256, 256, 5880, 5880] + - [20, 9565.0] + - - [256, 128, 49, 72, 256, 256, 72, 72] + - [2, 5959.0] + - - [256, 128, 81, 72, 256, 256, 72, 72] + - [16, 6055.0] + - - [256, 128, 49, 76, 256, 256, 76, 76] + - [5, 5507.0] + - - [256, 128, 81, 76, 256, 256, 76, 76] + - [39, 6245.0] + - - [256, 128, 49, 7680, 256, 256, 7680, 7680] + - [33, 3462.0] + - - [256, 128, 64, 882, 256, 256, 882, 882] + - [47, 9908.0] + - - [256, 128, 64, 931, 256, 256, 931, 931] + - [20, 9873.0] + - - [256, 256, 49, 1152, 256, 256, 1152, 1152] + - [30, 10911.0] + - - [256, 256, 36, 12000, 256, 256, 12000, 12000] + - [0, 7799.0] + - - [256, 256, 49, 1216, 256, 256, 1216, 1216] + - [40, 11604.0] + - - [256, 256, 36, 1800, 256, 256, 1800, 1800] + - [40, 10772.0] + - - [256, 256, 36, 1900, 256, 256, 1900, 1900] + - [22, 10778.0] + - - [256, 256, 64, 5880, 256, 256, 5880, 5880] + - [0, 10003.0] + - - [256, 256, 49, 7680, 256, 256, 7680, 7680] + - [32, 5447.0] + - - [256, 256, 64, 882, 256, 256, 882, 882] + - [38, 11390.0] + - - [256, 256, 64, 931, 256, 256, 931, 931] + - [4, 11395.0] + - - [340, 256, 49, 1152, 340, 340, 1152, 1152] + - [20, 10199.0] + - - [340, 256, 36, 120, 340, 340, 120, 120] + - [2, 8728.0] + - - [340, 256, 49, 120, 340, 340, 120, 120] + - [17, 8916.0] + - - [340, 256, 64, 120, 340, 340, 120, 120] + - [16, 8970.0] + - - [340, 256, 36, 12000, 340, 340, 12000, 12000] + - [35, 7576.0] + - - [340, 256, 49, 1216, 340, 340, 1216, 1216] + - [35, 10425.0] + - - [340, 256, 36, 18, 340, 340, 18, 18] + - [5, 2910.0] + - - [340, 256, 49, 18, 340, 340, 18, 18] + - [39, 2973.0] + - - [340, 256, 64, 18, 340, 340, 18, 18] + - [48, 3051.0] + - - [340, 256, 36, 1800, 340, 340, 1800, 1800] + - [35, 10417.0] + - - [340, 256, 36, 19, 340, 340, 19, 19] + - [18, 3063.0] + - - [340, 256, 49, 19, 340, 340, 19, 19] + - [45, 3551.0] + - - [340, 256, 64, 19, 340, 340, 19, 19] + - [26, 3665.0] + - - [340, 256, 36, 1900, 340, 340, 1900, 1900] + - [35, 10364.0] + - - [340, 256, 64, 5880, 340, 340, 5880, 5880] + - [0, 9228.0] + - - [340, 256, 49, 7680, 340, 340, 7680, 7680] + - [15, 6095.0] + - - [340, 256, 64, 882, 340, 340, 882, 882] + - [40, 10167.0] + - - [340, 256, 64, 931, 340, 340, 931, 931] + - [40, 10169.0] + - - [510, 256, 49, 120, 510, 510, 120, 120] + - [18, 9825.0] + - - [510, 256, 64, 120, 510, 510, 120, 120] + - [45, 10124.0] + - - [510, 256, 49, 18, 510, 510, 18, 18] + - [18, 3475.0] + - - [510, 256, 64, 18, 510, 510, 18, 18] + - [34, 3544.0] + - - [510, 256, 49, 19, 510, 510, 19, 19] + - [43, 3611.0] + - - [510, 256, 64, 19, 510, 510, 19, 19] + - [39, 3711.0] + - - [510, 256, 36, 480, 510, 510, 480, 480] + - [1, 11276.0] + - - [510, 256, 36, 72, 510, 510, 72, 72] + - [16, 7287.0] + - - [510, 256, 36, 76, 510, 510, 76, 76] + - [21, 7489.0] + - - [510, 512, 36, 1080, 510, 510, 1080, 1080] + - [17, 12076.0] + - - [510, 512, 36, 162, 510, 510, 162, 162] + - [17, 10299.0] + - - [510, 512, 36, 171, 510, 510, 171, 171] + - [17, 10658.0] + - - [510, 512, 49, 1920, 510, 510, 1920, 1920] + - [49, 10404.0] + - - [510, 512, 64, 1920, 510, 510, 1920, 1920] + - [12, 8105.0] + - - [510, 512, 49, 288, 510, 510, 288, 288] + - [6, 2843.0] + - - [510, 512, 64, 288, 510, 510, 288, 288] + - [17, 11733.0] + - - [510, 512, 36, 3000, 510, 510, 3000, 3000] + - [4, 10740.0] + - - [510, 512, 49, 304, 510, 510, 304, 304] + - [35, 11406.0] + - - [510, 512, 64, 304, 510, 510, 304, 304] + - [17, 11742.0] + - - [510, 512, 36, 450, 510, 510, 450, 450] + - [4, 11543.0] + - - [510, 512, 36, 475, 510, 510, 475, 475] + - [38, 11579.0] + - - [510, 512, 49, 480, 510, 510, 480, 480] + - [17, 11859.0] + - - [510, 512, 64, 480, 510, 510, 480, 480] + - [35, 11958.0] + - - [510, 512, 49, 72, 510, 510, 72, 72] + - [18, 8946.0] + - - [510, 512, 64, 72, 510, 510, 72, 72] + - [36, 9051.0] + - - [510, 512, 49, 76, 510, 510, 76, 76] + - [21, 8892.0] + - - [510, 512, 64, 76, 510, 510, 76, 76] + - [29, 9062.0] + - - [512, 256, 81, 1080, 512, 512, 1080, 1080] + - [4, 11811.0] + - - [512, 256, 25, 12000, 512, 512, 12000, 12000] + - [0, 8896.0] + - - [512, 256, 81, 162, 512, 512, 162, 162] + - [4, 10447.0] + - - [512, 256, 81, 171, 512, 512, 171, 171] + - [4, 10604.0] + - - [512, 256, 25, 1800, 512, 512, 1800, 1800] + - [22, 11981.0] + - - [512, 256, 25, 1900, 512, 512, 1900, 1900] + - [40, 12019.0] + - - [512, 256, 121, 1920, 512, 512, 1920, 1920] + - [12, 7556.0] + - - [512, 256, 169, 1920, 512, 512, 1920, 1920] + - [40, 8359.0] + - - [512, 256, 49, 1920, 512, 512, 1920, 1920] + - [38, 11209.0] + - - [512, 256, 121, 288, 512, 512, 288, 288] + - [4, 11729.0] + - - [512, 256, 169, 288, 512, 512, 288, 288] + - [4, 11939.0] + - - [512, 256, 49, 288, 512, 512, 288, 288] + - [1, 11566.0] + - - [512, 256, 25, 3000, 512, 512, 3000, 3000] + - [40, 12154.0] + - - [512, 256, 81, 3000, 512, 512, 3000, 3000] + - [0, 9645.0] + - - [512, 256, 121, 304, 512, 512, 304, 304] + - [38, 11758.0] + - - [512, 256, 169, 304, 512, 512, 304, 304] + - [4, 11954.0] + - - [512, 256, 49, 304, 512, 512, 304, 304] + - [1, 11572.0] + - - [512, 256, 25, 450, 512, 512, 450, 450] + - [1, 11565.0] + - - [512, 256, 81, 450, 512, 512, 450, 450] + - [20, 11516.0] + - - [512, 256, 25, 475, 512, 512, 475, 475] + - [1, 11502.0] + - - [512, 256, 81, 475, 512, 512, 475, 475] + - [4, 11620.0] + - - [512, 256, 121, 480, 512, 512, 480, 480] + - [1, 12015.0] + - - [512, 256, 169, 480, 512, 512, 480, 480] + - [1, 12156.0] + - - [512, 256, 49, 5880, 512, 512, 5880, 5880] + - [0, 9300.0] + - - [512, 256, 121, 72, 512, 512, 72, 72] + - [39, 8486.0] + - - [512, 256, 169, 72, 512, 512, 72, 72] + - [0, 6521.0] + - - [512, 256, 121, 76, 512, 512, 76, 76] + - [21, 9309.0] + - - [512, 256, 169, 76, 512, 512, 76, 76] + - [37, 7820.0] + - - [512, 256, 49, 882, 512, 512, 882, 882] + - [22, 11633.0] + - - [512, 256, 49, 931, 512, 512, 931, 931] + - [22, 11667.0] + - - [2304, 512, 1, 100, 2304, 2304, 100, 100] + - [2, 7792.0] + - - [2304, 512, 1, 361, 2304, 2304, 361, 361] + - [35, 9250.0] + - - [4608, 510, 1, 100, 4608, 4608, 100, 100] + - [43, 6896.0] + - - [4608, 510, 1, 361, 4608, 4608, 361, 361] + - [35, 9718.0] + - - [8192, 7680, 1, 8192, 8192, 8192, 8192, 8192] + - [14, 10744.0] + - - [4096, 3840, 1, 4096, 4096, 4096, 4096, 4096] + - [28, 7840.0] + - - [2048, 1920, 1, 2048, 2048, 2048, 2048, 2048] + - [12, 11509.0] + - - [30522, 616, 1, 1024, 30522, 30522, 1024, 1024] + - [47, 5852.0] + - - [128, 128, 128, 64, 128, 128, 64, 64] + - [16, 4269.0] + - - [128, 128, 160, 64, 128, 128, 64, 64] + - [29, 6498.0] + - - [1024, 1280, 1, 1024, 1024, 1024, 1024, 1024] + - [14, 10912.0] + - - [1024, 1280, 1, 4096, 1024, 1024, 4096, 4096] + - [30, 11071.0] + - - [4096, 1280, 1, 1024, 4096, 4096, 1024, 1024] + - [14, 11538.0] + - - [30522, 200, 1, 1024, 30522, 30522, 1024, 1024] + - [49, 5769.0] + - - [128, 128, 624, 64, 128, 128, 64, 64] + - [48, 7094.0] + - - [1024, 4992, 1, 1024, 1024, 1024, 1024, 1024] + - [28, 11091.0] + - - [1024, 4992, 1, 4096, 1024, 1024, 4096, 4096] + - [28, 7424.0] + - - [4096, 4992, 1, 1024, 4096, 4096, 1024, 1024] + - [14, 11349.0] + - - [30522, 780, 1, 1024, 30522, 30522, 1024, 1024] + - [47, 6665.0] + - - [30522, 308, 1, 1024, 30522, 30522, 1024, 1024] + - [14, 5530.0] + - - [128, 128, 640, 64, 128, 128, 64, 64] + - [48, 7228.0] + - - [1024, 5120, 1, 1024, 1024, 1024, 1024, 1024] + - [28, 11216.0] + - - [1024, 5120, 1, 4096, 1024, 1024, 4096, 4096] + - [28, 7379.0] + - - [4096, 5120, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 8441.0] + - - [30522, 800, 1, 1024, 30522, 30522, 1024, 1024] + - [47, 6707.0] + - - [128, 128, 656, 64, 128, 128, 64, 64] + - [29, 8266.0] + - - [1024, 5248, 1, 1024, 1024, 1024, 1024, 1024] + - [12, 11060.0] + - - [1024, 5248, 1, 4096, 1024, 1024, 4096, 4096] + - [28, 7495.0] + - - [4096, 5248, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 8579.0] + - - [30522, 820, 1, 1024, 30522, 30522, 1024, 1024] + - [47, 6888.0] + - - [512, 512, 80, 64, 512, 512, 64, 64] + - [23, 6549.0] + - - [1024, 2560, 1, 1024, 1024, 1024, 1024, 1024] + - [30, 10590.0] + - - [1024, 2560, 1, 4096, 1024, 1024, 4096, 4096] + - [30, 12028.0] + - - [4096, 2560, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 11763.0] + - - [30522, 385, 1, 1024, 30522, 30522, 1024, 1024] + - [49, 4414.0] + - - [30522, 462, 1, 1024, 30522, 30522, 1024, 1024] + - [30, 4917.0] + - - [128, 128, 144, 64, 128, 128, 64, 64] + - [34, 4392.0] + - - [1024, 1152, 1, 1024, 1024, 1024, 1024, 1024] + - [49, 9610.0] + - - [1024, 1152, 1, 4096, 1024, 1024, 4096, 4096] + - [30, 9848.0] + - - [4096, 1152, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 10872.0] + - - [30522, 180, 1, 1024, 30522, 30522, 1024, 1024] + - [48, 5727.0] + - - [1024, 8192, 1, 1024, 1024, 1024, 1024, 1024] + - [28, 11317.0] + - - [1024, 8192, 1, 4096, 1024, 1024, 4096, 4096] + - [30, 8388.0] + - - [1024, 9600, 1, 1024, 1024, 1024, 1024, 1024] + - [28, 10074.0] + - - [1024, 9600, 1, 4096, 1024, 1024, 4096, 4096] + - [47, 8509.0] + - - [4096, 8192, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 9548.0] + - - [4096, 9600, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 9719.0] + - - [33712, 8192, 1, 1024, 33712, 33712, 1024, 1024] + - [47, 11591.0] + - - [33712, 9600, 1, 1024, 33712, 33712, 1024, 1024] + - [30, 11565.0] + - - [1024, 10064, 1, 1024, 1024, 1024, 1024, 1024] + - [12, 9567.0] + - - [1024, 10064, 1, 4096, 1024, 1024, 4096, 4096] + - [28, 8578.0] + - - [1024, 10080, 1, 1024, 1024, 1024, 1024, 1024] + - [12, 9479.0] + - - [1024, 10080, 1, 4096, 1024, 1024, 4096, 4096] + - [47, 8318.0] + - - [1024, 6528, 1, 1024, 1024, 1024, 1024, 1024] + - [12, 11455.0] + - - [1024, 6528, 1, 4096, 1024, 1024, 4096, 4096] + - [28, 7399.0] + - - [1024, 7104, 1, 1024, 1024, 1024, 1024, 1024] + - [28, 10762.0] + - - [1024, 7104, 1, 4096, 1024, 1024, 4096, 4096] + - [28, 7624.0] + - - [1024, 8064, 1, 1024, 1024, 1024, 1024, 1024] + - [12, 10967.0] + - - [1024, 8064, 1, 4096, 1024, 1024, 4096, 4096] + - [28, 8040.0] + - - [1024, 9216, 1, 1024, 1024, 1024, 1024, 1024] + - [12, 10928.0] + - - [1024, 9216, 1, 4096, 1024, 1024, 4096, 4096] + - [28, 8556.0] + - - [4096, 10064, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 9500.0] + - - [4096, 10080, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 10053.0] + - - [4096, 6528, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 8287.0] + - - [4096, 7104, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 8760.0] + - - [4096, 8064, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 9317.0] + - - [4096, 9216, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 9627.0] + - - [42720, 10080, 1, 1024, 42720, 42720, 1024, 1024] + - [49, 11810.0] + - - [42720, 6528, 1, 1024, 42720, 42720, 1024, 1024] + - [47, 11501.0] + - - [42720, 7104, 1, 1024, 42720, 42720, 1024, 1024] + - [49, 11668.0] + - - [1024, 32768, 1, 480, 1024, 1024, 480, 480] + - [20, 7219.0] + - - [30592, 1024, 1, 2048, 30592, 30592, 2048, 2048] + - [30, 9708.0] + - - [6144, 1024, 1, 2048, 6144, 6144, 2048, 2048] + - [12, 10130.0] + - - [8192, 1024, 1, 2048, 8192, 8192, 2048, 2048] + - [12, 6894.0] + - - [30592, 8192, 1, 1024, 30592, 30592, 1024, 1024] + - [49, 11710.0] + - - [3072, 8192, 1, 1024, 3072, 3072, 1024, 1024] + - [22, 9910.0] + - - [512, 512, 256, 64, 512, 512, 64, 64] + - [27, 7413.0] + - - [30592, 2048, 1, 1024, 30592, 30592, 1024, 1024] + - [49, 10092.0] + - - [30592, 4096, 1, 1024, 30592, 30592, 1024, 1024] + - [49, 10991.0] + - - [3072, 4096, 1, 1024, 3072, 3072, 1024, 1024] + - [12, 11546.0] + - - [1920, 2048, 1, 2560, 1920, 1920, 2560, 2560] + - [12, 11709.0] + - - [2560, 2048, 1, 2560, 2560, 2560, 2560, 2560] + - [30, 12076.0] + - - [2560, 2048, 1, 640, 2560, 2560, 640, 640] + - [38, 11334.0] + - - [7680, 2048, 1, 2560, 7680, 7680, 2560, 2560] + - [28, 9607.0] + - - [512, 512, 40, 64, 512, 512, 64, 64] + - [7, 8218.0] + - - [1536, 4096, 1, 1536, 1536, 1536, 1536, 1536] + - [30, 11591.0] + - - [1536, 4096, 1, 6144, 1536, 1536, 6144, 6144] + - [26, 8218.0] + - - [4608, 4096, 1, 1536, 4608, 4608, 1536, 1536] + - [12, 9822.0] + - - [50304, 4096, 1, 1536, 50304, 50304, 1536, 1536] + - [49, 11876.0] + - - [6144, 4096, 1, 1536, 6144, 6144, 1536, 1536] + - [12, 10113.0] + - - [1024, 1024, 64, 96, 1024, 1024, 96, 96] + - [12, 11015.0] + - - [1536, 8192, 1, 1536, 1536, 1536, 1536, 1536] + - [12, 8413.0] + - - [1536, 8192, 1, 6144, 1536, 1536, 6144, 6144] + - [28, 10121.0] + - - [4608, 8192, 1, 1536, 4608, 4608, 1536, 1536] + - [28, 10909.0] + - - [50304, 8192, 1, 1536, 50304, 50304, 1536, 1536] + - [49, 12153.0] + - - [6144, 8192, 1, 1536, 6144, 6144, 1536, 1536] + - [12, 10986.0] + - - [1024, 1024, 128, 96, 1024, 1024, 96, 96] + - [20, 8086.0] + - - [1024, 16384, 1, 1024, 1024, 1024, 1024, 1024] + - [28, 6855.0] + - - [1024, 16384, 1, 4096, 1024, 1024, 4096, 4096] + - [30, 9473.0] + - - [3072, 16384, 1, 1024, 3072, 3072, 1024, 1024] + - [12, 9984.0] + - - [4096, 16384, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 10716.0] + - - [50304, 16384, 1, 1024, 50304, 50304, 1024, 1024] + - [49, 12113.0] + - - [1024, 1024, 256, 64, 1024, 1024, 64, 64] + - [13, 4321.0] + - - [50304, 2048, 1, 1024, 50304, 50304, 1024, 1024] + - [49, 11180.0] + - - [1024, 1024, 32, 64, 1024, 1024, 64, 64] + - [5, 8216.0] + - - [50304, 4096, 1, 1024, 50304, 50304, 1024, 1024] + - [49, 11597.0] + - - [1024, 1024, 64, 64, 1024, 1024, 64, 64] + - [11, 8412.0] + - - [50304, 8192, 1, 1024, 50304, 50304, 1024, 1024] + - [49, 11938.0] + - - [1024, 1024, 128, 64, 1024, 1024, 64, 64] + - [19, 6115.0] + - - [30528, 8192, 1, 1024, 30528, 30528, 1024, 1024] + - [49, 11581.0] + - - [128, 128, 1024, 64, 128, 128, 64, 64] + - [37, 5410.0] + - - [1024, 3456, 1, 1024, 1024, 1024, 1024, 1024] + - [12, 11204.0] + - - [1024, 3456, 1, 480, 1024, 1024, 480, 480] + - [4, 11038.0] + - - [512, 3456, 1, 1024, 512, 512, 1024, 1024] + - [30, 9922.0] + - - [512, 3456, 1, 13, 512, 512, 13, 13] + - [0, 1743.0] + - - [512, 4096, 1, 13, 512, 512, 13, 13] + - [36, 2443.0] + - - [512, 6912, 1, 13, 512, 512, 13, 13] + - [39, 2576.0] + - - [30528, 640, 1, 1024, 30528, 30528, 1024, 1024] + - [48, 6268.0] + - - [30528, 1280, 1, 1024, 30528, 30528, 1024, 1024] + - [47, 7946.0] + - - [30528, 1600, 1, 1024, 30528, 30528, 1024, 1024] + - [49, 8918.0] + - - [1024, 10240, 1, 1024, 1024, 1024, 1024, 1024] + - [12, 9513.0] + - - [4096, 10240, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 9711.0] + - - [1024, 10240, 1, 4096, 1024, 1024, 4096, 4096] + - [28, 8720.0] + - - [128, 128, 1280, 64, 128, 128, 64, 64] + - [37, 5139.0] + - - [1024, 10496, 1, 4096, 1024, 1024, 4096, 4096] + - [28, 8968.0] + - - [30528, 1640, 1, 1024, 30528, 30528, 1024, 1024] + - [47, 9039.0] + - - [4096, 10496, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 10142.0] + - - [1024, 10496, 1, 1024, 1024, 1024, 1024, 1024] + - [12, 9408.0] + - - [128, 128, 1312, 64, 128, 128, 64, 64] + - [37, 5153.0] + - - [30528, 160, 1, 1024, 30528, 30528, 1024, 1024] + - [30, 7028.0] + - - [30528, 240, 1, 1024, 30528, 30528, 1024, 1024] + - [49, 9115.0] + - - [1024, 6144, 1, 1024, 1024, 1024, 1024, 1024] + - [12, 10963.0] + - - [4096, 6144, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 8913.0] + - - [1024, 6144, 1, 4096, 1024, 1024, 4096, 4096] + - [28, 7521.0] + - - [512, 512, 192, 64, 512, 512, 64, 64] + - [42, 7459.0] + - - [1024, 10224, 1, 1024, 1024, 1024, 1024, 1024] + - [12, 9328.0] + - - [1024, 10192, 1, 1024, 1024, 1024, 1024, 1024] + - [12, 9652.0] + - - [1024, 10208, 1, 1024, 1024, 1024, 1024, 1024] + - [12, 9490.0] + - - [1024, 10224, 1, 4096, 1024, 1024, 4096, 4096] + - [28, 8728.0] + - - [4096, 10224, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 9862.0] + - - [3072, 10224, 1, 1024, 3072, 3072, 1024, 1024] + - [12, 9163.0] + - - [3072, 10240, 1, 1024, 3072, 3072, 1024, 1024] + - [12, 9421.0] + - - [1024, 10192, 1, 4096, 1024, 1024, 4096, 4096] + - [28, 8451.0] + - - [4096, 10192, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 9849.0] + - - [3072, 10192, 1, 1024, 3072, 3072, 1024, 1024] + - [12, 9069.0] + - - [3072, 10200, 1, 1024, 3072, 3072, 1024, 1024] + - [12, 9108.0] + - - [1024, 10184, 1, 1024, 1024, 1024, 1024, 1024] + - [12, 9554.0] + - - [3072, 10208, 1, 1024, 3072, 3072, 1024, 1024] + - [12, 9203.0] + - - [1024, 10208, 1, 4096, 1024, 1024, 4096, 4096] + - [28, 8709.0] + - - [4096, 10208, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 9978.0] + - - [2048, 10224, 1, 1024, 2048, 2048, 1024, 1024] + - [12, 7953.0] + - - [2048, 10240, 1, 1024, 2048, 2048, 1024, 1024] + - [12, 7875.0] + - - [4096, 256, 1, 12288, 4096, 4096, 12288, 12288] + - [48, 3969.0] + - - [2048, 256, 1, 13312, 2048, 2048, 13312, 13312] + - [62, 5934.0] + - - [4096, 256, 1, 15360, 4096, 4096, 15360, 15360] + - [28, 4472.0] + - - [2048, 512, 1, 16640, 2048, 2048, 16640, 16640] + - [69, 5988.0] + - - [4096, 256, 1, 14336, 4096, 4096, 14336, 14336] + - [47, 4298.0] + - - [1024, 1024, 1, 8192, 1024, 1024, 8192, 8192] + - [56, 9774.0] + - - [1024, 512, 1, 16384, 1024, 1024, 16384, 16384] + - [32, 4969.0] + - - [4096, 256, 1, 9216, 4096, 4096, 9216, 9216] + - [33, 3694.0] + - - [1024, 512, 1, 12288, 1024, 1024, 12288, 12288] + - [55, 9723.0] + - - [4096, 200, 1, 12288, 4096, 4096, 12288, 12288] + - [44, 4537.0] + - - [1024, 1024, 1, 13312, 1024, 1024, 13312, 13312] + - [63, 6248.0] + - - [2048, 256, 1, 16384, 2048, 2048, 16384, 16384] + - [56, 5603.0] + - - [2048, 512, 1, 16384, 2048, 2048, 16384, 16384] + - [70, 5162.0] + - - [1024, 1024, 1, 8320, 1024, 1024, 8320, 8320] + - [52, 10791.0] + - - [2048, 256, 1, 14336, 2048, 2048, 14336, 14336] + - [55, 7215.0] + - - [4096, 200, 1, 16640, 4096, 4096, 16640, 16640] + - [48, 6027.0] + - - [1024, 1024, 1, 16640, 1024, 1024, 16640, 16640] + - [57, 7091.0] + - - [1024, 1024, 1, 14336, 1024, 1024, 14336, 14336] + - [53, 5834.0] + - - [2048, 512, 1, 9216, 2048, 2048, 9216, 9216] + - [13, 6621.0] + - - [1024, 1024, 1, 15360, 1024, 1024, 15360, 15360] + - [58, 6024.0] + - - [2048, 512, 1, 8192, 2048, 2048, 8192, 8192] + - [50, 6598.0] + - - [2048, 512, 1, 13312, 2048, 2048, 13312, 13312] + - [61, 4901.0] + - - [1024, 1024, 1, 11264, 1024, 1024, 11264, 11264] + - [47, 6774.0] + - - [1024, 512, 1, 16640, 1024, 1024, 16640, 16640] + - [32, 6673.0] + - - [2048, 512, 1, 10240, 2048, 2048, 10240, 10240] + - [57, 5022.0] + - - [2048, 256, 1, 16640, 2048, 2048, 16640, 16640] + - [57, 8392.0] + - - [4096, 256, 1, 13312, 4096, 4096, 13312, 13312] + - [51, 3986.0] + - - [4096, 200, 1, 15360, 4096, 4096, 15360, 15360] + - [48, 3453.0] + - - [2048, 512, 1, 12288, 2048, 2048, 12288, 12288] + - [63, 4572.0] + - - [4096, 256, 1, 8192, 4096, 4096, 8192, 8192] + - [32, 3481.0] + - - [2048, 512, 1, 15360, 2048, 2048, 15360, 15360] + - [67, 4965.0] + - - [2048, 512, 1, 11264, 2048, 2048, 11264, 11264] + - [57, 5348.0] + - - [2048, 256, 1, 12288, 2048, 2048, 12288, 12288] + - [68, 3393.0] + - - [1024, 1024, 1, 12288, 1024, 1024, 12288, 12288] + - [63, 5701.0] + - - [4096, 256, 1, 16384, 4096, 4096, 16384, 16384] + - [30, 4417.0] + - - [2048, 256, 1, 15360, 2048, 2048, 15360, 15360] + - [70, 3421.0] + - - [2048, 512, 1, 8320, 2048, 2048, 8320, 8320] + - [4, 8381.0] + - - [1024, 1024, 1, 10240, 1024, 1024, 10240, 10240] + - [60, 7963.0] + - - [1024, 1024, 1, 9216, 1024, 1024, 9216, 9216] + - [30, 9355.0] + - - [4096, 200, 1, 16384, 4096, 4096, 16384, 16384] + - [32, 3576.0] + - - [2048, 512, 1, 14336, 2048, 2048, 14336, 14336] + - [63, 4624.0] + - - [1024, 512, 1, 13312, 1024, 1024, 13312, 13312] + - [60, 7650.0] + - - [4096, 256, 1, 8320, 4096, 4096, 8320, 8320] + - [59, 10872.0] + - - [4096, 200, 1, 13312, 4096, 4096, 13312, 13312] + - [29, 3235.0] + - - [1024, 512, 1, 14336, 1024, 1024, 14336, 14336] + - [60, 6999.0] + - - [4096, 256, 1, 11264, 4096, 4096, 11264, 11264] + - [13, 3984.0] + - - [4096, 256, 1, 10240, 4096, 4096, 10240, 10240] + - [12, 3927.0] + - - [4096, 200, 1, 14336, 4096, 4096, 14336, 14336] + - [28, 3408.0] + - - [4096, 256, 1, 16640, 4096, 4096, 16640, 16640] + - [46, 5317.0] + - - [1024, 512, 1, 15360, 1024, 1024, 15360, 15360] + - [32, 5921.0] + - - [1024, 1024, 1, 16384, 1024, 1024, 16384, 16384] + - [58, 5858.0] + - - [224, 192, 36, 10368, 224, 224, 10368, 10368] + - [54, 4434.0] + - - [320, 256, 9, 19584, 320, 320, 19584, 19584] + - [66, 7293.0] + - - [256, 256, 11, 13056, 256, 256, 13056, 13056] + - [31, 6487.0] + - - [320, 256, 9, 9792, 320, 320, 9792, 9792] + - [52, 9395.0] + - - [320, 256, 11, 13056, 320, 320, 13056, 13056] + - [63, 7487.0] + - - [256, 256, 9, 9792, 256, 256, 9792, 9792] + - [65, 10179.0] + - - [256, 224, 9, 19584, 256, 256, 19584, 19584] + - [52, 8011.0] + - - [256, 256, 9, 19584, 256, 256, 19584, 19584] + - [59, 8814.0] + - - [128, 128, 36, 12000, 128, 128, 12000, 12000] + - [28, 9409.0] + - - [128, 128, 49, 12800, 128, 128, 12800, 12800] + - [64, 5221.0] + - - [128, 128, 25, 25088, 128, 128, 25088, 25088] + - [69, 4703.0] + - - [128, 128, 49, 25600, 128, 128, 25600, 25600] + - [58, 4491.0] + - - [128, 128, 25, 50176, 128, 128, 50176, 50176] + - [64, 4258.0] + - - [128, 128, 36, 12544, 128, 128, 12544, 12544] + - [70, 7600.0] + - - [128, 128, 49, 9216, 128, 128, 9216, 9216] + - [70, 5540.0] + - - [1024, 1024, 1, 12544, 1024, 1024, 12544, 12544] + - [57, 7066.0] + - - [1024, 1000, 1, 12544, 1024, 1024, 12544, 12544] + - [53, 6800.0] + - - [1024, 512, 1, 1600, 1024, 1024, 1600, 1600] + - [76, 7912.0] + - - [2048, 512, 1, 100, 2048, 2048, 100, 100] + - [76, 5248.0] + - - [768, 640, 1, 768, 768, 768, 768, 768] + - [95, 7924.0] + - - [768, 1280, 1, 768, 768, 768, 768, 768] + - [113, 7817.0] + - - [1024, 512, 1, 1024, 1024, 1024, 1024, 1024] + - [128, 7245.0] + - - [1024, 512, 1, 3072, 1024, 1024, 3072, 3072] + - [157, 7977.0] + - - [30522, 120, 1, 1024, 30522, 30522, 1024, 1024] + - [157, 9346.0] + - - [30522, 80, 1, 1024, 30522, 30522, 1024, 1024] + - [97, 6347.0] + - - [64, 128, 512, 128, 64, 64, 128, 128] + - [156, 6763.0] + - - [64, 512, 64, 512, 64, 64, 512, 512] + - [156, 6827.0] + - - [64, 64, 768, 64, 64, 64, 64, 64] + - [153, 5815.0] + - - [64, 64, 96, 64, 64, 64, 64, 64] + - [72, 3848.0] + - - [1856, 448, 1, 3328, 1856, 1856, 3328, 3328] + - [128, 8641.0] + - - [128, 6784, 1, 3328, 128, 128, 3328, 3328] + - [161, 7770.0] + - - [2048, 400, 1, 512, 2048, 2048, 512, 512] + - [119, 6827.0] + - - [2368, 448, 1, 128, 2368, 2368, 128, 128] + - [119, 6357.0] + - - [256, 4288, 1, 3328, 256, 256, 3328, 3328] + - [154, 8067.0] + - - [704, 1856, 1, 3328, 704, 704, 3328, 3328] + - [154, 8415.0] + - - [448, 1024, 1, 1280, 448, 448, 1280, 1280] + - [152, 7244.0] + - - [256, 1408, 1, 3328, 256, 256, 3328, 3328] + - [126, 6681.0] + - - [704, 1856, 1, 1280, 704, 704, 1280, 1280] + - [126, 8149.0] + - - [128, 5056, 1, 128, 128, 128, 128, 128] + - [119, 5892.0] + - - [2368, 128, 1, 256, 2368, 2368, 256, 256] + - [120, 4961.0] + - - [64, 5056, 1, 256, 64, 64, 256, 256] + - [152, 4978.0] + - - [256, 2944, 1, 256, 256, 256, 256, 256] + - [154, 6760.0] + - - [256, 1856, 1, 1280, 256, 256, 1280, 1280] + - [126, 8157.0] + - - [128, 3584, 1, 1280, 128, 128, 1280, 1280] + - [95, 7957.0] + - - [4288, 256, 1, 256, 4288, 4288, 256, 256] + - [151, 7502.0] + - - [2944, 128, 1, 128, 2944, 2944, 128, 128] + - [149, 4269.0] + - - [5888, 64, 1, 3328, 5888, 5888, 3328, 3328] + - [159, 4323.0] + - - [2944, 256, 1, 3328, 2944, 2944, 3328, 3328] + - [126, 8557.0] + - - [704, 1024, 1, 128, 704, 704, 128, 128] + - [118, 5422.0] + - - [1408, 448, 1, 1280, 1408, 1408, 1280, 1280] + - [95, 8417.0] + - - [1408, 704, 1, 3328, 1408, 1408, 3328, 3328] + - [154, 8110.0] + - - [1408, 256, 1, 1280, 1408, 1408, 1280, 1280] + - [141, 6352.0] + - - [3072, 128, 1, 1024, 3072, 3072, 1024, 1024] + - [118, 5678.0] + - - [2944, 256, 1, 256, 2944, 2944, 256, 256] + - [151, 6644.0] + - - [704, 1408, 1, 3328, 704, 704, 3328, 3328] + - [95, 8238.0] + - - [2944, 256, 1, 128, 2944, 2944, 128, 128] + - [150, 6240.0] + - - [2368, 128, 1, 3328, 2368, 2368, 3328, 3328] + - [157, 8210.0] + - - [2944, 128, 1, 256, 2944, 2944, 256, 256] + - [95, 5137.0] + - - [448, 1408, 1, 256, 448, 448, 256, 256] + - [152, 6559.0] + - - [64, 5056, 1, 3328, 64, 64, 3328, 3328] + - [158, 5753.0] + - - [1024, 448, 1, 128, 1024, 1024, 128, 128] + - [148, 4910.0] + - - [256, 3584, 1, 3328, 256, 256, 3328, 3328] + - [128, 9612.0] + - - [256, 1408, 1, 256, 256, 256, 256, 256] + - [150, 5026.0] + - - [5056, 64, 1, 1280, 5056, 5056, 1280, 1280] + - [154, 7282.0] + - - [1024, 704, 1, 256, 1024, 1024, 256, 256] + - [150, 6435.0] + - - [128, 4288, 1, 128, 128, 128, 128, 128] + - [138, 4886.0] + - - [6784, 64, 1, 128, 6784, 6784, 128, 128] + - [89, 4518.0] + - - [3584, 256, 1, 128, 3584, 3584, 128, 128] + - [151, 6635.0] + - - [5888, 64, 1, 256, 5888, 5888, 256, 256] + - [154, 5072.0] + - - [1856, 256, 1, 1280, 1856, 1856, 1280, 1280] + - [95, 8026.0] + - - [64, 5888, 1, 3328, 64, 64, 3328, 3328] + - [130, 4488.0] + - - [704, 1024, 1, 1280, 704, 704, 1280, 1280] + - [154, 7677.0] + - - [448, 1856, 1, 128, 448, 448, 128, 128] + - [150, 6006.0] + - - [1024, 704, 1, 1280, 1024, 1024, 1280, 1280] + - [154, 7879.0] + - - [128, 5888, 1, 256, 128, 128, 256, 256] + - [89, 5773.0] + - - [704, 704, 1, 3328, 704, 704, 3328, 3328] + - [90, 7551.0] + - - [704, 1408, 1, 1280, 704, 704, 1280, 1280] + - [121, 7993.0] + - - [3584, 256, 1, 3328, 3584, 3584, 3328, 3328] + - [113, 9550.0] + - - [704, 1856, 1, 128, 704, 704, 128, 128] + - [109, 6755.0] + - - [2944, 448, 1, 128, 2944, 2944, 128, 128] + - [86, 6631.0] + - - [128, 2944, 1, 1280, 128, 128, 1280, 1280] + - [154, 6873.0] + - - [448, 2944, 1, 1280, 448, 448, 1280, 1280] + - [97, 8313.0] + - - [3584, 128, 1, 256, 3584, 3584, 256, 256] + - [128, 6035.0] + - - [448, 1408, 1, 3328, 448, 448, 3328, 3328] + - [152, 8078.0] + - - [256, 3584, 1, 256, 256, 256, 256, 256] + - [151, 8033.0] + - - [256, 2944, 1, 3328, 256, 256, 3328, 3328] + - [126, 8493.0] + - - [448, 2368, 1, 128, 448, 448, 128, 128] + - [121, 6322.0] + - - [1408, 704, 1, 256, 1408, 1408, 256, 256] + - [95, 6994.0] + - - [448, 2944, 1, 3328, 448, 448, 3328, 3328] + - [97, 8463.0] + - - [64, 5888, 1, 256, 64, 64, 256, 256] + - [152, 4823.0] + - - [6784, 128, 1, 3328, 6784, 6784, 3328, 3328] + - [128, 9020.0] + - - [704, 704, 1, 256, 704, 704, 256, 256] + - [151, 4972.0] + - - [128, 4288, 1, 3328, 128, 128, 3328, 3328] + - [134, 7010.0] + - - [448, 704, 1, 1280, 448, 448, 1280, 1280] + - [123, 6951.0] + - - [128, 5056, 1, 1280, 128, 128, 1280, 1280] + - [157, 9235.0] + - - [1024, 448, 1, 3328, 1024, 1024, 3328, 3328] + - [95, 8413.0] + - - [1856, 704, 1, 1280, 1856, 1856, 1280, 1280] + - [90, 8349.0] + - - [448, 1024, 1, 128, 448, 448, 128, 128] + - [148, 4668.0] + - - [448, 2368, 1, 3328, 448, 448, 3328, 3328] + - [128, 8407.0] + - - [5056, 64, 1, 128, 5056, 5056, 128, 128] + - [88, 4134.0] + - - [1024, 700, 1, 512, 1024, 1024, 512, 512] + - [126, 7055.0] + - - [704, 1024, 1, 256, 704, 704, 256, 256] + - [90, 6404.0] + - - [128, 6784, 1, 1280, 128, 128, 1280, 1280] + - [97, 8865.0] + - - [1856, 256, 1, 256, 1856, 1856, 256, 256] + - [123, 6683.0] + - - [256, 4288, 1, 1280, 256, 256, 1280, 1280] + - [95, 8872.0] + - - [256, 1856, 1, 128, 256, 256, 128, 128] + - [150, 5216.0] + - - [7680, 64, 1, 2560, 7680, 7680, 2560, 2560] + - [125, 4694.0] + - - [448, 1408, 1, 128, 448, 448, 128, 128] + - [123, 5538.0] + - - [6784, 128, 1, 256, 6784, 6784, 256, 256] + - [97, 7556.0] + - - [704, 448, 1, 256, 704, 704, 256, 256] + - [123, 4864.0] + - - [704, 1408, 1, 128, 704, 704, 128, 128] + - [150, 6338.0] + - - [4288, 128, 1, 1280, 4288, 4288, 1280, 1280] + - [90, 7969.0] + - - [128, 2944, 1, 128, 128, 128, 128, 128] + - [150, 3890.0] + - - [1024, 704, 1, 3328, 1024, 1024, 3328, 3328] + - [95, 8115.0] + - - [128, 4288, 1, 256, 128, 128, 256, 256] + - [89, 6245.0] + - - [704, 448, 1, 3328, 704, 704, 3328, 3328] + - [152, 7450.0] + - - [448, 2368, 1, 1280, 448, 448, 1280, 1280] + - [97, 8260.0] + - - [64, 6784, 1, 3328, 64, 64, 3328, 3328] + - [130, 3676.0] + - - [2944, 256, 1, 1280, 2944, 2944, 1280, 1280] + - [154, 8034.0] + - - [256, 2368, 1, 128, 256, 256, 128, 128] + - [150, 5748.0] + - - [1856, 704, 1, 256, 1856, 1856, 256, 256] + - [113, 7374.0] + - - [1408, 448, 1, 3328, 1408, 1408, 3328, 3328] + - [95, 8971.0] + - - [1856, 448, 1, 1280, 1856, 1856, 1280, 1280] + - [128, 8370.0] + - - [128, 5888, 1, 128, 128, 128, 128, 128] + - [89, 5970.0] + - - [704, 1856, 1, 256, 704, 704, 256, 256] + - [151, 8033.0] + - - [256, 2368, 1, 1280, 256, 256, 1280, 1280] + - [90, 8744.0] + - - [2944, 448, 1, 256, 2944, 2944, 256, 256] + - [154, 7635.0] + - - [1856, 448, 1, 128, 1856, 1856, 128, 128] + - [109, 5848.0] + - - [2368, 128, 1, 1280, 2368, 2368, 1280, 1280] + - [128, 7398.0] + - - [64, 6784, 1, 256, 64, 64, 256, 256] + - [123, 5007.0] + - - [64, 5056, 1, 1280, 64, 64, 1280, 1280] + - [123, 7034.0] + - - [2368, 256, 1, 1280, 2368, 2368, 1280, 1280] + - [90, 8709.0] + - - [2368, 448, 1, 1280, 2368, 2368, 1280, 1280] + - [95, 8476.0] + - - [128, 3584, 1, 256, 128, 128, 256, 256] + - [119, 6240.0] + - - [704, 448, 1, 1280, 704, 704, 1280, 1280] + - [92, 6920.0] + - - [128, 3584, 1, 3328, 128, 128, 3328, 3328] + - [154, 8414.0] + - - [4288, 256, 1, 1280, 4288, 4288, 1280, 1280] + - [126, 8747.0] + - - [4288, 128, 1, 3328, 4288, 4288, 3328, 3328] + - [145, 6834.0] + - - [7680, 128, 1, 2560, 7680, 7680, 2560, 2560] + - [157, 5911.0] + - - [1408, 256, 1, 128, 1408, 1408, 128, 128] + - [138, 4142.0] + - - [256, 1408, 1, 1280, 256, 256, 1280, 1280] + - [95, 6380.0] + - - [128, 2368, 1, 256, 128, 128, 256, 256] + - [119, 5173.0] + - - [6784, 64, 1, 3328, 6784, 6784, 3328, 3328] + - [125, 3584.0] + - - [128, 2944, 1, 3328, 128, 128, 3328, 3328] + - [126, 7036.0] + - - [2944, 448, 1, 3328, 2944, 2944, 3328, 3328] + - [126, 8702.0] + - - [256, 4288, 1, 256, 256, 256, 256, 256] + - [154, 7695.0] + - - [5888, 128, 1, 256, 5888, 5888, 256, 256] + - [121, 6784.0] + - - [5056, 64, 1, 256, 5056, 5056, 256, 256] + - [152, 5152.0] + - - [1024, 704, 1, 128, 1024, 1024, 128, 128] + - [89, 5885.0] + - - [128, 5056, 1, 3328, 128, 128, 3328, 3328] + - [161, 8001.0] + - - [4288, 128, 1, 256, 4288, 4288, 256, 256] + - [151, 5964.0] + - - [1408, 448, 1, 128, 1408, 1408, 128, 128] + - [112, 6706.0] + - - [704, 448, 1, 128, 704, 704, 128, 128] + - [91, 4427.0] + - - [3584, 256, 1, 256, 3584, 3584, 256, 256] + - [128, 7877.0] + - - [128, 2944, 1, 256, 128, 128, 256, 256] + - [95, 4629.0] + - - [128, 6784, 1, 128, 128, 128, 128, 128] + - [128, 5654.0] + - - [448, 1856, 1, 256, 448, 448, 256, 256] + - [128, 6925.0] + - - [3584, 128, 1, 3328, 3584, 3584, 3328, 3328] + - [154, 8403.0] + - - [5888, 128, 1, 3328, 5888, 5888, 3328, 3328] + - [96, 5852.0] + - - [1408, 704, 1, 1280, 1408, 1408, 1280, 1280] + - [154, 8140.0] + - - [6784, 64, 1, 256, 6784, 6784, 256, 256] + - [92, 5963.0] + - - [448, 2944, 1, 256, 448, 448, 256, 256] + - [128, 7450.0] + - - [448, 2368, 1, 256, 448, 448, 256, 256] + - [113, 7177.0] + - - [64, 6784, 1, 1280, 64, 64, 1280, 1280] + - [92, 7261.0] + - - [128, 2368, 1, 3328, 128, 128, 3328, 3328] + - [157, 8252.0] + - - [5056, 64, 1, 3328, 5056, 5056, 3328, 3328] + - [131, 6434.0] + - - [64, 5888, 1, 128, 64, 64, 128, 128] + - [123, 3621.0] + - - [5056, 128, 1, 3328, 5056, 5056, 3328, 3328] + - [134, 7994.0] + - - [448, 704, 1, 256, 448, 448, 256, 256] + - [123, 4783.0] + - - [2944, 128, 1, 3328, 2944, 2944, 3328, 3328] + - [154, 6822.0] + - - [704, 704, 1, 128, 704, 704, 128, 128] + - [118, 4651.0] + - - [2368, 128, 1, 128, 2368, 2368, 128, 128] + - [118, 3927.0] + - - [5056, 128, 1, 128, 5056, 5056, 128, 128] + - [156, 5769.0] + - - [448, 1024, 1, 3328, 448, 448, 3328, 3328] + - [152, 7615.0] + - - [2368, 256, 1, 256, 2368, 2368, 256, 256] + - [90, 7607.0] + - - [256, 2368, 1, 3328, 256, 256, 3328, 3328] + - [86, 8471.0] + - - [256, 3584, 1, 128, 256, 256, 128, 128] + - [109, 6162.0] + - - [4288, 256, 1, 128, 4288, 4288, 128, 128] + - [139, 6256.0] + - - [2368, 256, 1, 128, 2368, 2368, 128, 128] + - [117, 5396.0] + - - [256, 1856, 1, 256, 256, 256, 256, 256] + - [95, 5825.0] + - - [256, 2944, 1, 128, 256, 256, 128, 128] + - [76, 5289.0] + - - [1408, 256, 1, 3328, 1408, 1408, 3328, 3328] + - [126, 6736.0] + - - [2368, 448, 1, 256, 2368, 2368, 256, 256] + - [95, 7242.0] + - - [4288, 256, 1, 3328, 4288, 4288, 3328, 3328] + - [154, 7814.0] + - - [1856, 704, 1, 128, 1856, 1856, 128, 128] + - [151, 6717.0] + - - [4288, 128, 1, 128, 4288, 4288, 128, 128] + - [89, 4879.0] + - - [1408, 448, 1, 256, 1408, 1408, 256, 256] + - [152, 6532.0] + - - [6784, 64, 1, 1280, 6784, 6784, 1280, 1280] + - [126, 7563.0] + - - [3584, 128, 1, 128, 3584, 3584, 128, 128] + - [89, 4951.0] + - - [256, 2368, 1, 256, 256, 256, 256, 256] + - [95, 6672.0] + - - [2944, 448, 1, 1280, 2944, 2944, 1280, 1280] + - [126, 8496.0] + - - [448, 1408, 1, 1280, 448, 448, 1280, 1280] + - [92, 7803.0] + - - [448, 1856, 1, 1280, 448, 448, 1280, 1280] + - [90, 8365.0] + - - [1856, 256, 1, 128, 1856, 1856, 128, 128] + - [137, 5652.0] + - - [2560, 128, 1, 2560, 2560, 2560, 2560, 2560] + - [97, 8862.0] + - - [448, 1024, 1, 256, 448, 448, 256, 256] + - [152, 5740.0] + - - [1024, 448, 1, 1280, 1024, 1024, 1280, 1280] + - [126, 7861.0] + - - [128, 5056, 1, 256, 128, 128, 256, 256] + - [119, 6979.0] + - - [448, 2944, 1, 128, 448, 448, 128, 128] + - [121, 6868.0] + - - [128, 3584, 1, 128, 128, 128, 128, 128] + - [150, 5133.0] + - - [1408, 256, 1, 256, 1408, 1408, 256, 256] + - [150, 5015.0] + - - [128, 5888, 1, 3328, 128, 128, 3328, 3328] + - [161, 6174.0] + - - [2368, 448, 1, 3328, 2368, 2368, 3328, 3328] + - [126, 8800.0] + - - [128, 5888, 1, 1280, 128, 128, 1280, 1280] + - [126, 8227.0] + - - [64, 5056, 1, 128, 64, 64, 128, 128] + - [149, 4150.0] + - - [64, 6784, 1, 128, 64, 64, 128, 128] + - [125, 4718.0] + - - [448, 704, 1, 128, 448, 448, 128, 128] + - [75, 4053.0] + - - [1408, 704, 1, 128, 1408, 1408, 128, 128] + - [150, 6720.0] + - - [2368, 256, 1, 3328, 2368, 2368, 3328, 3328] + - [97, 9097.0] + - - [5888, 128, 1, 1280, 5888, 5888, 1280, 1280] + - [95, 8221.0] + - - [256, 3584, 1, 1280, 256, 256, 1280, 1280] + - [90, 9310.0] + - - [256, 1408, 1, 128, 256, 256, 128, 128] + - [89, 4076.0] + - - [256, 4288, 1, 128, 256, 256, 128, 128] + - [76, 6736.0] + - - [5888, 128, 1, 128, 5888, 5888, 128, 128] + - [121, 6474.0] + - - [1856, 256, 1, 3328, 1856, 1856, 3328, 3328] + - [126, 8643.0] + - - [64, 5888, 1, 1280, 64, 64, 1280, 1280] + - [92, 6348.0] + - - [704, 704, 1, 1280, 704, 704, 1280, 1280] + - [90, 7342.0] + - - [128, 2368, 1, 1280, 128, 128, 1280, 1280] + - [97, 7354.0] + - - [3584, 256, 1, 1280, 3584, 3584, 1280, 1280] + - [151, 9322.0] + - - [5888, 64, 1, 1280, 5888, 5888, 1280, 1280] + - [95, 6626.0] + - - [3584, 128, 1, 1280, 3584, 3584, 1280, 1280] + - [95, 7974.0] + - - [5056, 128, 1, 1280, 5056, 5056, 1280, 1280] + - [128, 9268.0] + - - [448, 1856, 1, 3328, 448, 448, 3328, 3328] + - [128, 8654.0] + - - [1024, 448, 1, 256, 1024, 1024, 256, 256] + - [154, 6820.0] + - - [2944, 128, 1, 1280, 2944, 2944, 1280, 1280] + - [95, 6606.0] + - - [128, 2368, 1, 128, 128, 128, 128, 128] + - [106, 4924.0] + - - [256, 2944, 1, 1280, 256, 256, 1280, 1280] + - [95, 8223.0] + - - [704, 1024, 1, 3328, 704, 704, 3328, 3328] + - [95, 8106.0] + - - [128, 6784, 1, 256, 128, 128, 256, 256] + - [97, 7556.0] + - - [256, 1856, 1, 3328, 256, 256, 3328, 3328] + - [126, 8719.0] + - - [6784, 128, 1, 128, 6784, 6784, 128, 128] + - [121, 6500.0] + - - [704, 1408, 1, 256, 704, 704, 256, 256] + - [151, 7025.0] + - - [4096, 128, 1, 4096, 4096, 4096, 4096, 4096] + - [103, 6454.0] + - - [5888, 64, 1, 128, 5888, 5888, 128, 128] + - [117, 4123.0] + - - [5056, 128, 1, 256, 5056, 5056, 256, 256] + - [151, 7222.0] + - - [6784, 128, 1, 1280, 6784, 6784, 1280, 1280] + - [97, 9023.0] + - - [1856, 448, 1, 256, 1856, 1856, 256, 256] + - [121, 7119.0] + - - [128, 4288, 1, 1280, 128, 128, 1280, 1280] + - [121, 7991.0] + - - [448, 704, 1, 3328, 448, 448, 3328, 3328] + - [123, 7460.0] + - - [1856, 704, 1, 3328, 1856, 1856, 3328, 3328] + - [154, 8500.0] + - - [1024, 1024, 1, 3328, 1024, 1024, 3328, 3328] + - [95, 8717.0] + - - [2048, 200, 1, 3200, 2048, 2048, 3200, 3200] + - [139, 6516.0] + - - [2048, 256, 1, 3328, 2048, 2048, 3328, 3328] + - [97, 7961.0] + - - [4096, 200, 1, 11264, 4096, 4096, 11264, 11264] + - [157, 3184.0] + - - [2048, 512, 1, 1024, 2048, 2048, 1024, 1024] + - [121, 7824.0] + - - [1024, 1024, 1, 64, 1024, 1024, 64, 64] + - [76, 5218.0] + - - [512, 1024, 1, 1536, 512, 512, 1536, 1536] + - [90, 7802.0] + - - [1024, 512, 1, 512, 1024, 1024, 512, 512] + - [90, 6890.0] + - - [2048, 512, 1, 640, 2048, 2048, 640, 640] + - [126, 8467.0] + - - [1024, 1024, 1, 512, 1024, 1024, 512, 512] + - [126, 8095.0] + - - [2048, 256, 1, 2048, 2048, 2048, 2048, 2048] + - [157, 7734.0] + - - [1024, 512, 1, 128, 1024, 1024, 128, 128] + - [111, 5268.0] + - - [2048, 512, 1, 256, 2048, 2048, 256, 256] + - [95, 7494.0] + - - [4096, 200, 1, 2560, 4096, 4096, 2560, 2560] + - [121, 6679.0] + - - [1024, 1024, 1, 1152, 1024, 1024, 1152, 1152] + - [141, 8485.0] + - - [2048, 200, 1, 32, 2048, 2048, 32, 32] + - [155, 1158.0] + - - [512, 1024, 1, 2816, 512, 512, 2816, 2816] + - [157, 7889.0] + - - [2048, 200, 1, 2080, 2048, 2048, 2080, 2080] + - [78, 6527.0] + - - [2048, 200, 1, 1024, 2048, 2048, 1024, 1024] + - [154, 5792.0] + - - [4096, 200, 1, 4096, 4096, 4096, 4096, 4096] + - [128, 6205.0] + - - [1024, 512, 1, 11264, 1024, 1024, 11264, 11264] + - [161, 7148.0] + - - [1024, 1024, 1, 1792, 1024, 1024, 1792, 1792] + - [154, 8466.0] + - - [4096, 200, 1, 768, 4096, 4096, 768, 768] + - [154, 6570.0] + - - [4096, 256, 1, 1024, 4096, 4096, 1024, 1024] + - [154, 8339.0] + - - [1024, 512, 1, 256, 1024, 1024, 256, 256] + - [90, 6146.0] + - - [1024, 512, 1, 1408, 1024, 1024, 1408, 1408] + - [90, 7885.0] + - - [1024, 512, 1, 5632, 1024, 1024, 5632, 5632] + - [97, 8044.0] + - - [4096, 200, 1, 256, 4096, 4096, 256, 256] + - [150, 6509.0] + - - [512, 1024, 1, 3072, 512, 512, 3072, 3072] + - [97, 7912.0] + - - [1024, 1024, 1, 4160, 1024, 1024, 4160, 4160] + - [90, 8900.0] + - - [2048, 256, 1, 384, 2048, 2048, 384, 384] + - [78, 6839.0] + - - [4096, 200, 1, 640, 4096, 4096, 640, 640] + - [151, 6549.0] + - - [1024, 1024, 1, 7168, 1024, 1024, 7168, 7168] + - [154, 8703.0] + - - [4096, 256, 1, 768, 4096, 4096, 768, 768] + - [126, 8361.0] + - - [2048, 256, 1, 6656, 2048, 2048, 6656, 6656] + - [86, 7419.0] + - - [2048, 200, 1, 3072, 2048, 2048, 3072, 3072] + - [128, 6191.0] + - - [1024, 512, 1, 2816, 1024, 1024, 2816, 2816] + - [143, 7953.0] + - - [4096, 256, 1, 7680, 4096, 4096, 7680, 7680] + - [97, 7913.0] + - - [4096, 200, 1, 1024, 4096, 4096, 1024, 1024] + - [90, 6044.0] + - - [2048, 200, 1, 1792, 2048, 2048, 1792, 1792] + - [121, 6114.0] + - - [1024, 1024, 1, 2816, 1024, 1024, 2816, 2816] + - [95, 8645.0] + - - [2048, 512, 1, 1536, 2048, 2048, 1536, 1536] + - [95, 8526.0] + - - [4096, 256, 1, 3072, 4096, 4096, 3072, 3072] + - [154, 8636.0] + - - [2048, 256, 1, 5632, 2048, 2048, 5632, 5632] + - [97, 8048.0] + - - [1024, 512, 1, 6656, 1024, 1024, 6656, 6656] + - [128, 8064.0] + - - [4096, 200, 1, 2080, 4096, 4096, 2080, 2080] + - [138, 7508.0] + - - [2048, 200, 1, 13312, 2048, 2048, 13312, 13312] + - [103, 3068.0] + - - [4096, 256, 1, 3584, 4096, 4096, 3584, 3584] + - [126, 7475.0] + - - [2048, 256, 1, 8192, 2048, 2048, 8192, 8192] + - [134, 6326.0] + - - [2048, 512, 1, 512, 2048, 2048, 512, 512] + - [126, 8054.0] + - - [2048, 512, 1, 1152, 2048, 2048, 1152, 1152] + - [112, 8539.0] + - - [2048, 200, 1, 9216, 2048, 2048, 9216, 9216] + - [151, 5339.0] + - - [2048, 200, 1, 2560, 2048, 2048, 2560, 2560] + - [157, 6025.0] + - - [2048, 256, 1, 4608, 2048, 2048, 4608, 4608] + - [128, 7992.0] + - - [2048, 256, 1, 3584, 2048, 2048, 3584, 3584] + - [97, 8007.0] + - - [1024, 512, 1, 640, 1024, 1024, 640, 640] + - [121, 7285.0] + - - [2048, 512, 1, 768, 2048, 2048, 768, 768] + - [126, 8328.0] + - - [2048, 200, 1, 1408, 2048, 2048, 1408, 1408] + - [109, 6228.0] + - - [4096, 200, 1, 2048, 4096, 4096, 2048, 2048] + - [95, 6759.0] + - - [1024, 1024, 1, 5632, 1024, 1024, 5632, 5632] + - [95, 8826.0] + - - [2048, 512, 1, 3584, 2048, 2048, 3584, 3584] + - [95, 8784.0] + - - [1024, 512, 1, 64, 1024, 1024, 64, 64] + - [150, 4849.0] + - - [4096, 200, 1, 7680, 4096, 4096, 7680, 7680] + - [97, 6423.0] + - - [1024, 1024, 1, 1280, 1024, 1024, 1280, 1280] + - [95, 7799.0] + - - [2048, 200, 1, 896, 2048, 2048, 896, 896] + - [151, 5948.0] + - - [2048, 256, 1, 32, 2048, 2048, 32, 32] + - [72, 3396.0] + - - [2048, 256, 1, 1280, 2048, 2048, 1280, 1280] + - [113, 7604.0] + - - [4096, 256, 1, 4096, 4096, 4096, 4096, 4096] + - [157, 8104.0] + - - [2048, 256, 1, 11264, 2048, 2048, 11264, 11264] + - [130, 3719.0] + - - [4096, 200, 1, 9216, 4096, 4096, 9216, 9216] + - [97, 5690.0] + - - [1024, 512, 1, 4096, 1024, 1024, 4096, 4096] + - [157, 7459.0] + - - [4096, 200, 1, 3840, 4096, 4096, 3840, 3840] + - [128, 5860.0] + - - [1024, 1024, 1, 1920, 1024, 1024, 1920, 1920] + - [78, 8181.0] + - - [2048, 200, 1, 7168, 2048, 2048, 7168, 7168] + - [157, 5470.0] + - - [4096, 256, 1, 1152, 4096, 4096, 1152, 1152] + - [151, 8618.0] + - - [2048, 256, 1, 1920, 2048, 2048, 1920, 1920] + - [139, 7965.0] + - - [2048, 512, 1, 4160, 2048, 2048, 4160, 4160] + - [78, 8879.0] + - - [2048, 512, 1, 5632, 2048, 2048, 5632, 5632] + - [95, 8755.0] + - - [4096, 256, 1, 7168, 4096, 4096, 7168, 7168] + - [97, 7867.0] + - - [4096, 200, 1, 128, 4096, 4096, 128, 128] + - [146, 3533.0] + - - [2048, 200, 1, 5120, 2048, 2048, 5120, 5120] + - [157, 6293.0] + - - [1024, 1024, 1, 6656, 1024, 1024, 6656, 6656] + - [95, 8852.0] + - - [512, 1024, 1, 3200, 512, 512, 3200, 3200] + - [109, 8344.0] + - - [2048, 256, 1, 1536, 2048, 2048, 1536, 1536] + - [97, 7728.0] + - - [4096, 256, 1, 256, 4096, 4096, 256, 256] + - [113, 7411.0] + - - [2048, 512, 1, 1408, 2048, 2048, 1408, 1408] + - [141, 8638.0] + - - [1024, 512, 1, 2080, 1024, 1024, 2080, 2080] + - [78, 8498.0] + - - [2048, 512, 1, 2304, 2048, 2048, 2304, 2304] + - [95, 8709.0] + - - [4096, 200, 1, 512, 4096, 4096, 512, 512] + - [150, 6405.0] + - - [2048, 200, 1, 1280, 2048, 2048, 1280, 1280] + - [90, 6008.0] + - - [1024, 1024, 1, 2304, 1024, 1024, 2304, 2304] + - [95, 8696.0] + - - [2048, 512, 1, 4608, 2048, 2048, 4608, 4608] + - [95, 8828.0] + - - [4096, 256, 1, 6144, 4096, 4096, 6144, 6144] + - [100, 5780.0] + - - [4096, 256, 1, 896, 4096, 4096, 896, 896] + - [139, 8166.0] + - - [2048, 256, 1, 640, 2048, 2048, 640, 640] + - [109, 7388.0] + - - [2048, 512, 1, 384, 2048, 2048, 384, 384] + - [150, 8389.0] + - - [2048, 200, 1, 16384, 2048, 2048, 16384, 16384] + - [145, 4938.0] + - - [4096, 200, 1, 10240, 4096, 4096, 10240, 10240] + - [95, 4543.0] + - - [1024, 512, 1, 9216, 1024, 1024, 9216, 9216] + - [128, 7927.0] + - - [4096, 200, 1, 1920, 4096, 4096, 1920, 1920] + - [121, 6833.0] + - - [2048, 512, 1, 7680, 2048, 2048, 7680, 7680] + - [97, 7232.0] + - - [1024, 512, 1, 3584, 1024, 1024, 3584, 3584] + - [128, 7933.0] + - - [1024, 1024, 1, 32, 1024, 1024, 32, 32] + - [118, 3322.0] + - - [2048, 512, 1, 1664, 2048, 2048, 1664, 1664] + - [112, 8654.0] + - - [2048, 200, 1, 2048, 2048, 2048, 2048, 2048] + - [90, 6140.0] + - - [1024, 1024, 1, 3584, 1024, 1024, 3584, 3584] + - [95, 8811.0] + - - [4096, 256, 1, 6656, 4096, 4096, 6656, 6656] + - [97, 7129.0] + - - [4096, 256, 1, 4160, 4096, 4096, 4160, 4160] + - [126, 8099.0] + - - [2048, 256, 1, 3072, 2048, 2048, 3072, 3072] + - [128, 7764.0] + - - [2048, 256, 1, 8320, 2048, 2048, 8320, 8320] + - [86, 7204.0] + - - [1024, 512, 1, 3200, 1024, 1024, 3200, 3200] + - [139, 8291.0] + - - [1024, 512, 1, 896, 1024, 1024, 896, 896] + - [78, 7863.0] + - - [2048, 512, 1, 1280, 2048, 2048, 1280, 1280] + - [95, 8480.0] + - - [4096, 200, 1, 64, 4096, 4096, 64, 64] + - [135, 3936.0] + - - [1024, 1024, 1, 5120, 1024, 1024, 5120, 5120] + - [154, 8826.0] + - - [2048, 512, 1, 6656, 2048, 2048, 6656, 6656] + - [97, 8192.0] + - - [1024, 1024, 1, 128, 1024, 1024, 128, 128] + - [89, 6541.0] + - - [512, 1024, 1, 1792, 512, 512, 1792, 1792] + - [128, 7822.0] + - - [4096, 256, 1, 2816, 4096, 4096, 2816, 2816] + - [154, 8753.0] + - - [1024, 1024, 1, 4096, 1024, 1024, 4096, 4096] + - [97, 8490.0] + - - [2048, 200, 1, 4160, 2048, 2048, 4160, 4160] + - [151, 6773.0] + - - [1024, 512, 1, 768, 1024, 1024, 768, 768] + - [157, 7039.0] + - - [4096, 200, 1, 8320, 4096, 4096, 8320, 8320] + - [154, 6852.0] + - - [2048, 512, 1, 896, 2048, 2048, 896, 896] + - [150, 7953.0] + - - [4096, 200, 1, 7168, 4096, 4096, 7168, 7168] + - [103, 5311.0] + - - [2048, 200, 1, 3840, 2048, 2048, 3840, 3840] + - [143, 5933.0] + - - [1024, 1024, 1, 768, 1024, 1024, 768, 768] + - [126, 8139.0] + - - [4096, 256, 1, 2304, 4096, 4096, 2304, 2304] + - [95, 8687.0] + - - [2048, 200, 1, 16640, 2048, 2048, 16640, 16640] + - [103, 5635.0] + - - [2048, 256, 1, 2816, 2048, 2048, 2816, 2816] + - [97, 7400.0] + - - [1024, 512, 1, 384, 1024, 1024, 384, 384] + - [89, 6853.0] + - - [2048, 200, 1, 7680, 2048, 2048, 7680, 7680] + - [103, 5501.0] + - - [1024, 512, 1, 4608, 1024, 1024, 4608, 4608] + - [157, 7955.0] + - - [4096, 200, 1, 32, 4096, 4096, 32, 32] + - [124, 3236.0] + - - [4096, 200, 1, 3328, 4096, 4096, 3328, 3328] + - [97, 6372.0] + - - [1024, 1024, 1, 1408, 1024, 1024, 1408, 1408] + - [112, 8566.0] + - - [2048, 200, 1, 15360, 2048, 2048, 15360, 15360] + - [103, 5629.0] + - - [512, 1024, 1, 2048, 512, 512, 2048, 2048] + - [97, 7071.0] + - - [4096, 256, 1, 5632, 4096, 4096, 5632, 5632] + - [97, 4587.0] + - - [2048, 256, 1, 1408, 2048, 2048, 1408, 1408] + - [121, 7892.0] + - - [2048, 256, 1, 6144, 2048, 2048, 6144, 6144] + - [157, 8013.0] + - - [4096, 256, 1, 3328, 4096, 4096, 3328, 3328] + - [86, 7989.0] + - - [2048, 512, 1, 6144, 2048, 2048, 6144, 6144] + - [97, 8353.0] + - - [2048, 512, 1, 3200, 2048, 2048, 3200, 3200] + - [141, 8713.0] + - - [2048, 200, 1, 4608, 2048, 2048, 4608, 4608] + - [90, 6388.0] + - - [1024, 1024, 1, 6144, 1024, 1024, 6144, 6144] + - [154, 8787.0] + - - [4096, 256, 1, 1664, 4096, 4096, 1664, 1664] + - [139, 8700.0] + - - [2048, 200, 1, 384, 2048, 2048, 384, 384] + - [119, 5285.0] + - - [4096, 256, 1, 1792, 4096, 4096, 1792, 1792] + - [154, 8674.0] + - - [2048, 512, 1, 2816, 2048, 2048, 2816, 2816] + - [95, 8743.0] + - - [4096, 256, 1, 384, 4096, 4096, 384, 384] + - [121, 7989.0] + - - [2048, 256, 1, 128, 2048, 2048, 128, 128] + - [89, 5023.0] + - - [1024, 1024, 1, 640, 1024, 1024, 640, 640] + - [112, 8303.0] + - - [4096, 200, 1, 5632, 4096, 4096, 5632, 5632] + - [97, 4444.0] + - - [2048, 200, 1, 1152, 2048, 2048, 1152, 1152] + - [128, 6068.0] + - - [4096, 256, 1, 512, 4096, 4096, 512, 512] + - [126, 8107.0] + - - [1024, 1024, 1, 384, 1024, 1024, 384, 384] + - [150, 8171.0] + - - [2048, 200, 1, 512, 2048, 2048, 512, 512] + - [121, 5470.0] + - - [2048, 256, 1, 9216, 2048, 2048, 9216, 9216] + - [103, 5089.0] + - - [2048, 256, 1, 1792, 2048, 2048, 1792, 1792] + - [128, 7820.0] + - - [4096, 200, 1, 1792, 4096, 4096, 1792, 1792] + - [126, 6799.0] + - - [2048, 200, 1, 1536, 2048, 2048, 1536, 1536] + - [90, 6122.0] + - - [1024, 1024, 1, 3072, 1024, 1024, 3072, 3072] + - [97, 8435.0] + - - [1024, 1024, 1, 2080, 1024, 1024, 2080, 2080] + - [107, 9590.0] + - - [2048, 200, 1, 2304, 2048, 2048, 2304, 2304] + - [90, 6217.0] + - - [2048, 256, 1, 7168, 2048, 2048, 7168, 7168] + - [134, 6845.0] + - - [2048, 512, 1, 1792, 2048, 2048, 1792, 1792] + - [126, 8656.0] + - - [1024, 1024, 1, 4608, 1024, 1024, 4608, 4608] + - [95, 8841.0] + - - [512, 1024, 1, 1280, 512, 512, 1280, 1280] + - [90, 7748.0] + - - [2048, 256, 1, 3200, 2048, 2048, 3200, 3200] + - [139, 8320.0] + - - [1024, 512, 1, 3328, 1024, 1024, 3328, 3328] + - [97, 8016.0] + - - [1024, 512, 1, 4160, 1024, 1024, 4160, 4160] + - [109, 8736.0] + - - [4096, 200, 1, 6656, 4096, 4096, 6656, 6656] + - [97, 5596.0] + - - [2048, 200, 1, 3328, 2048, 2048, 3328, 3328] + - [90, 6161.0] + - - [1024, 1024, 1, 256, 1024, 1024, 256, 256] + - [97, 7822.0] + - - [2048, 256, 1, 64, 2048, 2048, 64, 64] + - [76, 4964.0] + - - [2048, 256, 1, 2304, 2048, 2048, 2304, 2304] + - [157, 7902.0] + - - [4096, 200, 1, 8192, 4096, 4096, 8192, 8192] + - [103, 5145.0] + - - [1024, 512, 1, 7168, 1024, 1024, 7168, 7168] + - [128, 7854.0] + - - [1024, 512, 1, 1792, 1024, 1024, 1792, 1792] + - [143, 7771.0] + - - [4096, 200, 1, 2816, 4096, 4096, 2816, 2816] + - [126, 6901.0] + - - [1024, 1024, 1, 896, 1024, 1024, 896, 896] + - [90, 8515.0] + - - [4096, 256, 1, 5120, 4096, 4096, 5120, 5120] + - [157, 5939.0] + - - [4096, 256, 1, 2048, 4096, 4096, 2048, 2048] + - [126, 8481.0] + - - [2048, 256, 1, 5120, 2048, 2048, 5120, 5120] + - [128, 7998.0] + - - [2048, 256, 1, 7680, 2048, 2048, 7680, 7680] + - [145, 6731.0] + - - [2048, 200, 1, 3584, 2048, 2048, 3584, 3584] + - [90, 6252.0] + - - [1024, 512, 1, 1536, 1024, 1024, 1536, 1536] + - [97, 7659.0] + - - [2048, 200, 1, 64, 2048, 2048, 64, 64] + - [107, 2819.0] + - - [2048, 200, 1, 4096, 2048, 2048, 4096, 4096] + - [97, 6142.0] + - - [1024, 1024, 1, 1536, 1024, 1024, 1536, 1536] + - [95, 8453.0] + - - [4096, 256, 1, 32, 4096, 4096, 32, 32] + - [82, 3745.0] + - - [4096, 256, 1, 1280, 4096, 4096, 1280, 1280] + - [126, 8546.0] + - - [2048, 256, 1, 1024, 2048, 2048, 1024, 1024] + - [97, 7419.0] + - - [1024, 512, 1, 1152, 1024, 1024, 1152, 1152] + - [78, 7785.0] + - - [2048, 512, 1, 3328, 2048, 2048, 3328, 3328] + - [95, 8805.0] + - - [4096, 200, 1, 3584, 4096, 4096, 3584, 3584] + - [154, 6123.0] + - - [2048, 200, 1, 256, 2048, 2048, 256, 256] + - [150, 4806.0] + - - [4096, 256, 1, 1920, 4096, 4096, 1920, 1920] + - [141, 8734.0] + - - [2048, 256, 1, 1664, 2048, 2048, 1664, 1664] + - [109, 7983.0] + - - [4096, 200, 1, 5120, 4096, 4096, 5120, 5120] + - [128, 5690.0] + - - [1024, 512, 1, 8192, 1024, 1024, 8192, 8192] + - [157, 7356.0] + - - [4096, 200, 1, 896, 4096, 4096, 896, 896] + - [151, 6659.0] + - - [2048, 200, 1, 640, 2048, 2048, 640, 640] + - [139, 5838.0] + - - [4096, 200, 1, 1408, 4096, 4096, 1408, 1408] + - [121, 6801.0] + - - [2048, 200, 1, 5632, 2048, 2048, 5632, 5632] + - [90, 6423.0] + - - [1024, 512, 1, 2560, 1024, 1024, 2560, 2560] + - [97, 7896.0] + - - [4096, 200, 1, 1280, 4096, 4096, 1280, 1280] + - [126, 6697.0] + - - [1024, 1024, 1, 2560, 1024, 1024, 2560, 2560] + - [95, 8636.0] + - - [2048, 512, 1, 64, 2048, 2048, 64, 64] + - [89, 5092.0] + - - [2048, 200, 1, 8192, 2048, 2048, 8192, 8192] + - [103, 5355.0] + - - [2048, 512, 1, 3072, 2048, 2048, 3072, 3072] + - [157, 8319.0] + - - [4096, 256, 1, 640, 4096, 4096, 640, 640] + - [139, 8271.0] + - - [2048, 256, 1, 4096, 2048, 2048, 4096, 4096] + - [157, 7971.0] + - - [4096, 200, 1, 1664, 4096, 4096, 1664, 1664] + - [121, 6799.0] + - - [2048, 200, 1, 6656, 2048, 2048, 6656, 6656] + - [113, 6202.0] + - - [512, 1024, 1, 768, 512, 512, 768, 768] + - [86, 7326.0] + - - [2048, 200, 1, 8320, 2048, 2048, 8320, 8320] + - [86, 5915.0] + - - [4096, 256, 1, 3840, 4096, 4096, 3840, 3840] + - [97, 7492.0] + - - [1024, 1024, 1, 3200, 1024, 1024, 3200, 3200] + - [154, 8783.0] + - - [4096, 256, 1, 4608, 4096, 4096, 4608, 4608] + - [97, 6271.0] + - - [1024, 512, 1, 32, 1024, 1024, 32, 32] + - [85, 2778.0] + - - [1024, 512, 1, 3840, 1024, 1024, 3840, 3840] + - [143, 7976.0] + - - [2048, 512, 1, 1920, 2048, 2048, 1920, 1920] + - [126, 8675.0] + - - [4096, 200, 1, 6144, 4096, 4096, 6144, 6144] + - [97, 6627.0] + - - [2048, 200, 1, 2816, 2048, 2048, 2816, 2816] + - [121, 5949.0] + - - [1024, 1024, 1, 3840, 1024, 1024, 3840, 3840] + - [95, 8478.0] + - - [2048, 256, 1, 3840, 2048, 2048, 3840, 3840] + - [157, 7920.0] + - - [1024, 512, 1, 7680, 1024, 1024, 7680, 7680] + - [143, 8105.0] + - - [2048, 200, 1, 10240, 2048, 2048, 10240, 10240] + - [103, 4313.0] + - - [2048, 512, 1, 5120, 2048, 2048, 5120, 5120] + - [154, 8604.0] + - - [512, 1024, 1, 512, 512, 512, 512, 512] + - [128, 6940.0] + - - [2048, 512, 1, 32, 2048, 2048, 32, 32] + - [104, 3322.0] + - - [4096, 256, 1, 2560, 4096, 4096, 2560, 2560] + - [154, 8758.0] + - - [4096, 256, 1, 64, 4096, 4096, 64, 64] + - [89, 5008.0] + - - [2048, 200, 1, 768, 2048, 2048, 768, 768] + - [151, 5933.0] + - - [2048, 512, 1, 2560, 2048, 2048, 2560, 2560] + - [95, 8710.0] + - - [2048, 512, 1, 7168, 2048, 2048, 7168, 7168] + - [97, 7431.0] + - - [2048, 512, 1, 128, 2048, 2048, 128, 128] + - [119, 6404.0] + - - [4096, 200, 1, 2304, 4096, 4096, 2304, 2304] + - [154, 6852.0] + - - [2048, 512, 1, 4096, 2048, 2048, 4096, 4096] + - [97, 8383.0] + - - [2048, 256, 1, 2560, 2048, 2048, 2560, 2560] + - [97, 7910.0] + - - [2048, 256, 1, 4160, 2048, 2048, 4160, 4160] + - [109, 8685.0] + - - [1024, 512, 1, 1664, 1024, 1024, 1664, 1664] + - [109, 8033.0] + - - [2048, 512, 1, 2080, 2048, 2048, 2080, 2080] + - [76, 9642.0] + - - [2048, 512, 1, 3840, 2048, 2048, 3840, 3840] + - [95, 8731.0] + - - [4096, 200, 1, 3072, 4096, 4096, 3072, 3072] + - [154, 6869.0] + - - [1024, 1024, 1, 1664, 1024, 1024, 1664, 1664] + - [112, 8682.0] + - - [512, 1024, 1, 2304, 512, 512, 2304, 2304] + - [128, 7898.0] + - - [4096, 256, 1, 1408, 4096, 4096, 1408, 1408] + - [154, 8651.0] + - - [2048, 256, 1, 1152, 2048, 2048, 1152, 1152] + - [121, 7820.0] + - - [1024, 512, 1, 1280, 1024, 1024, 1280, 1280] + - [90, 7682.0] + - - [2048, 200, 1, 12288, 2048, 2048, 12288, 12288] + - [152, 2899.0] + - - [2048, 200, 1, 1664, 2048, 2048, 1664, 1664] + - [139, 5979.0] + - - [4096, 200, 1, 4608, 4096, 4096, 4608, 4608] + - [97, 5662.0] + - - [512, 1024, 1, 2560, 512, 512, 2560, 2560] + - [128, 8027.0] + - - [4096, 200, 1, 384, 4096, 4096, 384, 384] + - [121, 6232.0] + - - [2048, 200, 1, 128, 2048, 2048, 128, 128] + - [81, 4052.0] + - - [2048, 200, 1, 11264, 2048, 2048, 11264, 11264] + - [123, 3155.0] + - - [1024, 512, 1, 1920, 1024, 1024, 1920, 1920] + - [139, 8017.0] + - - [4096, 256, 1, 1536, 4096, 4096, 1536, 1536] + - [126, 8628.0] + - - [2048, 256, 1, 256, 2048, 2048, 256, 256] + - [90, 6278.0] + - - [2048, 256, 1, 10240, 2048, 2048, 10240, 10240] + - [103, 5405.0] + - - [1024, 512, 1, 5120, 1024, 1024, 5120, 5120] + - [157, 7939.0] + - - [1024, 512, 1, 8320, 1024, 1024, 8320, 8320] + - [139, 8483.0] + - - [1024, 512, 1, 10240, 1024, 1024, 10240, 10240] + - [157, 7618.0] + - - [1024, 1024, 1, 2048, 1024, 1024, 2048, 2048] + - [154, 8079.0] + - - [2048, 256, 1, 2080, 2048, 2048, 2080, 2080] + - [109, 8504.0] + - - [4096, 256, 1, 128, 4096, 4096, 128, 128] + - [151, 6337.0] + - - [2048, 256, 1, 896, 2048, 2048, 896, 896] + - [121, 7604.0] + - - [4096, 200, 1, 1152, 4096, 4096, 1152, 1152] + - [151, 6711.0] + - - [2048, 200, 1, 6144, 2048, 2048, 6144, 6144] + - [128, 6315.0] + - - [1024, 1024, 1, 7680, 1024, 1024, 7680, 7680] + - [141, 8530.0] + - - [2048, 200, 1, 1920, 2048, 2048, 1920, 1920] + - [139, 6285.0] + - - [4096, 256, 1, 2080, 4096, 4096, 2080, 2080] + - [89, 9593.0] + - - [2048, 200, 1, 14336, 2048, 2048, 14336, 14336] + - [103, 5565.0] + - - [1024, 512, 1, 6144, 1024, 1024, 6144, 6144] + - [157, 7788.0] + - - [1024, 512, 1, 2304, 1024, 1024, 2304, 2304] + - [90, 7904.0] + - - [4096, 200, 1, 4160, 4096, 4096, 4160, 4160] + - [128, 6542.0] + - - [4096, 200, 1, 1536, 4096, 4096, 1536, 1536] + - [154, 6787.0] + - - [2048, 320, 1, 64, 2048, 2048, 64, 64] + - [76, 4018.0] + - - [2048, 384, 1, 64, 2048, 2048, 64, 64] + - [107, 4849.0] + - - [1024, 384, 1, 289, 1024, 1024, 289, 289] + - [107, 5919.0] + - - [2048, 448, 1, 64, 2048, 2048, 64, 64] + - [107, 5943.0] + - - [102, 101, 624, 64, 102, 102, 64, 64] + - [137, 4484.0] + - - [101, 101, 624, 64, 101, 101, 64, 64] + - [81, 4470.0] + - - [85, 85, 752, 64, 85, 85, 64, 64] + - [136, 4054.0] + - - [112, 111, 576, 64, 112, 112, 64, 64] + - [110, 5661.0] + - - [65, 65, 992, 64, 65, 65, 64, 64] + - [105, 2663.0] + - - [77, 77, 816, 64, 77, 77, 64, 64] + - [105, 3599.0] + - - [111, 111, 576, 64, 111, 111, 64, 64] + - [81, 5017.0] + - - [84, 85, 752, 64, 84, 84, 64, 64] + - [73, 4046.0] + - - [84, 84, 752, 64, 84, 84, 64, 64] + - [73, 4051.0] + - - [71, 71, 896, 64, 71, 71, 64, 64] + - [87, 3100.0] + - - [122, 122, 528, 64, 122, 122, 64, 64] + - [83, 5248.0] + - - [78, 78, 816, 64, 78, 78, 64, 64] + - [73, 3689.0] + - - [112, 112, 576, 64, 112, 112, 64, 64] + - [140, 5712.0] + - - [77, 78, 816, 64, 77, 77, 64, 64] + - [105, 3605.0] + - - [111, 112, 576, 64, 111, 111, 64, 64] + - [140, 5079.0] + - - [92, 93, 688, 64, 92, 92, 64, 64] + - [105, 4345.0] + - - [102, 102, 624, 64, 102, 102, 64, 64] + - [137, 4537.0] + - - [99, 99, 624, 64, 99, 99, 64, 64] + - [81, 4401.0] + - - [100, 102, 624, 64, 100, 100, 64, 64] + - [81, 4469.0] + - - [123, 122, 528, 64, 123, 123, 64, 64] + - [83, 5312.0] + - - [99, 102, 624, 64, 99, 99, 64, 64] + - [81, 4511.0] + - - [93, 93, 688, 64, 93, 93, 64, 64] + - [73, 4342.0] + - - [123, 123, 528, 64, 123, 123, 64, 64] + - [153, 5255.0] + - - [100, 100, 624, 64, 100, 100, 64, 64] + - [140, 4413.0] + - - [101, 102, 624, 64, 101, 101, 64, 64] + - [81, 4534.0] + - - [102, 100, 624, 64, 102, 102, 64, 64] + - [140, 4527.0] + - - [92, 92, 688, 64, 92, 92, 64, 64] + - [73, 4317.0] + - - [3072, 128, 1, 4096, 3072, 3072, 4096, 4096] + - [123, 6954.0] + - - [1728, 320, 1, 64, 1728, 1728, 64, 64] + - [118, 3898.0] + - - [1440, 320, 1, 196, 1440, 1440, 196, 196] + - [119, 6369.0] + - - [2592, 384, 1, 289, 2592, 2592, 289, 289] + - [138, 7808.0] + - - [192, 80, 36, 10368, 192, 192, 10368, 10368] + - [127, 4087.0] + - - [1280, 384, 1, 64, 1280, 1280, 64, 64] + - [77, 4117.0] + - - [1280, 448, 1, 64, 1280, 1280, 64, 64] + - [76, 3896.0] + - - [3456, 256, 1, 169, 3456, 3456, 169, 169] + - [121, 7007.0] + - - [2304, 256, 1, 196, 2304, 2304, 196, 196] + - [76, 6509.0] + - - [224, 192, 36, 2592, 224, 224, 2592, 2592] + - [84, 8433.0] + - - [192, 128, 36, 1568, 192, 192, 1568, 1568] + - [78, 7370.0] + - - [1296, 288, 1, 196, 1296, 1296, 196, 196] + - [150, 4630.0] + - - [192, 64, 36, 6272, 192, 192, 6272, 6272] + - [152, 5683.0] + - - [1728, 224, 1, 1225, 1728, 1728, 1225, 1225] + - [76, 7333.0] + - - [1152, 384, 1, 64, 1152, 1152, 64, 64] + - [72, 3300.0] + - - [1792, 256, 1, 289, 1792, 1792, 289, 289] + - [154, 6827.0] + - - [1728, 384, 1, 169, 1728, 1728, 169, 169] + - [73, 6182.0] + - - [1568, 256, 1, 289, 1568, 1568, 289, 289] + - [76, 6093.0] + - - [1152, 448, 1, 64, 1152, 1152, 64, 64] + - [75, 3762.0] + - - [1536, 256, 1, 64, 1536, 1536, 64, 64] + - [147, 3329.0] + - - [1440, 320, 1, 49, 1440, 1440, 49, 49] + - [104, 3068.0] + - - [1344, 512, 1, 64, 1344, 1344, 64, 64] + - [118, 4352.0] + - - [1152, 256, 1, 196, 1152, 1152, 196, 196] + - [138, 4857.0] + - - [1728, 192, 1, 1225, 1728, 1728, 1225, 1225] + - [138, 6344.0] + - - [2048, 512, 1, 49, 2048, 2048, 49, 49] + - [104, 4875.0] + - - [512, 2048, 1, 49, 512, 512, 49, 49] + - [73, 4347.0] + - - [1728, 192, 1, 64, 1728, 1728, 64, 64] + - [104, 4115.0] + - - [1536, 384, 1, 64, 1536, 1536, 64, 64] + - [107, 5393.0] + - - [2048, 192, 1, 64, 2048, 2048, 64, 64] + - [137, 4354.0] + - - [128, 96, 36, 1568, 128, 128, 1568, 1568] + - [151, 6857.0] + - - [128, 128, 36, 3136, 128, 128, 3136, 3136] + - [128, 8771.0] + - - [1280, 320, 1, 64, 1280, 1280, 64, 64] + - [74, 3682.0] + - - [1792, 320, 1, 289, 1792, 1792, 289, 289] + - [138, 6519.0] + - - [2880, 320, 1, 64, 2880, 2880, 64, 64] + - [105, 5201.0] + - - [1728, 384, 1, 49, 1728, 1728, 49, 49] + - [106, 3519.0] + - - [512, 1024, 1, 196, 512, 512, 196, 196] + - [138, 5919.0] + - - [224, 192, 36, 5184, 224, 224, 5184, 5184] + - [141, 8466.0] + - - [192, 80, 36, 20736, 192, 192, 20736, 20736] + - [156, 2734.0] + - - [224, 192, 64, 4608, 224, 224, 4608, 4608] + - [129, 3705.0] + - - [224, 192, 64, 2304, 224, 224, 2304, 2304] + - [128, 6943.0] + - - [192, 80, 49, 14400, 192, 192, 14400, 14400] + - [97, 2968.0] + - - [224, 192, 49, 6272, 224, 224, 6272, 6272] + - [143, 5807.0] + - - [224, 192, 49, 3136, 224, 224, 3136, 3136] + - [84, 8432.0] + - - [192, 80, 36, 41472, 192, 192, 41472, 41472] + - [98, 3106.0] + - - [192, 80, 49, 28800, 192, 192, 28800, 28800] + - [115, 3356.0] + - - [192, 80, 64, 9216, 192, 192, 9216, 9216] + - [134, 2339.0] + - - [256, 224, 9, 9792, 256, 256, 9792, 9792] + - [113, 8048.0] + - - [256, 256, 9, 4896, 256, 256, 4896, 4896] + - [151, 9618.0] + - - [320, 256, 9, 4896, 320, 320, 4896, 4896] + - [139, 8277.0] + - - [224, 192, 9, 19584, 224, 224, 19584, 19584] + - [141, 6956.0] + - - [192, 192, 11, 3264, 192, 192, 3264, 3264] + - [105, 7114.0] + - - [192, 192, 11, 6528, 192, 192, 6528, 6528] + - [125, 6437.0] + - - [192, 192, 9, 4896, 192, 192, 4896, 4896] + - [107, 6538.0] + - - [224, 192, 11, 6528, 224, 224, 6528, 6528] + - [84, 6608.0] + - - [192, 192, 9, 19584, 192, 192, 19584, 19584] + - [126, 5735.0] + - - [256, 224, 11, 13056, 256, 256, 13056, 13056] + - [103, 5102.0] + - - [224, 192, 11, 13056, 224, 224, 13056, 13056] + - [100, 5157.0] + - - [256, 256, 11, 3264, 256, 256, 3264, 3264] + - [112, 7031.0] + - - [320, 256, 11, 6528, 320, 320, 6528, 6528] + - [84, 7525.0] + - - [192, 192, 9, 9792, 192, 192, 9792, 9792] + - [123, 6867.0] + - - [224, 224, 9, 9792, 224, 224, 9792, 9792] + - [97, 7025.0] + - - [224, 192, 11, 3264, 224, 224, 3264, 3264] + - [76, 7027.0] + - - [224, 224, 11, 6528, 224, 224, 6528, 6528] + - [141, 6133.0] + - - [224, 224, 9, 19584, 224, 224, 19584, 19584] + - [157, 6544.0] + - - [192, 192, 11, 13056, 192, 192, 13056, 13056] + - [99, 5072.0] + - - [224, 224, 9, 4896, 224, 224, 4896, 4896] + - [128, 6507.0] + - - [320, 256, 11, 3264, 320, 320, 3264, 3264] + - [143, 6837.0] + - - [256, 256, 11, 6528, 256, 256, 6528, 6528] + - [128, 7720.0] + - - [224, 192, 9, 4896, 224, 224, 4896, 4896] + - [76, 7898.0] + - - [224, 224, 11, 13056, 224, 224, 13056, 13056] + - [128, 5748.0] + - - [224, 224, 11, 3264, 224, 224, 3264, 3264] + - [76, 6728.0] + - - [256, 224, 11, 6528, 256, 256, 6528, 6528] + - [154, 6989.0] + - - [256, 224, 11, 3264, 256, 256, 3264, 3264] + - [76, 7896.0] + - - [224, 192, 9, 9792, 224, 224, 9792, 9792] + - [107, 7436.0] + - - [256, 224, 9, 4896, 256, 256, 4896, 4896] + - [76, 8043.0] + - - [64, 64, 496, 64, 64, 64, 64, 64] + - [125, 4521.0] + - - [135, 135, 32, 64, 135, 135, 64, 64] + - [136, 2740.0] + - - [64, 65, 496, 64, 64, 64, 64, 64] + - [140, 3347.0] + - - [65, 65, 472, 64, 65, 65, 64, 64] + - [136, 2386.0] + - - [65, 65, 496, 64, 65, 65, 64, 64] + - [136, 2409.0] + - - [70, 70, 216, 64, 70, 70, 64, 64] + - [136, 2342.0] + - - [70, 71, 216, 64, 70, 70, 64, 64] + - [105, 2384.0] + - - [71, 71, 216, 64, 71, 71, 64, 64] + - [73, 2452.0] + - - [71, 71, 448, 64, 71, 71, 64, 64] + - [105, 2690.0] + - - [77, 77, 248, 64, 77, 77, 64, 64] + - [136, 2725.0] + - - [77, 77, 408, 64, 77, 77, 64, 64] + - [105, 2842.0] + - - [77, 78, 248, 64, 77, 77, 64, 64] + - [73, 2724.0] + - - [77, 78, 408, 64, 77, 77, 64, 64] + - [105, 2870.0] + - - [78, 78, 248, 64, 78, 78, 64, 64] + - [105, 2767.0] + - - [78, 78, 408, 64, 78, 78, 64, 64] + - [73, 2901.0] + - - [80, 80, 152, 64, 80, 80, 64, 64] + - [136, 2779.0] + - - [80, 84, 152, 64, 80, 80, 64, 64] + - [73, 2958.0] + - - [84, 84, 152, 64, 84, 84, 64, 64] + - [136, 2851.0] + - - [85, 85, 376, 64, 85, 85, 64, 64] + - [105, 3168.0] + - - [93, 93, 344, 64, 93, 93, 64, 64] + - [79, 3638.0] + - - [102, 102, 312, 64, 102, 102, 64, 64] + - [140, 3847.0] + - - [112, 112, 288, 64, 112, 112, 64, 64] + - [81, 4732.0] + - - [122, 122, 264, 64, 122, 122, 64, 64] + - [83, 4608.0] + - - [123, 122, 264, 64, 123, 123, 64, 64] + - [142, 4580.0] + - - [123, 123, 264, 64, 123, 123, 64, 64] + - [83, 4843.0] + - - [511, 2048, 1, 2048, 511, 511, 2048, 2048] + - [95, 8594.0] + - - [1024, 512, 1, 1025, 1024, 1024, 1025, 1025] + - [90, 8130.0] + - - [512, 1023, 1, 1024, 512, 512, 1024, 1024] + - [157, 7325.0] + - - [1025, 1024, 1, 1024, 1025, 1025, 1024, 1024] + - [121, 8109.0] + - - [2048, 513, 1, 2048, 2048, 2048, 2048, 2048] + - [97, 8222.0] + - - [1024, 1024, 1, 1025, 1024, 1024, 1025, 1025] + - [76, 8745.0] + - - [960, 1024, 1, 1023, 960, 960, 1023, 1023] + - [76, 8337.0] + - - [1024, 1024, 1, 1024, 1024, 1024, 1024, 1024] + - [126, 8439.0] + - - [960, 1025, 1, 1024, 960, 960, 1024, 1024] + - [95, 7766.0] + - - [2049, 512, 1, 2048, 2049, 2049, 2048, 2048] + - [95, 8599.0] + - - [513, 1024, 1, 1024, 513, 513, 1024, 1024] + - [151, 7401.0] + - - [512, 2048, 1, 2048, 512, 512, 2048, 2048] + - [95, 8643.0] + - - [1024, 511, 1, 1024, 1024, 1024, 1024, 1024] + - [157, 7318.0] + - - [1024, 512, 1, 1023, 1024, 1024, 1023, 1023] + - [78, 8010.0] + - - [960, 1024, 1, 1025, 960, 960, 1025, 1025] + - [138, 8307.0] + - - [959, 1024, 1, 1024, 959, 959, 1024, 1024] + - [150, 7953.0] + - - [2048, 512, 1, 2049, 2048, 2048, 2049, 2049] + - [109, 8701.0] + - - [511, 1024, 1, 1024, 511, 511, 1024, 1024] + - [90, 7436.0] + - - [512, 2049, 1, 2048, 512, 512, 2048, 2048] + - [154, 8601.0] + - - [1024, 513, 1, 1024, 1024, 1024, 1024, 1024] + - [97, 7469.0] + - - [2048, 512, 1, 2047, 2048, 2048, 2047, 2047] + - [107, 8793.0] + - - [1025, 512, 1, 1024, 1025, 1025, 1024, 1024] + - [90, 7556.0] + - - [1024, 1024, 1, 1023, 1024, 1024, 1023, 1023] + - [76, 8884.0] + - - [513, 2048, 1, 2048, 513, 513, 2048, 2048] + - [97, 8257.0] + - - [1024, 1025, 1, 1024, 1024, 1024, 1024, 1024] + - [126, 8254.0] + - - [512, 2048, 1, 2049, 512, 512, 2049, 2049] + - [109, 8738.0] + - - [1024, 1023, 1, 1024, 1024, 1024, 1024, 1024] + - [95, 8257.0] + - - [960, 1023, 1, 1024, 960, 960, 1024, 1024] + - [150, 8025.0] + - - [2048, 511, 1, 2048, 2048, 2048, 2048, 2048] + - [95, 8481.0] + - - [1023, 512, 1, 1024, 1023, 1023, 1024, 1024] + - [157, 7095.0] + - - [2047, 512, 1, 2048, 2047, 2047, 2048, 2048] + - [126, 8443.0] + - - [512, 1024, 1, 1024, 512, 512, 1024, 1024] + - [90, 7415.0] + - - [512, 1024, 1, 1025, 512, 512, 1025, 1025] + - [109, 7968.0] + - - [512, 2047, 1, 2048, 512, 512, 2048, 2048] + - [126, 8588.0] + - - [512, 1025, 1, 1024, 512, 512, 1024, 1024] + - [90, 7569.0] + - - [512, 2048, 1, 2047, 512, 512, 2047, 2047] + - [78, 8767.0] + - - [960, 1024, 1, 1024, 960, 960, 1024, 1024] + - [95, 7887.0] + - - [961, 1024, 1, 1024, 961, 961, 1024, 1024] + - [95, 7873.0] + - - [512, 1024, 1, 1023, 512, 512, 1023, 1023] + - [78, 8065.0] + - - [1023, 1024, 1, 1024, 1023, 1023, 1024, 1024] + - [95, 8314.0] + - - [479, 1024, 1, 1024, 479, 479, 1024, 1024] + - [128, 6869.0] + - - [479, 2048, 1, 2048, 479, 479, 2048, 2048] + - [95, 8078.0] + - - [480, 1023, 1, 1024, 480, 480, 1024, 1024] + - [90, 6894.0] + - - [480, 1024, 1, 1023, 480, 480, 1023, 1023] + - [78, 7474.0] + - - [480, 1024, 1, 1025, 480, 480, 1025, 1025] + - [109, 7422.0] + - - [480, 1025, 1, 1024, 480, 480, 1024, 1024] + - [90, 7092.0] + - - [480, 2047, 1, 2048, 480, 480, 2048, 2048] + - [126, 8083.0] + - - [480, 2048, 1, 2047, 480, 480, 2047, 2047] + - [76, 8341.0] + - - [480, 2048, 1, 2049, 480, 480, 2049, 2049] + - [78, 8198.0] + - - [480, 2049, 1, 2048, 480, 480, 2048, 2048] + - [95, 8109.0] + - - [480, 3071, 1, 3072, 480, 480, 3072, 3072] + - [97, 9346.0] + - - [481, 1024, 1, 1024, 481, 481, 1024, 1024] + - [157, 7044.0] + - - [481, 2048, 1, 2048, 481, 481, 2048, 2048] + - [95, 8065.0] + - - [1023, 480, 1, 1024, 1023, 1023, 1024, 1024] + - [128, 6804.0] + - - [1024, 479, 1, 1024, 1024, 1024, 1024, 1024] + - [151, 5952.0] + - - [1024, 480, 1, 1023, 1024, 1024, 1023, 1023] + - [109, 7432.0] + - - [1024, 480, 1, 1025, 1024, 1024, 1025, 1025] + - [109, 7459.0] + - - [1024, 481, 1, 1024, 1024, 1024, 1024, 1024] + - [90, 6818.0] + - - [1025, 480, 1, 1024, 1025, 1025, 1024, 1024] + - [151, 6920.0] + - - [2047, 480, 1, 2048, 2047, 2047, 2048, 2048] + - [154, 7896.0] + - - [2048, 479, 1, 2048, 2048, 2048, 2048, 2048] + - [154, 7879.0] + - - [2048, 480, 1, 2047, 2048, 2048, 2047, 2047] + - [109, 8207.0] + - - [2048, 480, 1, 2049, 2048, 2048, 2049, 2049] + - [121, 8235.0] + - - [2048, 481, 1, 2048, 2048, 2048, 2048, 2048] + - [95, 8008.0] + - - [2049, 480, 1, 2048, 2049, 2049, 2048, 2048] + - [95, 7989.0] + - - [3071, 480, 1, 3072, 3071, 3071, 3072, 3072] + - [128, 9307.0] + - - [480, 1024, 1, 1024, 480, 480, 1024, 1024] + - [90, 7012.0] + - - [480, 2048, 1, 2048, 480, 480, 2048, 2048] + - [95, 8108.0] + - - [1024, 480, 1, 1024, 1024, 1024, 1024, 1024] + - [157, 6831.0] + - - [2048, 480, 1, 2048, 2048, 2048, 2048, 2048] + - [95, 7933.0] + - - [1024, 512, 1, 2048, 1024, 1024, 2048, 2048] + - [128, 7744.0] + - - [1024, 960, 1, 1024, 1024, 1024, 1024, 1024] + - [95, 8804.0] + - - [1024, 960, 1, 1600, 1024, 1024, 1600, 1600] + - [84, 9228.0] + - - [1024, 1024, 1, 960, 1024, 1024, 960, 960] + - [138, 9135.0] + - - [2048, 215, 1, 512, 2048, 2048, 512, 512] + - [121, 5889.0] + - - [2048, 215, 1, 768, 2048, 2048, 768, 768] + - [121, 6106.0] + - - [2048, 256, 1, 512, 2048, 2048, 512, 512] + - [90, 7185.0] + - - [2048, 256, 1, 768, 2048, 2048, 768, 768] + - [121, 7388.0] + - - [2048, 512, 1, 2048, 2048, 2048, 2048, 2048] + - [95, 8522.0] + - - [2048, 512, 1, 67, 2048, 2048, 67, 67] + - [107, 4913.0] + - - [2048, 512, 1, 74, 2048, 2048, 74, 74] + - [73, 5389.0] + - - [256, 1280, 1, 1024, 256, 256, 1024, 1024] + - [128, 7869.0] + - - [256, 1536, 1, 1024, 256, 256, 1024, 1024] + - [95, 6763.0] + - - [256, 2304, 1, 1024, 256, 256, 1024, 1024] + - [90, 8550.0] + - - [256, 2560, 1, 1024, 256, 256, 1024, 1024] + - [97, 9121.0] + - - [256, 2816, 1, 1024, 256, 256, 1024, 1024] + - [154, 7774.0] + - - [256, 3328, 1, 1024, 256, 256, 1024, 1024] + - [128, 8511.0] + - - [256, 3584, 1, 1024, 256, 256, 1024, 1024] + - [128, 9313.0] + - - [512, 1600, 1, 512, 512, 512, 512, 512] + - [126, 8306.0] + - - [767, 1280, 1, 768, 767, 767, 768, 768] + - [128, 9425.0] + - - [769, 1280, 1, 768, 769, 769, 768, 768] + - [95, 7739.0] + - - [768, 1279, 1, 768, 768, 768, 768, 768] + - [97, 9261.0] + - - [768, 1281, 1, 768, 768, 768, 768, 768] + - [157, 7708.0] + - - [768, 1280, 1, 767, 768, 768, 767, 767] + - [78, 9892.0] + - - [768, 1280, 1, 769, 768, 768, 769, 769] + - [109, 9944.0] + - - [256, 4096, 1, 512, 256, 256, 512, 512] + - [154, 8142.0] + - - [767, 768, 1, 768, 767, 767, 768, 768] + - [90, 8142.0] + - - [769, 768, 1, 768, 769, 769, 768, 768] + - [92, 7061.0] + - - [768, 767, 1, 768, 768, 768, 768, 768] + - [157, 7984.0] + - - [768, 769, 1, 768, 768, 768, 768, 768] + - [154, 7675.0] + - - [768, 768, 1, 767, 768, 768, 767, 767] + - [139, 8588.0] + - - [768, 768, 1, 769, 768, 768, 769, 769] + - [78, 8555.0] + - - [768, 768, 1, 768, 768, 768, 768, 768] + - [97, 8121.0] + - - [128, 128, 49, 1152, 128, 128, 1152, 1152] + - [151, 8669.0] + - - [128, 128, 49, 1216, 128, 128, 1216, 1216] + - [90, 8844.0] + - - [128, 128, 36, 1800, 128, 128, 1800, 1800] + - [121, 9505.0] + - - [128, 128, 36, 1900, 128, 128, 1900, 1900] + - [145, 7283.0] + - - [128, 128, 64, 5880, 128, 128, 5880, 5880] + - [112, 8828.0] + - - [128, 128, 49, 7680, 128, 128, 7680, 7680] + - [134, 5608.0] + - - [128, 128, 64, 882, 128, 128, 882, 882] + - [78, 8710.0] + - - [128, 128, 64, 931, 128, 128, 931, 931] + - [107, 8755.0] + - - [128, 64, 121, 1152, 128, 128, 1152, 1152] + - [131, 5625.0] + - - [128, 64, 81, 12000, 128, 128, 12000, 12000] + - [114, 3443.0] + - - [128, 64, 121, 1216, 128, 128, 1216, 1216] + - [154, 6663.0] + - - [128, 64, 81, 1800, 128, 128, 1800, 1800] + - [95, 6490.0] + - - [128, 64, 81, 1900, 128, 128, 1900, 1900] + - [95, 6939.0] + - - [128, 64, 49, 20280, 128, 128, 20280, 20280] + - [83, 4935.0] + - - [128, 64, 49, 3042, 128, 128, 3042, 3042] + - [154, 6427.0] + - - [128, 64, 49, 3211, 128, 128, 3211, 3211] + - [95, 6600.0] + - - [128, 64, 169, 5880, 128, 128, 5880, 5880] + - [144, 3592.0] + - - [128, 64, 121, 7680, 128, 128, 7680, 7680] + - [129, 2130.0] + - - [128, 64, 169, 882, 128, 128, 882, 882] + - [106, 973.0] + - - [128, 64, 169, 931, 128, 128, 931, 931] + - [154, 5458.0] + - - [256, 128, 25, 1080, 256, 256, 1080, 1080] + - [76, 9861.0] + - - [256, 128, 25, 162, 256, 256, 162, 162] + - [113, 5893.0] + - - [256, 128, 25, 171, 256, 256, 171, 171] + - [76, 6558.0] + - - [1152, 256, 1, 1, 1152, 1152, 1, 1] + - [122, 69.0] + - - [1152, 256, 1, 1444, 1152, 1152, 1444, 1444] + - [78, 8044.0] + - - [1152, 256, 1, 25, 1152, 1152, 25, 25] + - [71, 2348.0] + - - [1152, 256, 1, 9, 1152, 1152, 9, 9] + - [135, 551.0] + - - [2304, 256, 1, 1444, 2304, 2304, 1444, 1444] + - [139, 9264.0] + - - [2304, 340, 1, 1, 2304, 2304, 1, 1] + - [132, 120.0] + - - [2304, 340, 1, 1444, 2304, 2304, 1444, 1444] + - [151, 8572.0] + - - [2304, 340, 1, 9, 2304, 2304, 9, 9] + - [135, 996.0] + - - [2304, 510, 1, 25, 2304, 2304, 25, 25] + - [116, 2825.0] + - - [30522, 77, 1, 1024, 30522, 30522, 1024, 1024] + - [97, 6207.0] + - - [1024, 780, 1, 1024, 1024, 1024, 1024, 1024] + - [121, 7147.0] + - - [1024, 800, 1, 1024, 1024, 1024, 1024, 1024] + - [97, 8069.0] + - - [1024, 820, 1, 1024, 1024, 1024, 1024, 1024] + - [128, 8328.0] + - - [1024, 385, 1, 1024, 1024, 1024, 1024, 1024] + - [126, 6554.0] + - - [1024, 462, 1, 1024, 1024, 1024, 1024, 1024] + - [128, 6858.0] + - - [64, 512, 256, 512, 64, 64, 512, 512] + - [156, 6755.0] + - - [64, 512, 128, 512, 64, 64, 512, 512] + - [96, 6794.0] + - - [64, 512, 40, 512, 64, 64, 512, 512] + - [80, 4129.0] + - - [96, 1024, 64, 1024, 96, 96, 1024, 1024] + - [95, 6169.0] + - - [96, 1024, 128, 1024, 96, 96, 1024, 1024] + - [154, 3645.0] + - - [64, 1024, 256, 1024, 64, 64, 1024, 1024] + - [94, 3807.0] + - - [64, 1024, 32, 1024, 64, 64, 1024, 1024] + - [129, 6248.0] + - - [64, 1024, 64, 1024, 64, 64, 1024, 1024] + - [94, 6388.0] + - - [64, 1024, 128, 1024, 64, 64, 1024, 1024] + - [130, 3988.0] + - - [64, 128, 1024, 128, 64, 64, 128, 128] + - [96, 5897.0] + - - [1024, 864, 1, 1024, 1024, 1024, 1024, 1024] + - [90, 7575.0] + - - [1024, 864, 1, 480, 1024, 1024, 480, 480] + - [109, 8572.0] + - - [128, 3456, 1, 256, 128, 128, 256, 256] + - [119, 6056.0] + - - [128, 4096, 1, 256, 128, 128, 256, 256] + - [103, 4828.0] + - - [128, 6912, 1, 256, 128, 128, 256, 256] + - [128, 4290.0] + - - [256, 3456, 1, 512, 256, 256, 512, 512] + - [126, 6559.0] + - - [512, 864, 1, 1024, 512, 512, 1024, 1024] + - [126, 7299.0] + - - [512, 864, 1, 13, 512, 512, 13, 13] + - [108, 998.0] + - - [64, 128, 1280, 128, 64, 64, 128, 128] + - [133, 5475.0] + - - [64, 128, 1312, 128, 64, 64, 128, 128] + - [102, 5465.0] + - - [64, 512, 192, 512, 64, 64, 512, 512] + - [127, 6734.0] + - - [1024, 512, 1, 196, 1024, 1024, 196, 196] + - [76, 6328.0] + - - [64, 128, 2048, 128, 64, 64, 128, 128] + - [130, 5432.0] + - - [64, 128, 1536, 128, 64, 64, 128, 128] + - [99, 5430.0] + - - [128, 128, 64, 6400, 128, 128, 6400, 6400] + - [134, 6398.0] + - - [64, 128, 192, 128, 64, 64, 128, 128] + - [96, 5389.0] + - - [64, 384, 144, 384, 64, 64, 384, 384] + - [93, 1093.0] + - - [64, 512, 48, 512, 64, 64, 512, 512] + - [101, 3549.0] + - - [64, 128, 256, 128, 64, 64, 128, 128] + - [156, 5194.0] + - - [64, 384, 192, 384, 64, 64, 384, 384] + - [156, 7025.0] + - - [128, 128, 49, 1120, 128, 128, 1120, 1120] + - [78, 8764.0] + - - [128, 128, 49, 1064, 128, 128, 1064, 1064] + - [119, 9542.0] + - - [128, 128, 49, 1040, 128, 128, 1040, 1040] + - [119, 9512.0] + - - [128, 128, 64, 600, 128, 128, 600, 600] + - [90, 8474.0] + - - [128, 128, 64, 616, 128, 128, 616, 616] + - [138, 8822.0] + - - [128, 128, 49, 950, 128, 128, 950, 950] + - [107, 8891.0] + - - [128, 128, 49, 972, 128, 128, 972, 972] + - [89, 9176.0] + - - [128, 128, 64, 560, 128, 128, 560, 560] + - [76, 8613.0] + - - [128, 128, 49, 1008, 128, 128, 1008, 1008] + - [89, 9478.0] + - - [128, 128, 64, 532, 128, 128, 532, 532] + - [121, 8468.0] + - - [128, 128, 49, 1080, 128, 128, 1080, 1080] + - [119, 9447.0] + - - [128, 128, 64, 588, 128, 128, 588, 588] + - [90, 8428.0] + - - [128, 128, 49, 1160, 128, 128, 1160, 1160] + - [151, 8812.0] + - - [128, 128, 49, 988, 128, 128, 988, 988] + - [89, 9334.0] + - - [128, 128, 49, 936, 128, 128, 936, 936] + - [151, 8661.0] + - - [512, 1024, 1, 3800, 512, 512, 3800, 3800] + - [78, 8674.0] + - - [512, 1024, 1, 3400, 512, 512, 3400, 3400] + - [151, 8615.0] + - - [512, 1024, 1, 3456, 512, 512, 3456, 3456] + - [78, 8480.0] + - - [2048, 512, 1, 950, 2048, 2048, 950, 950] + - [119, 9094.0] + - - [512, 1024, 1, 3552, 512, 512, 3552, 3552] + - [121, 8697.0] + - - [512, 1024, 1, 3220, 512, 512, 3220, 3220] + - [109, 8707.0] + - - [2048, 512, 1, 850, 2048, 2048, 850, 850] + - [119, 9138.0] + - - [512, 2048, 1, 864, 512, 512, 864, 864] + - [107, 9361.0] + - - [512, 2048, 1, 768, 512, 512, 768, 768] + - [95, 8359.0] + - - [2048, 512, 1, 805, 2048, 2048, 805, 805] + - [138, 9059.0] + - - [512, 1024, 1, 2852, 512, 512, 2852, 2852] + - [109, 8571.0] + - - [512, 2048, 1, 888, 512, 512, 888, 888] + - [76, 9183.0] + - - [2048, 512, 1, 864, 2048, 2048, 864, 864] + - [150, 9258.0] + - - [2048, 512, 1, 888, 2048, 2048, 888, 888] + - [138, 8983.0] + - - [2048, 256, 1, 950, 2048, 2048, 950, 950] + - [139, 8000.0] + - - [2048, 512, 1, 713, 2048, 2048, 713, 713] + - [150, 9010.0] + - - [512, 1024, 1, 2688, 512, 512, 2688, 2688] + - [109, 8418.0] + - - [512, 1024, 1, 2640, 512, 512, 2640, 2640] + - [109, 8592.0] + - - [512, 1024, 1, 2904, 512, 512, 2904, 2904] + - [109, 8605.0] + - - [1024, 512, 1, 950, 1024, 1024, 950, 950] + - [139, 8010.0] + - - [512, 2048, 1, 672, 512, 512, 672, 672] + - [107, 9151.0] + - - [512, 2048, 1, 660, 512, 512, 660, 660] + - [76, 9193.0] + - - [512, 2048, 1, 1008, 512, 512, 1008, 1008] + - [107, 9415.0] + - - [2048, 256, 1, 850, 2048, 2048, 850, 850] + - [78, 7941.0] + - - [2048, 512, 1, 726, 2048, 2048, 726, 726] + - [138, 9028.0] + - - [1024, 512, 1, 850, 1024, 1024, 850, 850] + - [139, 7885.0] + - - [2048, 512, 1, 660, 2048, 2048, 660, 660] + - [138, 8923.0] + - - [2048, 512, 1, 672, 2048, 2048, 672, 672] + - [150, 9106.0] + - - [512, 2048, 1, 840, 512, 512, 840, 840] + - [76, 9260.0] + - - [2048, 512, 1, 1008, 2048, 2048, 1008, 1008] + - [107, 9377.0] + - - [512, 2048, 1, 792, 512, 512, 792, 792] + - [107, 9277.0] + - - [1024, 512, 1, 805, 1024, 1024, 805, 805] + - [78, 8154.0] + - - [512, 2048, 1, 1050, 512, 512, 1050, 1050] + - [107, 9339.0] + - - [2048, 512, 1, 748, 2048, 2048, 748, 748] + - [138, 9063.0] + - - [2048, 256, 1, 864, 2048, 2048, 864, 864] + - [109, 8054.0] + - - [1024, 512, 1, 864, 1024, 1024, 864, 864] + - [139, 8020.0] + - - [2048, 512, 1, 875, 2048, 2048, 875, 875] + - [107, 9073.0] + - - [2048, 512, 1, 840, 2048, 2048, 840, 840] + - [150, 9185.0] + - - [2048, 512, 1, 792, 2048, 2048, 792, 792] + - [138, 9230.0] + - - [512, 2048, 1, 736, 512, 512, 736, 736] + - [76, 9265.0] + - - [2048, 256, 1, 888, 2048, 2048, 888, 888] + - [109, 8058.0] + - - [512, 2048, 1, 704, 512, 512, 704, 704] + - [76, 9202.0] + - - [512, 2048, 1, 588, 512, 512, 588, 588] + - [107, 9080.0] + - - [1024, 512, 1, 888, 1024, 1024, 888, 888] + - [109, 7991.0] + - - [512, 2048, 1, 816, 512, 512, 816, 816] + - [107, 9294.0] + - - [1024, 512, 1, 713, 1024, 1024, 713, 713] + - [109, 7733.0] + - - [2048, 512, 1, 736, 2048, 2048, 736, 736] + - [138, 9220.0] + - - [2048, 512, 1, 588, 2048, 2048, 588, 588] + - [150, 8846.0] + - - [2048, 512, 1, 704, 2048, 2048, 704, 704] + - [150, 9157.0] + - - [1024, 512, 1, 660, 1024, 1024, 660, 660] + - [109, 7635.0] + - - [2048, 256, 1, 660, 2048, 2048, 660, 660] + - [109, 7790.0] + - - [2048, 256, 1, 672, 2048, 2048, 672, 672] + - [139, 7840.0] + - - [1024, 512, 1, 672, 1024, 1024, 672, 672] + - [109, 7809.0] + - - [1024, 512, 1, 726, 1024, 1024, 726, 726] + - [78, 7781.0] + - - [512, 2048, 1, 630, 512, 512, 630, 630] + - [107, 9079.0] + - - [512, 2048, 1, 600, 512, 512, 600, 600] + - [76, 9147.0] + - - [2048, 256, 1, 805, 2048, 2048, 805, 805] + - [109, 7933.0] + - - [2048, 256, 1, 713, 2048, 2048, 713, 713] + - [109, 7768.0] + - - [2048, 256, 1, 726, 2048, 2048, 726, 726] + - [139, 7845.0] + - - [320, 1024, 1, 1024, 320, 320, 1024, 1024] + - [92, 7023.0] + - - [1024, 1000, 1, 1024, 1024, 1024, 1024, 1024] + - [126, 8090.0] + - - [320, 1000, 1, 1024, 320, 320, 1024, 1024] + - [123, 6810.0] + - - [128, 128, 49, 1280, 128, 128, 1280, 1280] + - [154, 8686.0] + - - [128, 128, 49, 1360, 128, 128, 1360, 1360] + - [113, 7376.0] + - - [128, 128, 49, 1200, 128, 128, 1200, 1200] + - [76, 9170.0] + - - [128, 128, 49, 1240, 128, 128, 1240, 1240] + - [151, 8860.0] + - - [2304, 256, 1, 704, 2304, 2304, 704, 704] + - [139, 8611.0] + - - [2304, 256, 1, 736, 2304, 2304, 736, 736] + - [109, 8756.0] + - - [2304, 256, 1, 792, 2304, 2304, 792, 792] + - [109, 8922.0] + - - [2304, 256, 1, 748, 2304, 2304, 748, 748] + - [109, 8750.0] + - - [2304, 256, 1, 726, 2304, 2304, 726, 726] + - [109, 8672.0] + - - [2304, 256, 1, 713, 2304, 2304, 713, 713] + - [121, 8929.0] + - - [2304, 256, 1, 768, 2304, 2304, 768, 768] + - [90, 8159.0] + - - [512, 2048, 1, 759, 512, 512, 759, 759] + - [76, 9152.0] + - - [512, 2048, 1, 925, 512, 512, 925, 925] + - [76, 9237.0] + - - [2304, 256, 1, 805, 2304, 2304, 805, 805] + - [109, 8767.0] + - - [512, 2048, 1, 900, 512, 512, 900, 900] + - [76, 9270.0] + - - [512, 2048, 1, 875, 512, 512, 875, 875] + - [76, 9258.0] + - - [512, 2048, 1, 748, 512, 512, 748, 748] + - [76, 9199.0] + - - [512, 2048, 1, 726, 512, 512, 726, 726] + - [138, 9284.0] + - - [512, 2048, 1, 713, 512, 512, 713, 713] + - [107, 9129.0] + - - [512, 2048, 1, 805, 512, 512, 805, 805] + - [76, 9229.0] + - - [512, 2048, 1, 850, 512, 512, 850, 850] + - [107, 9198.0] + - - [512, 2048, 1, 950, 512, 512, 950, 950] + - [76, 9312.0] + - - [96, 1024, 160, 1024, 96, 96, 1024, 1024] + - [95, 3849.0] + - - [96, 1024, 40, 1024, 96, 96, 1024, 1024] + - [97, 7690.0] + - - [96, 1024, 80, 1024, 96, 96, 1024, 1024] + - [95, 6221.0] + - - [96, 1024, 96, 1024, 96, 96, 1024, 1024] + - [128, 4532.0] + - - [96, 1024, 24, 1024, 96, 96, 1024, 1024] + - [157, 7141.0] + - - [96, 1024, 48, 1024, 96, 96, 1024, 1024] + - [157, 7533.0] + - - [96, 1024, 16, 1024, 96, 96, 1024, 1024] + - [128, 5032.0] + - - [96, 1024, 32, 1024, 96, 96, 1024, 1024] + - [157, 7587.0] + - - [64, 512, 320, 512, 64, 64, 512, 512] + - [99, 5431.0] + - - [64, 512, 80, 512, 64, 64, 512, 512] + - [127, 6416.0] + - - [29000, 109, 1, 2560, 29000, 29000, 2560, 2560] + - [128, 7506.0] + - - [29000, 121, 1, 2560, 29000, 29000, 2560, 2560] + - [103, 8058.0] + - - [29000, 65, 1, 2560, 29000, 29000, 2560, 2560] + - [157, 4616.0] + - - [29000, 66, 1, 2560, 29000, 29000, 2560, 2560] + - [157, 4711.0] + - - [29000, 67, 1, 2560, 29000, 29000, 2560, 2560] + - [97, 4712.0] + - - [29000, 69, 1, 2560, 29000, 29000, 2560, 2560] + - [97, 5165.0] + - - [29000, 70, 1, 2560, 29000, 29000, 2560, 2560] + - [128, 5030.0] + - - [29000, 71, 1, 2560, 29000, 29000, 2560, 2560] + - [157, 4969.0] + - - [29000, 73, 1, 2560, 29000, 29000, 2560, 2560] + - [157, 4929.0] + - - [29000, 74, 1, 2560, 29000, 29000, 2560, 2560] + - [157, 5343.0] + - - [29000, 75, 1, 2560, 29000, 29000, 2560, 2560] + - [157, 5353.0] + - - [29000, 77, 1, 2560, 29000, 29000, 2560, 2560] + - [128, 5398.0] + - - [29000, 78, 1, 2560, 29000, 29000, 2560, 2560] + - [128, 5640.0] + - - [29000, 80, 1, 2560, 29000, 29000, 2560, 2560] + - [157, 5575.0] + - - [29000, 81, 1, 2560, 29000, 29000, 2560, 2560] + - [157, 5662.0] + - - [29000, 82, 1, 2560, 29000, 29000, 2560, 2560] + - [157, 5657.0] + - - [29000, 83, 1, 2560, 29000, 29000, 2560, 2560] + - [157, 5791.0] + - - [29000, 84, 1, 2560, 29000, 29000, 2560, 2560] + - [128, 5975.0] + - - [29000, 88, 1, 2560, 29000, 29000, 2560, 2560] + - [157, 6104.0] + - - [29000, 89, 1, 2560, 29000, 29000, 2560, 2560] + - [157, 6421.0] + - - [29000, 90, 1, 2560, 29000, 29000, 2560, 2560] + - [157, 6241.0] + - - [29000, 92, 1, 2560, 29000, 29000, 2560, 2560] + - [128, 6249.0] + - - [29000, 95, 1, 2560, 29000, 29000, 2560, 2560] + - [128, 6502.0] + - - [29000, 98, 1, 2560, 29000, 29000, 2560, 2560] + - [128, 6664.0] + - - [64, 1024, 512, 1024, 64, 64, 1024, 1024] + - [158, 4759.0] + - - [1024, 200, 1, 13312, 1024, 1024, 13312, 13312] + - [167, 6368.0] + - - [1024, 256, 1, 15360, 1024, 1024, 15360, 15360] + - [192, 6119.0] + - - [1024, 256, 1, 16384, 1024, 1024, 16384, 16384] + - [184, 5693.0] + - - [1024, 200, 1, 16384, 1024, 1024, 16384, 16384] + - [170, 4780.0] + - - [1024, 256, 1, 12288, 1024, 1024, 12288, 12288] + - [167, 8032.0] + - - [1024, 200, 1, 12288, 1024, 1024, 12288, 12288] + - [167, 6599.0] + - - [1024, 200, 1, 15360, 1024, 1024, 15360, 15360] + - [168, 4996.0] + - - [1024, 256, 1, 9216, 1024, 1024, 9216, 9216] + - [167, 8163.0] + - - [1024, 200, 1, 14336, 1024, 1024, 14336, 14336] + - [181, 6027.0] + - - [1024, 256, 1, 16640, 1024, 1024, 16640, 16640] + - [174, 6550.0] + - - [1024, 200, 1, 8192, 1024, 1024, 8192, 8192] + - [167, 6403.0] + - - [1024, 200, 1, 10240, 1024, 1024, 10240, 10240] + - [167, 6616.0] + - - [1024, 200, 1, 9216, 1024, 1024, 9216, 9216] + - [167, 6568.0] + - - [1024, 256, 1, 11264, 1024, 1024, 11264, 11264] + - [167, 8336.0] + - - [1024, 200, 1, 8320, 1024, 1024, 8320, 8320] + - [162, 6593.0] + - - [1024, 256, 1, 8320, 1024, 1024, 8320, 8320] + - [162, 8289.0] + - - [1024, 200, 1, 16640, 1024, 1024, 16640, 16640] + - [163, 5589.0] + - - [1024, 256, 1, 14336, 1024, 1024, 14336, 14336] + - [190, 6691.0] + - - [1024, 256, 1, 13312, 1024, 1024, 13312, 13312] + - [190, 7126.0] + - - [1024, 200, 1, 11264, 1024, 1024, 11264, 11264] + - [167, 6656.0] + - - [1024, 256, 1, 8192, 1024, 1024, 8192, 8192] + - [167, 8093.0] + - - [1024, 256, 1, 10240, 1024, 1024, 10240, 10240] + - [167, 7810.0] + - - [96, 64, 64, 18432, 96, 96, 18432, 18432] + - [182, 1939.0] + - - [96, 64, 36, 10368, 96, 96, 10368, 10368] + - [154, 5119.0] + - - [96, 64, 36, 20736, 96, 96, 20736, 20736] + - [160, 4631.0] + - - [96, 96, 36, 10368, 96, 96, 10368, 10368] + - [186, 5143.0] + - - [96, 64, 49, 28800, 96, 96, 28800, 28800] + - [141, 3038.0] + - - [96, 64, 36, 41472, 96, 96, 41472, 41472] + - [131, 2543.0] + - - [64, 64, 11, 233600, 64, 64, 233600, 233600] + - [164, 2300.0] + - - [64, 64, 11, 116800, 64, 64, 116800, 116800] + - [166, 3301.0] + - - [64, 64, 9, 172864, 64, 64, 172864, 172864] + - [187, 3943.0] + - - [64, 64, 11, 58400, 64, 64, 58400, 58400] + - [179, 4928.0] + - - [192, 160, 9, 19584, 192, 192, 19584, 19584] + - [175, 5194.0] + - - [128, 128, 9, 9792, 128, 128, 9792, 9792] + - [174, 8073.0] + - - [192, 160, 11, 13056, 192, 192, 13056, 13056] + - [125, 4693.0] + - - [64, 64, 9, 86432, 64, 64, 86432, 86432] + - [173, 5407.0] + - - [128, 128, 9, 19584, 128, 128, 19584, 19584] + - [170, 7630.0] + - - [160, 160, 11, 13056, 160, 160, 13056, 13056] + - [125, 4403.0] + - - [160, 160, 9, 19584, 160, 160, 19584, 19584] + - [175, 4197.0] + - - [192, 128, 9, 19584, 192, 192, 19584, 19584] + - [191, 5960.0] + - - [192, 160, 9, 9792, 192, 192, 9792, 9792] + - [178, 6302.0] + - - [64, 64, 9, 345728, 64, 64, 345728, 345728] + - [176, 2776.0] + - - [128, 128, 11, 13056, 128, 128, 13056, 13056] + - [184, 6221.0] + - - [160, 160, 9, 9792, 160, 160, 9792, 9792] + - [165, 5430.0] + - - [192, 128, 11, 13056, 192, 192, 13056, 13056] + - [189, 6035.0] + - - [192, 128, 9, 9792, 192, 192, 9792, 9792] + - [172, 7245.0] + - - [128, 64, 25, 43320, 128, 128, 43320, 43320] + - [82, 3707.0] + - - [64, 64, 64, 20280, 64, 64, 20280, 20280] + - [152, 3968.0] + - - [64, 64, 49, 27000, 64, 64, 27000, 27000] + - [124, 3928.0] + - - [64, 64, 36, 43320, 64, 64, 43320, 43320] + - [166, 3902.0] + - - [64, 64, 36, 50176, 64, 64, 50176, 50176] + - [129, 2082.0] + - - [64, 64, 49, 36864, 64, 64, 36864, 36864] + - [101, 2930.0] + - - [64, 64, 64, 25600, 64, 64, 25600, 25600] + - [129, 2004.0] + - - [256, 256, 1, 60800, 256, 256, 60800, 60800] + - [180, 6609.0] + - - [256, 256, 1, 54400, 256, 256, 54400, 54400] + - [184, 6337.0] + - - [256, 256, 1, 51520, 256, 256, 51520, 51520] + - [181, 7417.0] + - - [256, 256, 1, 55296, 256, 256, 55296, 55296] + - [169, 2723.0] + - - [256, 256, 1, 56832, 256, 256, 56832, 56832] + - [177, 5787.0] + - - [256, 256, 1, 45632, 256, 256, 45632, 45632] + - [186, 7373.0] + - - [256, 256, 1, 49152, 256, 256, 49152, 49152] + - [169, 3434.0] + - - [256, 512, 1, 13600, 256, 256, 13600, 13600] + - [171, 7769.0] + - - [256, 256, 1, 43008, 256, 256, 43008, 43008] + - [183, 5346.0] + - - [256, 512, 1, 15200, 256, 256, 15200, 15200] + - [185, 7798.0] + - - [256, 512, 1, 12880, 256, 256, 12880, 12880] + - [188, 7921.0] + - - [256, 512, 1, 13824, 256, 256, 13824, 13824] + - [181, 7454.0] + - - [512, 256, 1, 13824, 512, 512, 13824, 13824] + - [190, 7604.0] + - - [256, 512, 1, 14208, 256, 256, 14208, 14208] + - [190, 7495.0] + - - [512, 256, 1, 14208, 512, 512, 14208, 14208] + - [181, 7705.0] + - - [512, 256, 1, 15200, 512, 512, 15200, 15200] + - [185, 8226.0] + - - [256, 512, 1, 12288, 256, 256, 12288, 12288] + - [190, 7330.0] + - - [512, 256, 1, 12288, 512, 512, 12288, 12288] + - [181, 7499.0] + - - [1024, 200, 1, 560, 1024, 1024, 560, 560] + - [196, 2924.0] + - - [768, 320, 1, 768, 768, 768, 768, 768] + - [234, 3906.0] + - - [1024, 120, 1, 1024, 1024, 1024, 1024, 1024] + - [211, 3020.0] + - - [1024, 128, 1, 128, 1024, 1024, 128, 128] + - [256, 1907.0] + - - [2368, 64, 1, 3328, 2368, 2368, 3328, 3328] + - [256, 3805.0] + - - [1408, 64, 1, 1280, 1408, 1408, 1280, 1280] + - [253, 2447.0] + - - [4096, 32, 1, 4096, 4096, 4096, 4096, 4096] + - [209, 2816.0] + - - [3072, 64, 1, 1024, 3072, 3072, 1024, 1024] + - [209, 3189.0] + - - [2944, 64, 1, 256, 2944, 2944, 256, 256] + - [234, 2602.0] + - - [6144, 32, 1, 2560, 6144, 6144, 2560, 2560] + - [209, 3373.0] + - - [1856, 64, 1, 1280, 1856, 1856, 1280, 1280] + - [255, 3147.0] + - - [704, 128, 1, 1280, 704, 704, 1280, 1280] + - [209, 2570.0] + - - [4288, 64, 1, 3328, 4288, 4288, 3328, 3328] + - [211, 3563.0] + - - [64, 3584, 1, 3328, 64, 64, 3328, 3328] + - [234, 3901.0] + - - [704, 256, 1, 128, 704, 704, 128, 128] + - [245, 2128.0] + - - [128, 1408, 1, 128, 128, 128, 128, 128] + - [196, 2090.0] + - - [448, 448, 1, 256, 448, 448, 256, 256] + - [255, 2786.0] + - - [7680, 32, 1, 2560, 7680, 7680, 2560, 2560] + - [238, 2461.0] + - - [128, 1024, 1, 3328, 128, 128, 3328, 3328] + - [211, 3323.0] + - - [64, 1856, 1, 1280, 64, 64, 1280, 1280] + - [209, 3188.0] + - - [256, 1024, 1, 256, 256, 256, 256, 256] + - [211, 2898.0] + - - [1024, 128, 1, 1280, 1024, 1024, 1280, 1280] + - [198, 3178.0] + - - [3072, 32, 1, 1024, 3072, 3072, 1024, 1024] + - [209, 2536.0] + - - [448, 256, 1, 3328, 448, 448, 3328, 3328] + - [255, 3253.0] + - - [128, 1024, 1, 128, 128, 128, 128, 128] + - [256, 1942.0] + - - [448, 448, 1, 3328, 448, 448, 3328, 3328] + - [209, 3445.0] + - - [128, 704, 1, 1280, 128, 128, 1280, 1280] + - [230, 2484.0] + - - [1856, 128, 1, 3328, 1856, 1856, 3328, 3328] + - [256, 4044.0] + - - [35, 8457, 1, 1760, 35, 35, 1760, 1760] + - [223, 2227.0] + - - [64, 2944, 1, 128, 64, 64, 128, 128] + - [245, 2177.0] + - - [8448, 32, 1, 2816, 8448, 8448, 2816, 2816] + - [209, 3147.0] + - - [1408, 128, 1, 1280, 1408, 1408, 1280, 1280] + - [209, 3023.0] + - - [128, 1856, 1, 1280, 128, 128, 1280, 1280] + - [234, 4020.0] + - - [2560, 64, 1, 2560, 2560, 2560, 2560, 2560] + - [211, 4062.0] + - - [256, 448, 1, 256, 256, 256, 256, 256] + - [232, 2349.0] + - - [128, 1856, 1, 128, 128, 128, 128, 128] + - [247, 2749.0] + - - [2560, 32, 1, 2560, 2560, 2560, 2560, 2560] + - [232, 3075.0] + - - [128, 1408, 1, 256, 128, 128, 256, 256] + - [209, 2532.0] + - - [35, 8457, 1, 2560, 35, 35, 2560, 2560] + - [234, 1942.0] + - - [4288, 64, 1, 128, 4288, 4288, 128, 128] + - [234, 2502.0] + - - [256, 448, 1, 3328, 256, 256, 3328, 3328] + - [209, 3309.0] + - - [64, 2368, 1, 1280, 64, 64, 1280, 1280] + - [256, 3619.0] + - - [2368, 64, 1, 256, 2368, 2368, 256, 256] + - [211, 2775.0] + - - [704, 128, 1, 3328, 704, 704, 3328, 3328] + - [243, 2598.0] + - - [4288, 64, 1, 1280, 4288, 4288, 1280, 1280] + - [234, 3467.0] + - - [1408, 128, 1, 128, 1408, 1408, 128, 128] + - [245, 2124.0] + - - [128, 1024, 1, 1280, 128, 128, 1280, 1280] + - [256, 3169.0] + - - [2944, 64, 1, 128, 2944, 2944, 128, 128] + - [221, 2173.0] + - - [1024, 128, 1, 3328, 1024, 1024, 3328, 3328] + - [211, 3319.0] + - - [704, 128, 1, 256, 704, 704, 256, 256] + - [209, 1828.0] + - - [448, 256, 1, 1280, 448, 448, 1280, 1280] + - [255, 3112.0] + - - [1856, 128, 1, 1280, 1856, 1856, 1280, 1280] + - [234, 3913.0] + - - [64, 3584, 1, 256, 64, 64, 256, 256] + - [211, 3143.0] + - - [3584, 64, 1, 128, 3584, 3584, 128, 128] + - [211, 2589.0] + - - [256, 1024, 1, 1280, 256, 256, 1280, 1280] + - [211, 3427.0] + - - [3584, 64, 1, 1280, 3584, 3584, 1280, 1280] + - [256, 3890.0] + - - [64, 4288, 1, 3328, 64, 64, 3328, 3328] + - [211, 3576.0] + - - [64, 1856, 1, 256, 64, 64, 256, 256] + - [255, 2365.0] + - - [35, 8457, 1, 2048, 35, 35, 2048, 2048] + - [211, 2241.0] + - - [256, 704, 1, 256, 256, 256, 256, 256] + - [209, 2546.0] + - - [2368, 64, 1, 128, 2368, 2368, 128, 128] + - [211, 2266.0] + - - [256, 1024, 1, 128, 256, 256, 128, 128] + - [223, 2504.0] + - - [704, 256, 1, 3328, 704, 704, 3328, 3328] + - [255, 3158.0] + - - [35, 8457, 1, 4096, 35, 35, 4096, 4096] + - [234, 1884.0] + - - [64, 2944, 1, 256, 64, 64, 256, 256] + - [196, 2775.0] + - - [448, 256, 1, 128, 448, 448, 128, 128] + - [221, 1744.0] + - - [64, 1408, 1, 1280, 64, 64, 1280, 1280] + - [209, 2469.0] + - - [1408, 128, 1, 256, 1408, 1408, 256, 256] + - [211, 2524.0] + - - [64, 2944, 1, 1280, 64, 64, 1280, 1280] + - [209, 3192.0] + - - [128, 704, 1, 128, 128, 128, 128, 128] + - [196, 1396.0] + - - [64, 1408, 1, 3328, 64, 64, 3328, 3328] + - [221, 2607.0] + - - [256, 448, 1, 1280, 256, 256, 1280, 1280] + - [209, 3114.0] + - - [704, 256, 1, 1280, 704, 704, 1280, 1280] + - [211, 3087.0] + - - [64, 2368, 1, 3328, 64, 64, 3328, 3328] + - [256, 3822.0] + - - [1856, 64, 1, 128, 1856, 1856, 128, 128] + - [255, 1841.0] + - - [4096, 64, 1, 4096, 4096, 4096, 4096, 4096] + - [211, 3405.0] + - - [1760, 128, 1, 1760, 1760, 1760, 1760, 1760] + - [223, 3815.0] + - - [704, 128, 1, 128, 704, 704, 128, 128] + - [207, 1390.0] + - - [256, 704, 1, 3328, 256, 256, 3328, 3328] + - [209, 3120.0] + - - [256, 448, 1, 128, 256, 256, 128, 128] + - [196, 1748.0] + - - [64, 3584, 1, 128, 64, 64, 128, 128] + - [256, 2622.0] + - - [64, 2944, 1, 3328, 64, 64, 3328, 3328] + - [232, 3309.0] + - - [1024, 128, 1, 256, 1024, 1024, 256, 256] + - [256, 2460.0] + - - [2944, 64, 1, 1280, 2944, 2944, 1280, 1280] + - [209, 3144.0] + - - [128, 1408, 1, 3328, 128, 128, 3328, 3328] + - [232, 3177.0] + - - [1408, 64, 1, 256, 1408, 1408, 256, 256] + - [255, 1819.0] + - - [64, 1856, 1, 128, 64, 64, 128, 128] + - [209, 1845.0] + - - [64, 2368, 1, 256, 64, 64, 256, 256] + - [256, 2836.0] + - - [1856, 128, 1, 128, 1856, 1856, 128, 128] + - [256, 2785.0] + - - [2368, 64, 1, 1280, 2368, 2368, 1280, 1280] + - [211, 3798.0] + - - [4288, 64, 1, 256, 4288, 4288, 256, 256] + - [211, 2954.0] + - - [64, 4288, 1, 1280, 64, 64, 1280, 1280] + - [211, 3481.0] + - - [1408, 64, 1, 3328, 1408, 1408, 3328, 3328] + - [255, 2577.0] + - - [64, 1408, 1, 128, 64, 64, 128, 128] + - [230, 1393.0] + - - [256, 704, 1, 128, 256, 256, 128, 128] + - [196, 2101.0] + - - [1408, 64, 1, 128, 1408, 1408, 128, 128] + - [209, 1360.0] + - - [448, 448, 1, 1280, 448, 448, 1280, 1280] + - [256, 3334.0] + - - [128, 1024, 1, 256, 128, 128, 256, 256] + - [256, 2460.0] + - - [3584, 64, 1, 3328, 3584, 3584, 3328, 3328] + - [256, 3904.0] + - - [256, 1024, 1, 3328, 256, 256, 3328, 3328] + - [211, 3442.0] + - - [1856, 64, 1, 3328, 1856, 1856, 3328, 3328] + - [255, 3340.0] + - - [448, 256, 1, 256, 448, 448, 256, 256] + - [255, 2297.0] + - - [4608, 32, 1, 1536, 4608, 4608, 1536, 1536] + - [209, 3062.0] + - - [128, 704, 1, 256, 128, 128, 256, 256] + - [255, 1843.0] + - - [64, 3584, 1, 1280, 64, 64, 1280, 1280] + - [234, 3909.0] + - - [3584, 64, 1, 256, 3584, 3584, 256, 256] + - [234, 3081.0] + - - [64, 1856, 1, 3328, 64, 64, 3328, 3328] + - [196, 3371.0] + - - [2048, 128, 1, 2048, 2048, 2048, 2048, 2048] + - [256, 3362.0] + - - [1408, 128, 1, 3328, 1408, 1408, 3328, 3328] + - [221, 3109.0] + - - [128, 704, 1, 3328, 128, 128, 3328, 3328] + - [232, 2601.0] + - - [128, 1856, 1, 256, 128, 128, 256, 256] + - [234, 3295.0] + - - [64, 4288, 1, 256, 64, 64, 256, 256] + - [211, 2997.0] + - - [1856, 64, 1, 256, 1856, 1856, 256, 256] + - [209, 2368.0] + - - [256, 704, 1, 1280, 256, 256, 1280, 1280] + - [232, 3069.0] + - - [64, 2368, 1, 128, 64, 64, 128, 128] + - [247, 2199.0] + - - [64, 4288, 1, 128, 64, 64, 128, 128] + - [211, 2579.0] + - - [1856, 128, 1, 256, 1856, 1856, 256, 256] + - [256, 3298.0] + - - [2048, 64, 1, 2048, 2048, 2048, 2048, 2048] + - [256, 3216.0] + - - [64, 1408, 1, 256, 64, 64, 256, 256] + - [232, 1860.0] + - - [2944, 64, 1, 3328, 2944, 2944, 3328, 3328] + - [209, 3230.0] + - - [128, 1408, 1, 1280, 128, 128, 1280, 1280] + - [196, 3115.0] + - - [128, 1856, 1, 3328, 128, 128, 3328, 3328] + - [234, 4039.0] + - - [1760, 64, 1, 1760, 1760, 1760, 1760, 1760] + - [209, 3093.0] + - - [448, 448, 1, 128, 448, 448, 128, 128] + - [223, 2960.0] + - - [704, 256, 1, 256, 704, 704, 256, 256] + - [211, 2869.0] + - - [256, 1024, 1, 196, 256, 256, 196, 196] + - [253, 2786.0] + - - [1024, 256, 1, 1536, 1024, 1024, 1536, 1536] + - [256, 3354.0] + - - [1024, 200, 1, 1408, 1024, 1024, 1408, 1408] + - [245, 2973.0] + - - [1024, 200, 1, 6144, 1024, 1024, 6144, 6144] + - [255, 2988.0] + - - [1024, 256, 1, 3328, 1024, 1024, 3328, 3328] + - [256, 3424.0] + - - [512, 256, 1, 3200, 512, 512, 3200, 3200] + - [223, 3329.0] + - - [1024, 200, 1, 4608, 1024, 1024, 4608, 4608] + - [209, 3002.0] + - - [512, 256, 1, 1792, 512, 512, 1792, 1792] + - [211, 3220.0] + - - [1024, 200, 1, 1792, 1024, 1024, 1792, 1792] + - [232, 2927.0] + - - [512, 200, 1, 2816, 512, 512, 2816, 2816] + - [209, 2874.0] + - - [512, 200, 1, 3072, 512, 512, 3072, 3072] + - [232, 2892.0] + - - [1024, 200, 1, 128, 1024, 1024, 128, 128] + - [245, 2074.0] + - - [1024, 200, 1, 5120, 1024, 1024, 5120, 5120] + - [232, 2983.0] + - - [1024, 256, 1, 256, 1024, 1024, 256, 256] + - [211, 2903.0] + - - [512, 256, 1, 2560, 512, 512, 2560, 2560] + - [211, 3246.0] + - - [1024, 256, 1, 4160, 1024, 1024, 4160, 4160] + - [247, 3463.0] + - - [1024, 200, 1, 512, 1024, 1024, 512, 512] + - [232, 2731.0] + - - [512, 512, 1, 1536, 512, 512, 1536, 1536] + - [234, 3368.0] + - - [1024, 256, 1, 896, 1024, 1024, 896, 896] + - [247, 3423.0] + - - [1024, 200, 1, 3200, 1024, 1024, 3200, 3200] + - [245, 3027.0] + - - [1024, 200, 1, 1536, 1024, 1024, 1536, 1536] + - [209, 2917.0] + - - [1024, 256, 1, 1024, 1024, 1024, 1024, 1024] + - [211, 3348.0] + - - [128, 1024, 1, 512, 128, 128, 512, 512] + - [211, 2846.0] + - - [1024, 256, 1, 5120, 1024, 1024, 5120, 5120] + - [256, 3532.0] + - - [1024, 200, 1, 2304, 1024, 1024, 2304, 2304] + - [209, 2943.0] + - - [1024, 256, 1, 1664, 1024, 1024, 1664, 1664] + - [256, 3460.0] + - - [512, 512, 1, 1024, 512, 512, 1024, 1024] + - [234, 3308.0] + - - [1024, 256, 1, 2080, 1024, 1024, 2080, 2080] + - [223, 3485.0] + - - [512, 200, 1, 768, 512, 512, 768, 768] + - [209, 2616.0] + - - [1024, 256, 1, 2816, 1024, 1024, 2816, 2816] + - [256, 3408.0] + - - [1024, 200, 1, 64, 1024, 1024, 64, 64] + - [221, 1614.0] + - - [512, 512, 1, 2304, 512, 512, 2304, 2304] + - [211, 3375.0] + - - [128, 1024, 1, 2048, 128, 128, 2048, 2048] + - [256, 3237.0] + - - [512, 200, 1, 2560, 512, 512, 2560, 2560] + - [209, 2871.0] + - - [512, 256, 1, 1024, 512, 512, 1024, 1024] + - [211, 3084.0] + - - [1024, 256, 1, 1920, 1024, 1024, 1920, 1920] + - [223, 3406.0] + - - [512, 200, 1, 2304, 512, 512, 2304, 2304] + - [209, 2865.0] + - - [1024, 256, 1, 384, 1024, 1024, 384, 384] + - [198, 3115.0] + - - [1024, 256, 1, 32, 1024, 1024, 32, 32] + - [232, 1389.0] + - - [1024, 200, 1, 2816, 1024, 1024, 2816, 2816] + - [209, 2955.0] + - - [1024, 200, 1, 3072, 1024, 1024, 3072, 3072] + - [255, 3038.0] + - - [512, 256, 1, 1536, 512, 512, 1536, 1536] + - [211, 3175.0] + - - [1024, 256, 1, 512, 1024, 1024, 512, 512] + - [211, 3161.0] + - - [256, 512, 1, 512, 256, 256, 512, 512] + - [211, 2841.0] + - - [1024, 200, 1, 3840, 1024, 1024, 3840, 3840] + - [255, 2937.0] + - - [256, 1024, 1, 512, 256, 256, 512, 512] + - [209, 3262.0] + - - [1024, 256, 1, 1152, 1024, 1024, 1152, 1152] + - [211, 3422.0] + - - [512, 512, 1, 2816, 512, 512, 2816, 2816] + - [234, 3406.0] + - - [512, 200, 1, 1280, 512, 512, 1280, 1280] + - [209, 2754.0] + - - [512, 200, 1, 3200, 512, 512, 3200, 3200] + - [245, 2964.0] + - - [1024, 256, 1, 2304, 1024, 1024, 2304, 2304] + - [234, 3408.0] + - - [1024, 256, 1, 6144, 1024, 1024, 6144, 6144] + - [256, 3430.0] + - - [1024, 200, 1, 2560, 1024, 1024, 2560, 2560] + - [232, 2971.0] + - - [1024, 256, 1, 5632, 1024, 1024, 5632, 5632] + - [234, 3429.0] + - - [512, 256, 1, 768, 512, 512, 768, 768] + - [211, 3036.0] + - - [1024, 256, 1, 3072, 1024, 1024, 3072, 3072] + - [223, 3432.0] + - - [256, 512, 1, 2048, 256, 256, 2048, 2048] + - [211, 3216.0] + - - [1024, 200, 1, 1152, 1024, 1024, 1152, 1152] + - [245, 2940.0] + - - [512, 512, 1, 3072, 512, 512, 3072, 3072] + - [234, 3400.0] + - - [1024, 200, 1, 1664, 1024, 1024, 1664, 1664] + - [221, 2993.0] + - - [1024, 200, 1, 32, 1024, 1024, 32, 32] + - [221, 1054.0] + - - [1024, 200, 1, 384, 1024, 1024, 384, 384] + - [245, 2675.0] + - - [512, 256, 1, 2304, 512, 512, 2304, 2304] + - [211, 3246.0] + - - [256, 512, 1, 1024, 256, 256, 1024, 1024] + - [211, 3091.0] + - - [1024, 200, 1, 3328, 1024, 1024, 3328, 3328] + - [209, 2980.0] + - - [1024, 200, 1, 2080, 1024, 1024, 2080, 2080] + - [221, 3021.0] + - - [512, 200, 1, 1792, 512, 512, 1792, 1792] + - [209, 2825.0] + - - [1024, 256, 1, 1792, 1024, 1024, 1792, 1792] + - [256, 3447.0] + - - [1024, 200, 1, 7168, 1024, 1024, 7168, 7168] + - [209, 2999.0] + - - [512, 256, 1, 3072, 512, 512, 3072, 3072] + - [234, 3291.0] + - - [1024, 200, 1, 2048, 1024, 1024, 2048, 2048] + - [232, 2960.0] + - - [512, 512, 1, 1280, 512, 512, 1280, 1280] + - [234, 3339.0] + - - [1024, 200, 1, 1280, 1024, 1024, 1280, 1280] + - [209, 2873.0] + - - [512, 200, 1, 512, 512, 512, 512, 512] + - [232, 2439.0] + - - [1024, 256, 1, 2560, 1024, 1024, 2560, 2560] + - [234, 3409.0] + - - [1024, 200, 1, 1024, 1024, 1024, 1024, 1024] + - [232, 2860.0] + - - [1024, 256, 1, 3200, 1024, 1024, 3200, 3200] + - [247, 3430.0] + - - [512, 512, 1, 2560, 512, 512, 2560, 2560] + - [234, 3392.0] + - - [1024, 256, 1, 640, 1024, 1024, 640, 640] + - [256, 3380.0] + - - [1024, 256, 1, 3584, 1024, 1024, 3584, 3584] + - [211, 3402.0] + - - [512, 512, 1, 3200, 512, 512, 3200, 3200] + - [234, 3455.0] + - - [1024, 256, 1, 7680, 1024, 1024, 7680, 7680] + - [234, 3389.0] + - - [512, 200, 1, 1536, 512, 512, 1536, 1536] + - [209, 2781.0] + - - [512, 256, 1, 2816, 512, 512, 2816, 2816] + - [211, 3297.0] + - - [1024, 200, 1, 768, 1024, 1024, 768, 768] + - [232, 2837.0] + - - [512, 200, 1, 2048, 512, 512, 2048, 2048] + - [232, 2839.0] + - - [1024, 256, 1, 128, 1024, 1024, 128, 128] + - [211, 2512.0] + - - [1024, 200, 1, 4096, 1024, 1024, 4096, 4096] + - [232, 2984.0] + - - [1024, 256, 1, 1280, 1024, 1024, 1280, 1280] + - [234, 3429.0] + - - [1024, 200, 1, 896, 1024, 1024, 896, 896] + - [221, 3011.0] + - - [1024, 256, 1, 4608, 1024, 1024, 4608, 4608] + - [256, 3430.0] + - - [128, 1024, 1, 1024, 128, 128, 1024, 1024] + - [256, 3080.0] + - - [1024, 256, 1, 2048, 1024, 1024, 2048, 2048] + - [234, 3478.0] + - - [512, 256, 1, 1280, 512, 512, 1280, 1280] + - [211, 3125.0] + - - [256, 1024, 1, 2048, 256, 256, 2048, 2048] + - [211, 3397.0] + - - [512, 512, 1, 2048, 512, 512, 2048, 2048] + - [211, 3519.0] + - - [512, 256, 1, 512, 512, 512, 512, 512] + - [256, 2870.0] + - - [1024, 200, 1, 7680, 1024, 1024, 7680, 7680] + - [209, 2952.0] + - - [1024, 200, 1, 6656, 1024, 1024, 6656, 6656] + - [209, 3017.0] + - - [512, 200, 1, 1024, 512, 512, 1024, 1024] + - [209, 2694.0] + - - [1024, 256, 1, 3840, 1024, 1024, 3840, 3840] + - [256, 3401.0] + - - [512, 512, 1, 768, 512, 512, 768, 768] + - [211, 3269.0] + - - [1024, 256, 1, 64, 1024, 1024, 64, 64] + - [211, 1951.0] + - - [1024, 200, 1, 1920, 1024, 1024, 1920, 1920] + - [255, 2976.0] + - - [1024, 256, 1, 7168, 1024, 1024, 7168, 7168] + - [234, 3421.0] + - - [512, 512, 1, 1792, 512, 512, 1792, 1792] + - [211, 3371.0] + - - [1024, 200, 1, 256, 1024, 1024, 256, 256] + - [255, 2480.0] + - - [256, 1024, 1, 1024, 256, 256, 1024, 1024] + - [211, 3309.0] + - - [1024, 200, 1, 640, 1024, 1024, 640, 640] + - [245, 2840.0] + - - [1024, 200, 1, 4160, 1024, 1024, 4160, 4160] + - [255, 3051.0] + - - [1024, 200, 1, 5632, 1024, 1024, 5632, 5632] + - [209, 3003.0] + - - [1024, 256, 1, 6656, 1024, 1024, 6656, 6656] + - [234, 3428.0] + - - [1024, 256, 1, 768, 1024, 1024, 768, 768] + - [256, 3266.0] + - - [512, 256, 1, 2048, 512, 512, 2048, 2048] + - [234, 3269.0] + - - [1024, 200, 1, 3584, 1024, 1024, 3584, 3584] + - [232, 2994.0] + - - [1024, 256, 1, 1408, 1024, 1024, 1408, 1408] + - [223, 3500.0] + - - [1024, 256, 1, 4096, 1024, 1024, 4096, 4096] + - [234, 3407.0] + - - [1024, 128, 1, 289, 1024, 1024, 289, 289] + - [247, 2525.0] + - - [768, 192, 1, 289, 768, 768, 289, 289] + - [234, 2830.0] + - - [32, 32, 1984, 64, 32, 32, 64, 64] + - [200, 2274.0] + - - [54, 54, 1184, 64, 54, 54, 64, 64] + - [223, 2888.0] + - - [35, 35, 1808, 64, 35, 35, 64, 64] + - [247, 1285.0] + - - [45, 45, 1424, 64, 45, 45, 64, 64] + - [223, 2087.0] + - - [49, 49, 1296, 64, 49, 49, 64, 64] + - [198, 2426.0] + - - [59, 59, 1088, 64, 59, 59, 64, 64] + - [198, 3272.0] + - - [41, 41, 1552, 64, 41, 41, 64, 64] + - [211, 1746.0] + - - [38, 38, 1680, 64, 38, 38, 64, 64] + - [247, 1506.0] + - - [2048, 128, 1, 4096, 2048, 2048, 4096, 4096] + - [256, 3424.0] + - - [1024, 128, 1, 1024, 1024, 1024, 1024, 1024] + - [211, 3069.0] + - - [1152, 128, 1, 784, 1152, 1152, 784, 784] + - [198, 3457.0] + - - [864, 96, 1, 1225, 864, 864, 1225, 1225] + - [230, 2674.0] + - - [896, 192, 1, 289, 896, 896, 289, 289] + - [221, 2653.0] + - - [768, 128, 1, 289, 768, 768, 289, 289] + - [255, 2108.0] + - - [1344, 192, 1, 289, 1344, 1344, 289, 289] + - [223, 3010.0] + - - [384, 192, 1, 1225, 384, 384, 1225, 1225] + - [232, 2744.0] + - - [832, 192, 1, 49, 832, 832, 49, 49] + - [227, 1673.0] + - - [1280, 192, 1, 64, 1280, 1280, 64, 64] + - [219, 2376.0] + - - [512, 256, 1, 196, 512, 512, 196, 196] + - [211, 2234.0] + - - [864, 96, 1, 289, 864, 864, 289, 289] + - [253, 2292.0] + - - [896, 128, 1, 289, 896, 896, 289, 289] + - [255, 2838.0] + - - [1200, 64, 1, 1225, 1200, 1200, 1225, 1225] + - [245, 2787.0] + - - [1024, 256, 1, 289, 1024, 1024, 289, 289] + - [256, 3229.0] + - - [1024, 256, 1, 196, 1024, 1024, 196, 196] + - [223, 3187.0] + - - [1120, 192, 1, 289, 1120, 1120, 289, 289] + - [198, 3456.0] + - - [800, 96, 1, 784, 800, 800, 784, 784] + - [221, 2952.0] + - - [864, 128, 1, 784, 864, 864, 784, 784] + - [209, 3070.0] + - - [1344, 224, 1, 289, 1344, 1344, 289, 289] + - [245, 2969.0] + - - [1152, 192, 1, 784, 1152, 1152, 784, 784] + - [198, 3564.0] + - - [800, 128, 1, 196, 800, 800, 196, 196] + - [196, 1890.0] + - - [864, 208, 1, 196, 864, 864, 196, 196] + - [245, 2364.0] + - - [720, 192, 1, 5041, 720, 720, 5041, 5041] + - [223, 3523.0] + - - [576, 192, 1, 3136, 576, 576, 3136, 3136] + - [245, 3227.0] + - - [832, 256, 1, 49, 832, 832, 49, 49] + - [198, 1904.0] + - - [1200, 128, 1, 49, 1200, 1200, 49, 49] + - [211, 1107.0] + - - [528, 256, 1, 196, 528, 528, 196, 196] + - [198, 2312.0] + - - [256, 512, 1, 784, 256, 256, 784, 784] + - [234, 3073.0] + - - [480, 192, 1, 196, 480, 480, 196, 196] + - [194, 1694.0] + - - [96, 64, 36, 2592, 96, 96, 2592, 2592] + - [194, 3036.0] + - - [96, 96, 36, 2592, 96, 96, 2592, 2592] + - [243, 2995.0] + - - [1024, 192, 1, 289, 1024, 1024, 289, 289] + - [221, 2875.0] + - - [528, 160, 1, 196, 528, 528, 196, 196] + - [254, 1583.0] + - - [512, 160, 1, 196, 512, 512, 196, 196] + - [255, 1788.0] + - - [768, 160, 1, 289, 768, 768, 289, 289] + - [196, 2566.0] + - - [64, 32, 36, 43808, 64, 64, 43808, 43808] + - [196, 1812.0] + - - [832, 160, 1, 49, 832, 832, 49, 49] + - [197, 704.0] + - - [2048, 64, 1, 1001, 2048, 2048, 1001, 1001] + - [211, 3054.0] + - - [2048, 128, 1, 1001, 2048, 2048, 1001, 1001] + - [223, 3248.0] + - - [1536, 64, 1, 1001, 1536, 1536, 1001, 1001] + - [196, 2640.0] + - - [96, 96, 49, 3136, 96, 96, 3136, 3136] + - [234, 2406.0] + - - [64, 32, 49, 57600, 64, 64, 57600, 57600] + - [236, 1267.0] + - - [96, 64, 49, 6272, 96, 96, 6272, 6272] + - [211, 3006.0] + - - [64, 32, 49, 115200, 64, 64, 115200, 115200] + - [215, 1529.0] + - - [96, 96, 64, 2304, 96, 96, 2304, 2304] + - [241, 1409.0] + - - [96, 96, 49, 6272, 96, 96, 6272, 6272] + - [258, 2282.0] + - - [96, 64, 36, 5184, 96, 96, 5184, 5184] + - [194, 2998.0] + - - [64, 32, 64, 40000, 64, 64, 40000, 40000] + - [255, 1369.0] + - - [96, 64, 64, 4608, 96, 96, 4608, 4608] + - [256, 2409.0] + - - [96, 96, 36, 5184, 96, 96, 5184, 5184] + - [243, 3054.0] + - - [96, 64, 64, 2304, 96, 96, 2304, 2304] + - [240, 2257.0] + - - [96, 64, 49, 3136, 96, 96, 3136, 3136] + - [234, 3149.0] + - - [64, 32, 36, 87616, 64, 64, 87616, 87616] + - [207, 1547.0] + - - [64, 32, 64, 80000, 64, 64, 80000, 80000] + - [260, 1605.0] + - - [96, 96, 64, 4608, 96, 96, 4608, 4608] + - [213, 1205.0] + - - [64, 32, 36, 175232, 64, 64, 175232, 175232] + - [260, 1687.0] + - - [128, 128, 11, 3264, 128, 128, 3264, 3264] + - [221, 3126.0] + - - [192, 128, 11, 6528, 192, 192, 6528, 6528] + - [198, 2760.0] + - - [128, 128, 11, 6528, 128, 128, 6528, 6528] + - [221, 3180.0] + - - [160, 160, 9, 4896, 160, 160, 4896, 4896] + - [253, 3052.0] + - - [192, 160, 11, 6528, 192, 192, 6528, 6528] + - [203, 2638.0] + - - [192, 128, 9, 4896, 192, 192, 4896, 4896] + - [211, 3833.0] + - - [128, 128, 9, 4896, 128, 128, 4896, 4896] + - [211, 3838.0] + - - [192, 128, 11, 3264, 192, 192, 3264, 3264] + - [247, 3560.0] + - - [160, 160, 11, 3264, 160, 160, 3264, 3264] + - [243, 3178.0] + - - [192, 160, 9, 4896, 192, 192, 4896, 4896] + - [245, 3551.0] + - - [192, 160, 11, 3264, 192, 192, 3264, 3264] + - [198, 3547.0] + - - [160, 160, 11, 6528, 160, 160, 6528, 6528] + - [198, 2385.0] + - - [4096, 64, 1, 1024, 4096, 4096, 1024, 1024] + - [256, 3276.0] + - - [49, 49, 160, 64, 49, 49, 64, 64] + - [211, 1756.0] + - - [54, 54, 592, 64, 54, 54, 64, 64] + - [223, 2665.0] + - - [59, 59, 512, 64, 59, 59, 64, 64] + - [234, 3005.0] + - - [104, 104, 16, 64, 104, 104, 64, 64] + - [198, 1288.0] + - - [32, 32, 624, 64, 32, 32, 64, 64] + - [225, 1899.0] + - - [32, 32, 992, 64, 32, 32, 64, 64] + - [200, 2092.0] + - - [35, 35, 384, 64, 35, 35, 64, 64] + - [247, 1124.0] + - - [35, 35, 904, 64, 35, 35, 64, 64] + - [234, 1240.0] + - - [38, 38, 320, 64, 38, 38, 64, 64] + - [218, 1296.0] + - - [38, 38, 840, 64, 38, 38, 64, 64] + - [256, 1453.0] + - - [41, 41, 312, 64, 41, 41, 64, 64] + - [211, 1654.0] + - - [41, 41, 776, 64, 41, 41, 64, 64] + - [198, 1665.0] + - - [45, 45, 392, 64, 45, 45, 64, 64] + - [256, 1846.0] + - - [45, 45, 712, 64, 45, 45, 64, 64] + - [247, 1980.0] + - - [49, 49, 648, 64, 49, 49, 64, 64] + - [223, 2300.0] + - - [54, 54, 200, 64, 54, 54, 64, 64] + - [209, 2232.0] + - - [59, 59, 544, 64, 59, 59, 64, 64] + - [256, 3116.0] + - - [91, 91, 40, 64, 91, 91, 64, 64] + - [243, 1896.0] + - - [91, 93, 40, 64, 91, 91, 64, 64] + - [194, 1931.0] + - - [93, 93, 40, 64, 93, 93, 64, 64] + - [194, 1959.0] + - - [102, 102, 56, 64, 102, 102, 64, 64] + - [198, 2100.0] + - - [103, 103, 16, 64, 103, 103, 64, 64] + - [221, 1246.0] + - - [103, 104, 16, 64, 103, 103, 64, 64] + - [256, 1267.0] + - - [112, 112, 16, 64, 112, 112, 64, 64] + - [247, 1504.0] + - - [112, 123, 16, 64, 112, 112, 64, 64] + - [223, 1625.0] + - - [119, 119, 32, 64, 119, 119, 64, 64] + - [198, 2339.0] + - - [119, 135, 32, 64, 119, 119, 64, 64] + - [221, 2217.0] + - - [123, 123, 16, 64, 123, 123, 64, 64] + - [223, 1777.0] + - - [512, 512, 1, 512, 512, 512, 512, 512] + - [211, 3172.0] + - - [513, 512, 1, 512, 513, 513, 512, 512] + - [211, 3146.0] + - - [512, 512, 1, 513, 512, 512, 513, 513] + - [223, 3175.0] + - - [512, 512, 1, 511, 512, 512, 511, 511] + - [198, 3177.0] + - - [512, 513, 1, 512, 512, 512, 512, 512] + - [211, 3149.0] + - - [512, 511, 1, 512, 512, 512, 512, 512] + - [234, 3152.0] + - - [511, 512, 1, 512, 511, 511, 512, 512] + - [211, 3140.0] + - - [479, 512, 1, 512, 479, 479, 512, 512] + - [211, 2957.0] + - - [480, 511, 1, 512, 480, 480, 512, 512] + - [256, 2966.0] + - - [480, 512, 1, 511, 480, 480, 511, 511] + - [198, 2977.0] + - - [480, 512, 1, 513, 480, 480, 513, 513] + - [198, 2968.0] + - - [480, 513, 1, 512, 480, 480, 512, 512] + - [211, 2962.0] + - - [481, 512, 1, 512, 481, 481, 512, 512] + - [211, 2974.0] + - - [511, 480, 1, 512, 511, 511, 512, 512] + - [209, 3264.0] + - - [512, 479, 1, 512, 512, 512, 512, 512] + - [232, 3215.0] + - - [512, 480, 1, 511, 512, 512, 511, 511] + - [196, 3324.0] + - - [512, 480, 1, 513, 512, 512, 513, 513] + - [245, 3295.0] + - - [512, 481, 1, 512, 512, 512, 512, 512] + - [234, 2972.0] + - - [513, 480, 1, 512, 513, 513, 512, 512] + - [256, 2950.0] + - - [480, 512, 1, 512, 480, 480, 512, 512] + - [211, 2968.0] + - - [512, 480, 1, 512, 512, 512, 512, 512] + - [232, 3267.0] + - - [512, 512, 1, 64, 512, 512, 64, 64] + - [198, 2012.0] + - - [2048, 114, 1, 512, 2048, 2048, 512, 512] + - [256, 2841.0] + - - [2048, 114, 1, 768, 2048, 2048, 768, 768] + - [256, 2914.0] + - - [256, 684, 1, 1024, 256, 256, 1024, 1024] + - [255, 2938.0] + - - [33, 33, 1600, 32, 33, 33, 32, 32] + - [247, 1086.0] + - - [383, 384, 1, 384, 383, 383, 384, 384] + - [211, 2991.0] + - - [385, 384, 1, 384, 385, 385, 384, 384] + - [243, 2447.0] + - - [384, 383, 1, 384, 384, 384, 384, 384] + - [211, 3027.0] + - - [384, 385, 1, 384, 384, 384, 384, 384] + - [221, 2668.0] + - - [384, 384, 1, 383, 384, 384, 383, 383] + - [256, 3033.0] + - - [384, 384, 1, 385, 384, 384, 385, 385] + - [256, 3029.0] + - - [384, 384, 1, 384, 384, 384, 384, 384] + - [198, 3038.0] + - - [128, 64, 25, 6498, 128, 128, 6498, 6498] + - [251, 1964.0] + - - [128, 64, 25, 6859, 128, 128, 6859, 6859] + - [255, 3540.0] + - - [64, 64, 64, 3042, 64, 64, 3042, 3042] + - [198, 3340.0] + - - [64, 64, 64, 3211, 64, 64, 3211, 3211] + - [211, 3297.0] + - - [64, 64, 49, 4050, 64, 64, 4050, 4050] + - [209, 3345.0] + - - [64, 64, 49, 4275, 64, 64, 4275, 4275] + - [221, 3364.0] + - - [64, 64, 36, 6498, 64, 64, 6498, 6498] + - [234, 3526.0] + - - [64, 64, 36, 6859, 64, 64, 6859, 6859] + - [256, 3671.0] + - - [1152, 128, 1, 1444, 1152, 1152, 1444, 1444] + - [198, 3657.0] + - - [512, 256, 1, 361, 512, 512, 361, 361] + - [256, 2667.0] + - - [576, 128, 1, 1444, 576, 576, 1444, 1444] + - [221, 2806.0] + - - [1024, 308, 1, 1024, 1024, 1024, 1024, 1024] + - [234, 3866.0] + - - [1024, 160, 1, 1024, 1024, 1024, 1024, 1024] + - [209, 3320.0] + - - [1024, 180, 1, 1024, 1024, 1024, 1024, 1024] + - [209, 3028.0] + - - [32, 32, 4608, 64, 32, 32, 64, 64] + - [258, 2456.0] + - - [32, 35, 4608, 64, 32, 32, 64, 64] + - [194, 1815.0] + - - [34, 34, 4736, 64, 34, 34, 64, 64] + - [256, 1263.0] + - - [35, 35, 4608, 64, 35, 35, 64, 64] + - [256, 1335.0] + - - [128, 864, 1, 256, 128, 128, 256, 256] + - [196, 2492.0] + - - [256, 864, 1, 512, 256, 256, 512, 512] + - [211, 3374.0] + - - [512, 256, 1, 784, 512, 512, 784, 784] + - [256, 3075.0] + - - [1024, 96, 1, 1024, 1024, 1024, 1024, 1024] + - [207, 2645.0] + - - [1024, 256, 1, 3800, 1024, 1024, 3800, 3800] + - [234, 3440.0] + - - [1024, 256, 1, 3400, 1024, 1024, 3400, 3400] + - [234, 3435.0] + - - [256, 1024, 1, 3400, 256, 256, 3400, 3400] + - [198, 3525.0] + - - [1024, 256, 1, 3220, 1024, 1024, 3220, 3220] + - [256, 3439.0] + - - [256, 1024, 1, 3220, 256, 256, 3220, 3220] + - [211, 3421.0] + - - [1024, 256, 1, 3456, 1024, 1024, 3456, 3456] + - [223, 3436.0] + - - [256, 1024, 1, 3456, 256, 256, 3456, 3456] + - [211, 3431.0] + - - [256, 1024, 1, 3072, 256, 256, 3072, 3072] + - [211, 3414.0] + - - [1024, 256, 1, 3552, 1024, 1024, 3552, 3552] + - [223, 3456.0] + - - [256, 1024, 1, 3552, 256, 256, 3552, 3552] + - [223, 3481.0] + - - [256, 1024, 1, 2852, 256, 256, 2852, 2852] + - [211, 3423.0] + - - [1024, 256, 1, 2852, 1024, 1024, 2852, 2852] + - [247, 3432.0] + - - [256, 512, 1, 10752, 256, 256, 10752, 10752] + - [211, 3378.0] + - - [256, 1024, 1, 3800, 256, 256, 3800, 3800] + - [256, 3541.0] + - - [256, 512, 1, 10560, 256, 256, 10560, 10560] + - [247, 3435.0] + - - [256, 1024, 1, 2992, 256, 256, 2992, 2992] + - [211, 3568.0] + - - [256, 1024, 1, 2688, 256, 256, 2688, 2688] + - [211, 3554.0] + - - [1024, 256, 1, 2688, 1024, 1024, 2688, 2688] + - [223, 3430.0] + - - [256, 1024, 1, 2904, 256, 256, 2904, 2904] + - [211, 3417.0] + - - [1024, 256, 1, 2904, 1024, 1024, 2904, 2904] + - [234, 3443.0] + - - [256, 1024, 1, 2640, 256, 256, 2640, 2640] + - [198, 3430.0] + - - [1024, 256, 1, 2640, 1024, 1024, 2640, 2640] + - [247, 3433.0] + - - [1024, 256, 1, 4032, 1024, 1024, 4032, 4032] + - [247, 3586.0] + - - [1024, 256, 1, 2992, 1024, 1024, 2992, 2992] + - [256, 3444.0] + - - [256, 1024, 1, 3360, 256, 256, 3360, 3360] + - [198, 3449.0] + - - [1024, 256, 1, 3360, 1024, 1024, 3360, 3360] + - [256, 3457.0] + - - [1024, 256, 1, 3500, 1024, 1024, 3500, 3500] + - [247, 3434.0] + - - [256, 1024, 1, 3500, 256, 256, 3500, 3500] + - [211, 3425.0] + - - [1024, 256, 1, 3168, 1024, 1024, 3168, 3168] + - [247, 3451.0] + - - [256, 1024, 1, 3168, 256, 256, 3168, 3168] + - [223, 3448.0] + - - [256, 1024, 1, 3036, 256, 256, 3036, 3036] + - [247, 3530.0] + - - [1024, 256, 1, 4200, 1024, 1024, 4200, 4200] + - [256, 3440.0] + - - [1024, 256, 1, 3600, 1024, 1024, 3600, 3600] + - [223, 3450.0] + - - [256, 1024, 1, 3600, 256, 256, 3600, 3600] + - [211, 3441.0] + - - [256, 1024, 1, 2944, 256, 256, 2944, 2944] + - [247, 3517.0] + - - [1024, 256, 1, 2944, 1024, 1024, 2944, 2944] + - [223, 3429.0] + - - [1024, 256, 1, 3700, 1024, 1024, 3700, 3700] + - [247, 3434.0] + - - [256, 1024, 1, 2352, 256, 256, 2352, 2352] + - [198, 3436.0] + - - [1024, 256, 1, 2352, 1024, 1024, 2352, 2352] + - [234, 3528.0] + - - [256, 1024, 1, 3700, 256, 256, 3700, 3700] + - [198, 3546.0] + - - [256, 1024, 1, 2816, 256, 256, 2816, 2816] + - [234, 3384.0] + - - [256, 512, 1, 11408, 256, 256, 11408, 11408] + - [198, 3425.0] + - - [1024, 256, 1, 3036, 1024, 1024, 3036, 3036] + - [256, 3526.0] + - - [1024, 256, 1, 3264, 1024, 1024, 3264, 3264] + - [198, 3559.0] + - - [256, 1024, 1, 3264, 256, 256, 3264, 3264] + - [211, 3445.0] + - - [1024, 256, 1, 3864, 1024, 1024, 3864, 3864] + - [256, 3448.0] + - - [256, 1024, 1, 4032, 256, 256, 4032, 4032] + - [223, 3481.0] + - - [1024, 256, 1, 3128, 1024, 1024, 3128, 3128] + - [247, 3434.0] + - - [256, 1024, 1, 3128, 256, 256, 3128, 3128] + - [223, 3528.0] + - - [256, 1024, 1, 3200, 256, 256, 3200, 3200] + - [198, 3430.0] + - - [256, 512, 1, 11616, 256, 256, 11616, 11616] + - [247, 3444.0] + - - [1024, 256, 1, 4000, 1024, 1024, 4000, 4000] + - [256, 3466.0] + - - [256, 1024, 1, 2520, 256, 256, 2520, 2520] + - [198, 3409.0] + - - [1024, 256, 1, 2520, 1024, 1024, 2520, 2520] + - [247, 3437.0] + - - [256, 1024, 1, 2976, 256, 256, 2976, 2976] + - [198, 3565.0] + - - [256, 1024, 1, 2400, 256, 256, 2400, 2400] + - [198, 3431.0] + - - [1024, 256, 1, 2400, 1024, 1024, 2400, 2400] + - [247, 3444.0] + - - [1024, 256, 1, 3696, 1024, 1024, 3696, 3696] + - [247, 3449.0] + - - [1024, 256, 1, 3900, 1024, 1024, 3900, 3900] + - [256, 3444.0] + - - [1024, 256, 1, 3772, 1024, 1024, 3772, 3772] + - [234, 3457.0] + - - [256, 1024, 1, 3696, 256, 256, 3696, 3696] + - [256, 3561.0] + - - [256, 1024, 1, 2728, 256, 256, 2728, 2728] + - [211, 3411.0] + - - [1024, 256, 1, 2728, 1024, 1024, 2728, 2728] + - [223, 3555.0] + - - [1024, 256, 1, 2480, 1024, 1024, 2480, 2480] + - [223, 3439.0] + - - [256, 1024, 1, 2480, 256, 256, 2480, 2480] + - [211, 3558.0] + - - [1024, 256, 1, 2880, 1024, 1024, 2880, 2880] + - [223, 3446.0] + - - [512, 256, 1, 3220, 512, 512, 3220, 3220] + - [256, 3302.0] + - - [256, 1024, 1, 2880, 256, 256, 2880, 2880] + - [234, 3558.0] + - - [256, 1024, 1, 4200, 256, 256, 4200, 4200] + - [211, 3443.0] + - - [1024, 256, 1, 3648, 1024, 1024, 3648, 3648] + - [223, 3583.0] + - - [1024, 256, 1, 3312, 1024, 1024, 3312, 3312] + - [234, 3553.0] + - - [256, 1024, 1, 3648, 256, 256, 3648, 3648] + - [223, 3453.0] + - - [1024, 256, 1, 3300, 1024, 1024, 3300, 3300] + - [223, 3554.0] + - - [1024, 256, 1, 3528, 1024, 1024, 3528, 3528] + - [247, 3441.0] + - - [256, 1024, 1, 2604, 256, 256, 2604, 2604] + - [247, 3510.0] + - - [1024, 256, 1, 2604, 1024, 1024, 2604, 2604] + - [234, 3419.0] + - - [512, 256, 1, 11408, 512, 512, 11408, 11408] + - [247, 3412.0] + - - [256, 1024, 1, 3312, 256, 256, 3312, 3312] + - [211, 3442.0] + - - [256, 1024, 1, 3300, 256, 256, 3300, 3300] + - [211, 3411.0] + - - [256, 1024, 1, 3528, 256, 256, 3528, 3528] + - [198, 3437.0] + - - [1024, 256, 1, 2976, 1024, 1024, 2976, 2976] + - [256, 3560.0] + - - [1024, 256, 1, 2760, 1024, 1024, 2760, 2760] + - [247, 3539.0] + - - [512, 256, 1, 3800, 512, 512, 3800, 3800] + - [211, 3313.0] + - - [256, 1024, 1, 2760, 256, 256, 2760, 2760] + - [234, 3515.0] + - - [1024, 256, 1, 2160, 1024, 1024, 2160, 2160] + - [234, 3505.0] + - - [256, 1024, 1, 2160, 256, 256, 2160, 2160] + - [198, 3435.0] + - - [512, 256, 1, 11616, 512, 512, 11616, 11616] + - [247, 3446.0] + - - [512, 256, 1, 2852, 512, 512, 2852, 2852] + - [256, 3280.0] + - - [256, 1024, 1, 3864, 256, 256, 3864, 3864] + - [211, 3435.0] + - - [512, 256, 1, 2640, 512, 512, 2640, 2640] + - [198, 3309.0] + - - [256, 1024, 1, 4000, 256, 256, 4000, 4000] + - [211, 3456.0] + - - [512, 256, 1, 2904, 512, 512, 2904, 2904] + - [223, 3276.0] + - - [256, 1024, 1, 3900, 256, 256, 3900, 3900] + - [247, 3431.0] + - - [512, 256, 1, 2688, 512, 512, 2688, 2688] + - [223, 3310.0] + - - [256, 1024, 1, 3772, 256, 256, 3772, 3772] + - [211, 3436.0] + - - [512, 256, 1, 3400, 512, 512, 3400, 3400] + - [234, 3296.0] + - - [512, 256, 1, 3456, 512, 512, 3456, 3456] + - [247, 3392.0] + - - [512, 256, 1, 3552, 512, 512, 3552, 3552] + - [247, 3367.0] + - - [29000, 35, 1, 2560, 29000, 29000, 2560, 2560] + - [216, 1870.0] + - - [29000, 36, 1, 2560, 29000, 29000, 2560, 2560] + - [216, 1755.0] + - - [29000, 39, 1, 2560, 29000, 29000, 2560, 2560] + - [234, 1928.0] + - - [29000, 40, 1, 2560, 29000, 29000, 2560, 2560] + - [234, 2001.0] + - - [29000, 42, 1, 2560, 29000, 29000, 2560, 2560] + - [247, 2166.0] + - - [29000, 43, 1, 2560, 29000, 29000, 2560, 2560] + - [234, 2128.0] + - - [29000, 44, 1, 2560, 29000, 29000, 2560, 2560] + - [256, 2175.0] + - - [29000, 46, 1, 2560, 29000, 29000, 2560, 2560] + - [216, 2244.0] + - - [29000, 48, 1, 2560, 29000, 29000, 2560, 2560] + - [256, 2356.0] + - - [29000, 49, 1, 2560, 29000, 29000, 2560, 2560] + - [256, 2408.0] + - - [29000, 50, 1, 2560, 29000, 29000, 2560, 2560] + - [234, 2471.0] + - - [29000, 51, 1, 2560, 29000, 29000, 2560, 2560] + - [256, 2475.0] + - - [29000, 53, 1, 2560, 29000, 29000, 2560, 2560] + - [256, 2593.0] + - - [29000, 54, 1, 2560, 29000, 29000, 2560, 2560] + - [234, 2616.0] + - - [29000, 55, 1, 2560, 29000, 29000, 2560, 2560] + - [234, 2390.0] + - - [29000, 56, 1, 2560, 29000, 29000, 2560, 2560] + - [234, 2643.0] + - - [29000, 57, 1, 2560, 29000, 29000, 2560, 2560] + - [234, 2779.0] + - - [29000, 58, 1, 2560, 29000, 29000, 2560, 2560] + - [234, 2815.0] + - - [29000, 59, 1, 2560, 29000, 29000, 2560, 2560] + - [256, 2777.0] + - - [29000, 61, 1, 2560, 29000, 29000, 2560, 2560] + - [234, 3032.0] + - - [29000, 63, 1, 2560, 29000, 29000, 2560, 2560] + - [234, 3059.0] + - - [288, 64, 1, 21609, 288, 288, 21609, 21609] + - [284, 2774.0] + - - [32, 32, 36, 43808, 32, 32, 43808, 43808] + - [253, 1682.0] + - - [32, 32, 64, 40000, 32, 32, 40000, 40000] + - [279, 1827.0] + - - [32, 32, 49, 115200, 32, 32, 115200, 115200] + - [279, 1205.0] + - - [32, 32, 36, 175232, 32, 32, 175232, 175232] + - [271, 1266.0] + - - [32, 32, 49, 57600, 32, 32, 57600, 57600] + - [266, 884.0] + - - [32, 32, 36, 87616, 32, 32, 87616, 87616] + - [266, 947.0] + - - [32, 32, 64, 80000, 32, 32, 80000, 80000] + - [271, 1239.0] + - - [256, 128, 1, 13600, 256, 256, 13600, 13600] + - [278, 3140.0] + - - [256, 128, 1, 12880, 256, 256, 12880, 12880] + - [278, 3193.0] + - - [128, 512, 1, 15200, 128, 128, 15200, 15200] + - [275, 3404.0] + - - [512, 128, 1, 15200, 512, 512, 15200, 15200] + - [282, 3383.0] + - - [128, 512, 1, 11408, 128, 128, 11408, 11408] + - [284, 3424.0] + - - [256, 128, 1, 13824, 256, 256, 13824, 13824] + - [278, 3127.0] + - - [128, 512, 1, 11616, 128, 128, 11616, 11616] + - [278, 3395.0] + - - [256, 128, 1, 14208, 256, 256, 14208, 14208] + - [278, 3178.0] + - - [128, 512, 1, 14208, 128, 128, 14208, 14208] + - [282, 3380.0] + - - [256, 128, 1, 15200, 256, 256, 15200, 15200] + - [278, 3293.0] + - - [512, 128, 1, 11408, 512, 512, 11408, 11408] + - [275, 3378.0] + - - [512, 128, 1, 16800, 512, 512, 16800, 16800] + - [275, 3397.0] + - - [128, 512, 1, 11264, 128, 128, 11264, 11264] + - [278, 3397.0] + - - [512, 128, 1, 11616, 512, 512, 11616, 11616] + - [264, 3359.0] + - - [512, 128, 1, 16128, 512, 512, 16128, 16128] + - [275, 3352.0] + - - [512, 128, 1, 11968, 512, 512, 11968, 11968] + - [282, 3531.0] + - - [128, 512, 1, 11968, 128, 128, 11968, 11968] + - [278, 3526.0] + - - [512, 128, 1, 12288, 512, 512, 12288, 12288] + - [278, 3430.0] + - - [128, 512, 1, 12288, 128, 128, 12288, 12288] + - [278, 3361.0] + - - [128, 512, 1, 12672, 128, 128, 12672, 12672] + - [278, 3387.0] + - - [512, 128, 1, 11776, 512, 512, 11776, 11776] + - [284, 3345.0] + - - [512, 128, 1, 12144, 512, 512, 12144, 12144] + - [278, 3382.0] + - - [512, 128, 1, 11264, 512, 512, 11264, 11264] + - [282, 3279.0] + - - [128, 512, 1, 12144, 128, 128, 12144, 12144] + - [269, 3422.0] + - - [512, 128, 1, 12672, 512, 512, 12672, 12672] + - [282, 3501.0] + - - [128, 512, 1, 12512, 128, 128, 12512, 12512] + - [284, 3404.0] + - - [128, 512, 1, 11776, 128, 128, 11776, 11776] + - [284, 3380.0] + - - [256, 128, 1, 12288, 256, 256, 12288, 12288] + - [278, 2919.0] + - - [40, 40, 1, 1909283, 40, 40, 1909283, 1909283] + - [276, 432.0] + - - [40, 40, 1, 3818566, 40, 40, 3818566, 3818566] + - [272, 460.0] + - - [30522, 20, 1, 1024, 30522, 30522, 1024, 1024] + - [231, 1273.0] + - - [1760, 32, 1, 1760, 1760, 1760, 1760, 1760] + - [286, 2048.0] + - - [3584, 4, 1, 1280, 3584, 3584, 1280, 1280] + - [294, 562.0] + - - [2944, 4, 1, 256, 2944, 2944, 256, 256] + - [287, 321.0] + - - [5056, 4, 1, 3328, 5056, 5056, 3328, 3328] + - [295, 667.0] + - - [1760, 16, 1, 1760, 1760, 1760, 1760, 1760] + - [208, 1411.0] + - - [2368, 4, 1, 1280, 2368, 2368, 1280, 1280] + - [287, 491.0] + - - [6784, 4, 1, 1280, 6784, 6784, 1280, 1280] + - [298, 586.0] + - - [1856, 4, 1, 1280, 1856, 1856, 1280, 1280] + - [287, 407.0] + - - [2944, 4, 1, 128, 2944, 2944, 128, 128] + - [287, 231.0] + - - [3584, 4, 1, 128, 3584, 3584, 128, 128] + - [294, 267.0] + - - [8448, 16, 1, 2816, 8448, 8448, 2816, 2816] + - [291, 1727.0] + - - [2368, 4, 1, 256, 2368, 2368, 256, 256] + - [294, 283.0] + - - [5888, 4, 1, 128, 5888, 5888, 128, 128] + - [287, 363.0] + - - [4288, 4, 1, 256, 4288, 4288, 256, 256] + - [294, 375.0] + - - [3584, 4, 1, 3328, 3584, 3584, 3328, 3328] + - [298, 658.0] + - - [2048, 16, 1, 2048, 2048, 2048, 2048, 2048] + - [300, 1481.0] + - - [1408, 4, 1, 256, 1408, 1408, 256, 256] + - [299, 224.0] + - - [4288, 4, 1, 3328, 4288, 4288, 3328, 3328] + - [298, 611.0] + - - [2368, 4, 1, 3328, 2368, 2368, 3328, 3328] + - [294, 557.0] + - - [5056, 4, 1, 1280, 5056, 5056, 1280, 1280] + - [295, 609.0] + - - [3072, 16, 1, 1024, 3072, 3072, 1024, 1024] + - [254, 1634.0] + - - [1408, 4, 1, 3328, 1408, 1408, 3328, 3328] + - [287, 368.0] + - - [6144, 16, 1, 2560, 6144, 6144, 2560, 2560] + - [289, 2167.0] + - - [4096, 16, 1, 4096, 4096, 4096, 4096, 4096] + - [296, 1737.0] + - - [1856, 4, 1, 256, 1856, 1856, 256, 256] + - [294, 344.0] + - - [6784, 4, 1, 128, 6784, 6784, 128, 128] + - [290, 452.0] + - - [4288, 4, 1, 128, 4288, 4288, 128, 128] + - [297, 353.0] + - - [5888, 4, 1, 3328, 5888, 5888, 3328, 3328] + - [301, 448.0] + - - [5056, 4, 1, 128, 5056, 5056, 128, 128] + - [299, 349.0] + - - [5888, 4, 1, 1280, 5888, 5888, 1280, 1280] + - [294, 568.0] + - - [2944, 4, 1, 3328, 2944, 2944, 3328, 3328] + - [294, 544.0] + - - [2368, 4, 1, 128, 2368, 2368, 128, 128] + - [298, 198.0] + - - [1856, 4, 1, 128, 1856, 1856, 128, 128] + - [287, 159.0] + - - [2560, 16, 1, 2560, 2560, 2560, 2560, 2560] + - [289, 1879.0] + - - [7680, 16, 1, 2560, 7680, 7680, 2560, 2560] + - [303, 1641.0] + - - [1408, 4, 1, 1280, 1408, 1408, 1280, 1280] + - [298, 350.0] + - - [6784, 4, 1, 256, 6784, 6784, 256, 256] + - [288, 443.0] + - - [1856, 4, 1, 3328, 1856, 1856, 3328, 3328] + - [294, 475.0] + - - [3584, 4, 1, 256, 3584, 3584, 256, 256] + - [287, 360.0] + - - [6784, 4, 1, 3328, 6784, 6784, 3328, 3328] + - [302, 370.0] + - - [2048, 32, 1, 2048, 2048, 2048, 2048, 2048] + - [289, 1834.0] + - - [1408, 4, 1, 128, 1408, 1408, 128, 128] + - [287, 121.0] + - - [5056, 4, 1, 256, 5056, 5056, 256, 256] + - [298, 424.0] + - - [4288, 4, 1, 1280, 4288, 4288, 1280, 1280] + - [287, 553.0] + - - [4608, 16, 1, 1536, 4608, 4608, 1536, 1536] + - [289, 1951.0] + - - [2944, 4, 1, 1280, 2944, 2944, 1280, 1280] + - [298, 501.0] + - - [5888, 4, 1, 256, 5888, 5888, 256, 256] + - [287, 422.0] + - - [2048, 32, 1, 1001, 2048, 2048, 1001, 1001] + - [286, 1753.0] + - - [1536, 32, 1, 1001, 1536, 1536, 1001, 1001] + - [292, 1702.0] + - - [1600, 1, 1, 1024, 1600, 1600, 1024, 1024] + - [287, 87.0] + - - [32768, 1, 1, 256, 32768, 32768, 256, 256] + - [290, 155.0] + - - [2048, 2, 1, 2048, 2048, 2048, 2048, 2048] + - [287, 234.0] + - - [2560, 4, 1, 2560, 2560, 2560, 2560, 2560] + - [294, 555.0] + - - [3456, 1, 1, 256, 3456, 3456, 256, 256] + - [287, 88.0] + - - [4096, 1, 1, 256, 4096, 4096, 256, 256] + - [294, 95.0] + - - [6912, 1, 1, 256, 6912, 6912, 256, 256] + - [299, 112.0] + - - [2048, 8, 1, 2048, 2048, 2048, 2048, 2048] + - [294, 916.0] + - - [2560, 2, 1, 2560, 2560, 2560, 2560, 2560] + - [294, 289.0] + - - [29000, 27, 1, 2560, 29000, 29000, 2560, 2560] + - [293, 1363.0] + - - [4, 1856, 1, 3328, 4, 4, 3328, 3328] + - [252, 436.0] + - - [4, 1408, 1, 128, 4, 4, 128, 128] + - [304, 125.0] + - - [4, 2368, 1, 1280, 4, 4, 1280, 1280] + - [304, 461.0] + - - [4, 3584, 1, 128, 4, 4, 128, 128] + - [307, 319.0] + - - [4, 5888, 1, 3328, 4, 4, 3328, 3328] + - [311, 413.0] + - - [4, 1408, 1, 3328, 4, 4, 3328, 3328] + - [206, 352.0] + - - [4, 6784, 1, 3328, 4, 4, 3328, 3328] + - [306, 386.0] + - - [4, 4288, 1, 128, 4, 4, 128, 128] + - [304, 299.0] + - - [4, 6784, 1, 1280, 4, 4, 1280, 1280] + - [305, 551.0] + - - [4, 2944, 1, 3328, 4, 4, 3328, 3328] + - [305, 485.0] + - - [4, 5056, 1, 256, 4, 4, 256, 256] + - [304, 410.0] + - - [4, 5056, 1, 1280, 4, 4, 1280, 1280] + - [308, 562.0] + - - [4, 2368, 1, 3328, 4, 4, 3328, 3328] + - [308, 518.0] + - - [4, 1856, 1, 256, 4, 4, 256, 256] + - [304, 241.0] + - - [4, 2368, 1, 256, 4, 4, 256, 256] + - [308, 283.0] + - - [4, 2944, 1, 256, 4, 4, 256, 256] + - [305, 317.0] + - - [4, 4288, 1, 1280, 4, 4, 1280, 1280] + - [305, 529.0] + - - [4, 6784, 1, 128, 4, 4, 128, 128] + - [305, 372.0] + - - [4, 3584, 1, 1280, 4, 4, 1280, 1280] + - [304, 517.0] + - - [4, 5888, 1, 256, 4, 4, 256, 256] + - [308, 418.0] + - - [4, 6784, 1, 256, 4, 4, 256, 256] + - [304, 396.0] + - - [4, 1408, 1, 1280, 4, 4, 1280, 1280] + - [206, 309.0] + - - [4, 3584, 1, 256, 4, 4, 256, 256] + - [308, 369.0] + - - [4, 2944, 1, 1280, 4, 4, 1280, 1280] + - [206, 441.0] + - - [4, 1408, 1, 256, 4, 4, 256, 256] + - [206, 185.0] + - - [4, 4288, 1, 3328, 4, 4, 3328, 3328] + - [305, 565.0] + - - [4, 5888, 1, 1280, 4, 4, 1280, 1280] + - [304, 570.0] + - - [4, 1856, 1, 1280, 4, 4, 1280, 1280] + - [252, 389.0] + - - [4, 1856, 1, 128, 4, 4, 128, 128] + - [304, 166.0] + - - [4, 2944, 1, 128, 4, 4, 128, 128] + - [305, 225.0] + - - [4, 5056, 1, 3328, 4, 4, 3328, 3328] + - [305, 620.0] + - - [4, 5056, 1, 128, 4, 4, 128, 128] + - [308, 329.0] + - - [4, 4288, 1, 256, 4, 4, 256, 256] + - [308, 384.0] + - - [4, 3584, 1, 3328, 4, 4, 3328, 3328] + - [308, 571.0] + - - [4, 5888, 1, 128, 4, 4, 128, 128] + - [305, 346.0] + - - [4, 2368, 1, 128, 4, 4, 128, 128] + - [308, 189.0] + - - [32, 1600, 1, 512, 32, 32, 512, 512] + - [310, 1608.0] + - - [2, 2048, 1, 1024, 2, 2, 1024, 1024] + - [304, 196.0] + - - [1, 4096, 1, 256, 1, 1, 256, 256] + - [304, 93.0] + - - [1, 6912, 1, 256, 1, 1, 256, 256] + - [305, 118.0] + - - [2, 2048, 1, 768, 2, 2, 768, 768] + - [309, 200.0] + - - [2, 4608, 1, 768, 2, 2, 768, 768] + - [305, 256.0] + - - [2, 4608, 1, 1024, 2, 2, 1024, 1024] + - [304, 262.0] + - - [1024, 16, 1, 500000, 1024, 1024, 500000, 500000] + - [277, 1188.0] + - - [1024, 8, 1, 500000, 1024, 1024, 500000, 500000] + - [262, 651.0] + - - [512, 16, 1, 500000, 512, 512, 500000, 500000] + - [285, 985.0] + - - [512, 8, 1, 500000, 512, 512, 500000, 500000] + - [265, 483.0] + - - [64, 80, 1, 5329, 64, 64, 5329, 5329] + - [270, 879.0] + - - [576, 96, 1, 5329, 576, 576, 5329, 5329] + - [281, 2993.0] + - - [288, 32, 1, 21609, 288, 288, 21609, 21609] + - [262, 1757.0] + - - [576, 96, 1, 5041, 576, 576, 5041, 5041] + - [274, 2943.0] + - - [27, 32, 1, 22201, 27, 27, 22201, 22201] + - [276, 308.0] + - - [160, 64, 1, 5329, 160, 160, 5329, 5329] + - [280, 1606.0] + - - [448, 64, 1, 5329, 448, 448, 5329, 5329] + - [263, 2877.0] + - - [147, 64, 1, 12544, 147, 147, 12544, 12544] + - [273, 1566.0] + - - [147, 64, 1, 22500, 147, 147, 22500, 22500] + - [280, 1823.0] + - - [576, 64, 1, 5625, 576, 576, 5625, 5625] + - [274, 2831.0] + - - [256, 128, 1, 10752, 256, 256, 10752, 10752] + - [274, 2657.0] + - - [256, 128, 1, 10560, 256, 256, 10560, 10560] + - [268, 2880.0] + - - [256, 128, 1, 11408, 256, 256, 11408, 11408] + - [274, 2750.0] + - - [256, 12, 1, 11408, 256, 256, 11408, 11408] + - [283, 789.0] + - - [256, 128, 1, 11616, 256, 256, 11616, 11616] + - [274, 2965.0] + - - [256, 12, 1, 11616, 256, 256, 11616, 11616] + - [283, 840.0] + - - [256, 12, 1, 12288, 256, 256, 12288, 12288] + - [270, 797.0] + - - [11, 11, 1, 1909283, 11, 11, 1909283, 1909283] + - [267, 39.0] + - - [11, 11, 1, 3818566, 11, 11, 3818566, 3818566] + - [272, 42.0] + - - [768, 32, 1, 768, 768, 768, 768, 768] + - [199, 1224.0] + - - [768, 64, 1, 768, 768, 768, 768, 768] + - [207, 1751.0] + - - [1024, 80, 1, 1024, 1024, 1024, 1024, 1024] + - [207, 2187.0] + - - [1024, 20, 1, 1024, 1024, 1024, 1024, 1024] + - [206, 971.0] + - - [768, 16, 1, 768, 768, 768, 768, 768] + - [212, 693.0] + - - [1024, 4, 1, 1024, 1024, 1024, 1024, 1024] + - [235, 242.0] + - - [1024, 6, 1, 1024, 1024, 1024, 1024, 1024] + - [235, 358.0] + - - [4, 704, 1, 1280, 4, 4, 1280, 1280] + - [257, 189.0] + - - [128, 64, 1, 256, 128, 128, 256, 256] + - [254, 325.0] + - - [128, 448, 1, 1280, 128, 128, 1280, 1280] + - [253, 2375.0] + - - [64, 4, 1, 256, 64, 64, 256, 256] + - [193, 9.0] + - - [64, 704, 1, 128, 64, 64, 128, 128] + - [206, 869.0] + - - [448, 64, 1, 1280, 448, 448, 1280, 1280] + - [229, 1605.0] + - - [128, 4, 1, 1280, 128, 128, 1280, 1280] + - [212, 39.0] + - - [64, 1024, 1, 1280, 64, 64, 1280, 1280] + - [211, 2562.0] + - - [64, 704, 1, 1280, 64, 64, 1280, 1280] + - [253, 1885.0] + - - [1024, 64, 1, 128, 1024, 1024, 128, 128] + - [256, 1178.0] + - - [1024, 64, 1, 1280, 1024, 1024, 1280, 1280] + - [256, 2513.0] + - - [4, 704, 1, 256, 4, 4, 256, 256] + - [212, 102.0] + - - [704, 4, 1, 1280, 704, 704, 1280, 1280] + - [257, 184.0] + - - [448, 128, 1, 128, 448, 448, 128, 128] + - [234, 1086.0] + - - [256, 256, 1, 3328, 256, 256, 3328, 3328] + - [211, 2739.0] + - - [4, 64, 1, 1280, 4, 4, 1280, 1280] + - [217, 19.0] + - - [64, 64, 1, 3328, 64, 64, 3328, 3328] + - [217, 372.0] + - - [128, 256, 1, 3328, 128, 128, 3328, 3328] + - [252, 1785.0] + - - [64, 448, 1, 1280, 64, 64, 1280, 1280] + - [252, 1617.0] + - - [448, 4, 1, 256, 448, 448, 256, 256] + - [229, 66.0] + - - [128, 4, 1, 128, 128, 128, 128, 128] + - [220, 12.0] + - - [256, 4, 1, 128, 256, 256, 128, 128] + - [195, 23.0] + - - [704, 64, 1, 3328, 704, 704, 3328, 3328] + - [255, 1921.0] + - - [256, 64, 1, 1280, 256, 256, 1280, 1280] + - [229, 1088.0] + - - [704, 64, 1, 128, 704, 704, 128, 128] + - [218, 846.0] + - - [1024, 4, 1, 256, 1024, 1024, 256, 256] + - [235, 139.0] + - - [256, 256, 1, 128, 256, 256, 128, 128] + - [211, 1172.0] + - - [64, 256, 1, 128, 64, 64, 128, 128] + - [222, 377.0] + - - [704, 64, 1, 1280, 704, 704, 1280, 1280] + - [242, 1839.0] + - - [128, 448, 1, 256, 128, 128, 256, 256] + - [211, 1492.0] + - - [128, 256, 1, 1280, 128, 128, 1280, 1280] + - [252, 1613.0] + - - [448, 64, 1, 3328, 448, 448, 3328, 3328] + - [218, 1775.0] + - - [256, 128, 1, 128, 256, 256, 128, 128] + - [252, 692.0] + - - [64, 128, 1, 3328, 64, 64, 3328, 3328] + - [212, 684.0] + - - [128, 128, 1, 3328, 128, 128, 3328, 3328] + - [193, 1273.0] + - - [256, 128, 1, 256, 256, 256, 256, 256] + - [244, 1192.0] + - - [64, 448, 1, 3328, 64, 64, 3328, 3328] + - [252, 1822.0] + - - [1024, 4, 1, 3328, 1024, 1024, 3328, 3328] + - [235, 296.0] + - - [4, 4, 1, 256, 4, 4, 256, 256] + - [193, 1.0] + - - [256, 64, 1, 256, 256, 256, 256, 256] + - [244, 797.0] + - - [256, 128, 1, 1280, 256, 256, 1280, 1280] + - [229, 1608.0] + - - [128, 64, 1, 1280, 128, 128, 1280, 1280] + - [206, 596.0] + - - [4, 448, 1, 3328, 4, 4, 3328, 3328] + - [252, 150.0] + - - [64, 1024, 1, 256, 64, 64, 256, 256] + - [211, 1705.0] + - - [256, 4, 1, 1280, 256, 256, 1280, 1280] + - [212, 77.0] + - - [64, 704, 1, 256, 64, 64, 256, 256] + - [206, 1189.0] + - - [4, 704, 1, 128, 4, 4, 128, 128] + - [206, 62.0] + - - [448, 128, 1, 256, 448, 448, 256, 256] + - [230, 1486.0] + - - [448, 64, 1, 128, 448, 448, 128, 128] + - [254, 637.0] + - - [4, 1024, 1, 1280, 4, 4, 1280, 1280] + - [206, 251.0] + - - [4, 448, 1, 1280, 4, 4, 1280, 1280] + - [229, 127.0] + - - [448, 4, 1, 1280, 448, 448, 1280, 1280] + - [235, 129.0] + - - [256, 256, 1, 256, 256, 256, 256, 256] + - [234, 1691.0] + - - [256, 64, 1, 128, 256, 256, 128, 128] + - [254, 381.0] + - - [4, 1024, 1, 3328, 4, 4, 3328, 3328] + - [218, 286.0] + - - [64, 128, 1, 128, 64, 64, 128, 128] + - [218, 201.0] + - - [704, 4, 1, 128, 704, 704, 128, 128] + - [229, 64.0] + - - [256, 4, 1, 256, 256, 256, 256, 256] + - [208, 37.0] + - - [256, 4, 1, 3328, 256, 256, 3328, 3328] + - [204, 92.0] + - - [4, 256, 1, 256, 4, 4, 256, 256] + - [229, 38.0] + - - [4, 4, 1, 128, 4, 4, 128, 128] + - [195, 0.36] + - - [4, 128, 1, 256, 4, 4, 256, 256] + - [195, 19.0] + - - [64, 64, 1, 1280, 64, 64, 1280, 1280] + - [257, 307.0] + - - [448, 128, 1, 3328, 448, 448, 3328, 3328] + - [230, 2488.0] + - - [64, 448, 1, 256, 64, 64, 256, 256] + - [254, 958.0] + - - [4, 448, 1, 128, 4, 4, 128, 128] + - [206, 43.0] + - - [64, 256, 1, 1280, 64, 64, 1280, 1280] + - [252, 1123.0] + - - [64, 128, 1, 1280, 64, 64, 1280, 1280] + - [229, 597.0] + - - [64, 4, 1, 128, 64, 64, 128, 128] + - [193, 6.0] + - - [64, 64, 1, 256, 64, 64, 256, 256] + - [229, 159.0] + - - [4, 704, 1, 3328, 4, 4, 3328, 3328] + - [235, 214.0] + - - [4, 4, 1, 1280, 4, 4, 1280, 1280] + - [193, 1.0] + - - [128, 128, 1, 128, 128, 128, 128, 128] + - [208, 384.0] + - - [1024, 4, 1, 128, 1024, 1024, 128, 128] + - [195, 89.0] + - - [4, 64, 1, 128, 4, 4, 128, 128] + - [195, 6.0] + - - [64, 1024, 1, 128, 64, 64, 128, 128] + - [221, 1149.0] + - - [128, 128, 1, 1280, 128, 128, 1280, 1280] + - [229, 1110.0] + - - [128, 256, 1, 256, 128, 128, 256, 256] + - [229, 1023.0] + - - [64, 128, 1, 256, 64, 64, 256, 256] + - [259, 446.0] + - - [1024, 4, 1, 1280, 1024, 1024, 1280, 1280] + - [224, 272.0] + - - [704, 64, 1, 256, 704, 704, 256, 256] + - [253, 1475.0] + - - [128, 64, 1, 3328, 128, 128, 3328, 3328] + - [199, 690.0] + - - [448, 64, 1, 256, 448, 448, 256, 256] + - [206, 958.0] + - - [4, 256, 1, 128, 4, 4, 128, 128] + - [205, 25.0] + - - [1024, 64, 1, 256, 1024, 1024, 256, 256] + - [209, 1702.0] + - - [4, 4, 1, 3328, 4, 4, 3328, 3328] + - [204, 2.0] + - - [704, 4, 1, 256, 704, 704, 256, 256] + - [206, 100.0] + - - [128, 4, 1, 3328, 128, 128, 3328, 3328] + - [204, 46.0] + - - [64, 1024, 1, 3328, 64, 64, 3328, 3328] + - [256, 2781.0] + - - [448, 4, 1, 3328, 448, 448, 3328, 3328] + - [257, 150.0] + - - [4, 128, 1, 3328, 4, 4, 3328, 3328] + - [217, 47.0] + - - [704, 4, 1, 3328, 704, 704, 3328, 3328] + - [224, 223.0] + - - [448, 128, 1, 1280, 448, 448, 1280, 1280] + - [253, 2301.0] + - - [1024, 64, 1, 3328, 1024, 1024, 3328, 3328] + - [211, 2743.0] + - - [4, 1024, 1, 128, 4, 4, 128, 128] + - [206, 89.0] + - - [64, 256, 1, 3328, 64, 64, 3328, 3328] + - [218, 1353.0] + - - [128, 256, 1, 128, 128, 128, 128, 128] + - [229, 683.0] + - - [128, 4, 1, 256, 128, 128, 256, 256] + - [246, 25.0] + - - [256, 256, 1, 1280, 256, 256, 1280, 1280] + - [255, 2533.0] + - - [256, 128, 1, 3328, 256, 256, 3328, 3328] + - [193, 1784.0] + - - [448, 4, 1, 128, 448, 448, 128, 128] + - [210, 40.0] + - - [4, 256, 1, 3328, 4, 4, 3328, 3328] + - [204, 92.0] + - - [4, 128, 1, 128, 4, 4, 128, 128] + - [195, 20.0] + - - [4, 256, 1, 1280, 4, 4, 1280, 1280] + - [217, 75.0] + - - [64, 4, 1, 3328, 64, 64, 3328, 3328] + - [199, 23.0] + - - [4, 64, 1, 3328, 4, 4, 3328, 3328] + - [204, 23.0] + - - [4, 1024, 1, 256, 4, 4, 256, 256] + - [206, 152.0] + - - [64, 256, 1, 256, 64, 64, 256, 256] + - [229, 613.0] + - - [4, 64, 1, 256, 4, 4, 256, 256] + - [193, 9.0] + - - [128, 448, 1, 128, 128, 128, 128, 128] + - [253, 1079.0] + - - [64, 448, 1, 128, 64, 64, 128, 128] + - [206, 646.0] + - - [64, 704, 1, 3328, 64, 64, 3328, 3328] + - [253, 2123.0] + - - [128, 448, 1, 3328, 128, 128, 3328, 3328] + - [253, 2568.0] + - - [4, 448, 1, 256, 4, 4, 256, 256] + - [206, 66.0] + - - [4, 128, 1, 1280, 4, 4, 1280, 1280] + - [206, 42.0] + - - [128, 64, 1, 128, 128, 128, 128, 128] + - [194, 230.0] + - - [64, 64, 1, 128, 64, 64, 128, 128] + - [254, 105.0] + - - [64, 4, 1, 1280, 64, 64, 1280, 1280] + - [228, 22.0] + - - [256, 64, 1, 3328, 256, 256, 3328, 3328] + - [195, 1325.0] + - - [128, 128, 1, 256, 128, 128, 256, 256] + - [214, 752.0] + - - [256, 64, 1, 3136, 256, 256, 3136, 3136] + - [193, 1307.0] + - - [64, 200, 1, 1024, 64, 64, 1024, 1024] + - [206, 930.0] + - - [32, 512, 1, 1024, 32, 32, 1024, 1024] + - [252, 1018.0] + - - [1, 512, 1, 1024, 1, 1, 1024, 1024] + - [206, 35.0] + - - [128, 512, 1, 2048, 128, 128, 2048, 2048] + - [209, 2744.0] + - - [64, 256, 1, 1024, 64, 64, 1024, 1024] + - [252, 1055.0] + - - [1, 200, 1, 1024, 1, 1, 1024, 1024] + - [204, 16.0] + - - [128, 512, 1, 1024, 128, 128, 1024, 1024] + - [232, 2500.0] + - - [32, 256, 1, 2048, 32, 32, 2048, 2048] + - [229, 704.0] + - - [32, 256, 1, 512, 32, 32, 512, 512] + - [257, 573.0] + - - [256, 200, 1, 1024, 256, 256, 1024, 1024] + - [207, 2003.0] + - - [1, 256, 1, 2048, 1, 1, 2048, 2048] + - [206, 22.0] + - - [32, 200, 1, 2048, 32, 32, 2048, 2048] + - [252, 547.0] + - - [128, 200, 1, 1024, 128, 128, 1024, 1024] + - [229, 1372.0] + - - [128, 256, 1, 2048, 128, 128, 2048, 2048] + - [252, 1711.0] + - - [64, 1024, 1, 1024, 64, 64, 1024, 1024] + - [232, 2455.0] + - - [1, 512, 1, 2048, 1, 1, 2048, 2048] + - [193, 41.0] + - - [128, 256, 1, 512, 128, 128, 512, 512] + - [231, 1422.0] + - - [128, 200, 1, 2048, 128, 128, 2048, 2048] + - [193, 1645.0] + - - [64, 200, 1, 512, 64, 64, 512, 512] + - [252, 658.0] + - - [1, 256, 1, 1024, 1, 1, 1024, 1024] + - [193, 17.0] + - - [1, 1024, 1, 1024, 1, 1, 1024, 1024] + - [206, 62.0] + - - [256, 256, 1, 2048, 256, 256, 2048, 2048] + - [256, 2619.0] + - - [128, 256, 1, 1024, 128, 128, 1024, 1024] + - [253, 1698.0] + - - [1, 256, 1, 4096, 1, 1, 4096, 4096] + - [217, 25.0] + - - [32, 512, 1, 512, 32, 32, 512, 512] + - [231, 1033.0] + - - [64, 200, 1, 2048, 64, 64, 2048, 2048] + - [252, 1074.0] + - - [1, 200, 1, 2048, 1, 1, 2048, 2048] + - [212, 18.0] + - - [1, 512, 1, 4096, 1, 1, 4096, 4096] + - [206, 43.0] + - - [256, 256, 1, 1024, 256, 256, 1024, 1024] + - [255, 2455.0] + - - [64, 256, 1, 2048, 64, 64, 2048, 2048] + - [252, 1228.0] + - - [1, 200, 1, 4096, 1, 1, 4096, 4096] + - [204, 18.0] + - - [32, 256, 1, 1024, 32, 32, 1024, 1024] + - [229, 568.0] + - - [32, 200, 1, 1024, 32, 32, 1024, 1024] + - [229, 436.0] + - - [32, 512, 1, 2048, 32, 32, 2048, 2048] + - [252, 1187.0] + - - [128, 200, 1, 512, 128, 128, 512, 512] + - [229, 1146.0] + - - [64, 1024, 1, 2048, 64, 64, 2048, 2048] + - [232, 2662.0] + - - [1, 1024, 1, 2048, 1, 1, 2048, 2048] + - [206, 69.0] + - - [32, 1024, 1, 512, 32, 32, 512, 512] + - [220, 1325.0] + - - [64, 1024, 1, 512, 64, 64, 512, 512] + - [211, 2159.0] + - - [1, 1024, 1, 4096, 1, 1, 4096, 4096] + - [252, 75.0] + - - [64, 256, 1, 512, 64, 64, 512, 512] + - [252, 858.0] + - - [256, 200, 1, 512, 256, 256, 512, 512] + - [230, 1713.0] + - - [32, 1024, 1, 1024, 32, 32, 1024, 1024] + - [252, 1552.0] + - - [32, 200, 1, 512, 32, 32, 512, 512] + - [229, 353.0] + - - [256, 256, 1, 512, 256, 256, 512, 512] + - [232, 2135.0] + - - [128, 512, 1, 512, 128, 128, 512, 512] + - [211, 2121.0] + - - [256, 200, 1, 2048, 256, 256, 2048, 2048] + - [230, 2187.0] + - - [64, 512, 1, 2048, 64, 64, 2048, 2048] + - [229, 1708.0] + - - [32, 1024, 1, 2048, 32, 32, 2048, 2048] + - [252, 1712.0] + - - [256, 64, 1, 1225, 256, 256, 1225, 1225] + - [224, 1000.0] + - - [384, 64, 1, 1225, 384, 384, 1225, 1225] + - [206, 1368.0] + - - [288, 64, 1, 1225, 288, 288, 1225, 1225] + - [239, 1158.0] + - - [384, 96, 1, 1225, 384, 384, 1225, 1225] + - [218, 1681.0] + - - [11, 11, 5456, 64, 11, 11, 64, 64] + - [242, 968.0] + - - [14, 14, 4368, 64, 14, 14, 64, 64] + - [206, 1363.0] + - - [23, 23, 2720, 64, 23, 23, 64, 64] + - [230, 1455.0] + - - [13, 13, 4672, 64, 13, 13, 64, 64] + - [242, 1225.0] + - - [29, 29, 2176, 64, 29, 29, 64, 64] + - [200, 1882.0] + - - [12, 12, 5040, 64, 12, 12, 64, 64] + - [252, 1095.0] + - - [27, 27, 2336, 64, 27, 27, 64, 64] + - [258, 1649.0] + - - [10, 10, 5952, 64, 10, 10, 64, 64] + - [229, 833.0] + - - [7, 7, 8192, 64, 7, 7, 64, 64] + - [229, 413.0] + - - [16, 16, 3840, 64, 16, 16, 64, 64] + - [257, 1574.0] + - - [17, 17, 3632, 64, 17, 17, 64, 64] + - [230, 954.0] + - - [9, 9, 6544, 64, 9, 9, 64, 64] + - [206, 678.0] + - - [8, 8, 7280, 64, 8, 8, 64, 64] + - [206, 566.0] + - - [21, 21, 2976, 64, 21, 21, 64, 64] + - [243, 1369.0] + - - [19, 19, 3264, 64, 19, 19, 64, 64] + - [230, 1180.0] + - - [25, 25, 2512, 64, 25, 25, 64, 64] + - [243, 1441.0] + - - [18, 18, 3440, 64, 18, 18, 64, 64] + - [253, 1084.0] + - - [15, 15, 4096, 64, 15, 15, 64, 64] + - [235, 1446.0] + - - [2, 16, 1, 768, 2, 2, 768, 768] + - [193, 2.0] + - - [2, 8, 1, 768, 2, 2, 768, 768] + - [193, 1.0] + - - [2, 64, 1, 768, 2, 2, 768, 768] + - [193, 8.0] + - - [256, 128, 1, 784, 256, 256, 784, 784] + - [193, 1495.0] + - - [192, 48, 1, 1225, 192, 192, 1225, 1225] + - [235, 634.0] + - - [64, 256, 1, 3136, 64, 64, 3136, 3136] + - [242, 1330.0] + - - [512, 144, 1, 196, 512, 512, 196, 196] + - [232, 1602.0] + - - [400, 32, 1, 784, 400, 400, 784, 784] + - [248, 773.0] + - - [832, 48, 1, 49, 832, 832, 49, 49] + - [220, 422.0] + - - [192, 32, 1, 784, 192, 192, 784, 784] + - [212, 384.0] + - - [288, 48, 1, 1225, 288, 288, 1225, 1225] + - [248, 920.0] + - - [512, 112, 1, 196, 512, 512, 196, 196] + - [232, 1271.0] + - - [528, 32, 1, 196, 528, 528, 196, 196] + - [212, 484.0] + - - [576, 64, 1, 3136, 576, 576, 3136, 3136] + - [196, 2042.0] + - - [480, 64, 1, 196, 480, 480, 196, 196] + - [252, 827.0] + - - [192, 64, 1, 784, 192, 192, 784, 784] + - [248, 743.0] + - - [192, 32, 1, 1225, 192, 192, 1225, 1225] + - [235, 423.0] + - - [400, 48, 1, 196, 400, 400, 196, 196] + - [214, 756.0] + - - [480, 16, 1, 196, 480, 480, 196, 196] + - [226, 347.0] + - - [512, 64, 1, 196, 512, 512, 196, 196] + - [218, 1264.0] + - - [800, 64, 1, 196, 800, 800, 196, 196] + - [242, 1178.0] + - - [512, 128, 1, 784, 512, 512, 784, 784] + - [256, 2397.0] + - - [256, 64, 1, 784, 256, 256, 784, 784] + - [248, 900.0] + - - [256, 48, 1, 1225, 256, 256, 1225, 1225] + - [212, 816.0] + - - [192, 16, 1, 784, 192, 192, 784, 784] + - [212, 196.0] + - - [576, 96, 1, 1225, 576, 576, 1225, 1225] + - [196, 2142.0] + - - [512, 128, 1, 196, 512, 512, 196, 196] + - [209, 1437.0] + - - [192, 96, 1, 784, 192, 192, 784, 784] + - [229, 1049.0] + - - [192, 64, 1, 1225, 192, 192, 1225, 1225] + - [248, 819.0] + - - [512, 32, 1, 196, 512, 512, 196, 196] + - [224, 457.0] + - - [528, 128, 1, 196, 528, 528, 196, 196] + - [209, 1469.0] + - - [128, 512, 1, 784, 128, 128, 784, 784] + - [256, 2401.0] + - - [64, 64, 1, 3136, 64, 64, 3136, 3136] + - [204, 371.0] + - - [256, 32, 1, 784, 256, 256, 784, 784] + - [195, 554.0] + - - [480, 96, 1, 196, 480, 480, 196, 196] + - [193, 1086.0] + - - [1024, 32, 1, 1001, 1024, 1024, 1001, 1001] + - [193, 1460.0] + - - [18, 18, 648, 64, 18, 18, 64, 64] + - [243, 924.0] + - - [7, 7, 736, 64, 7, 7, 64, 64] + - [229, 330.0] + - - [8, 8, 264, 64, 8, 8, 64, 64] + - [193, 304.0] + - - [9, 9, 416, 64, 9, 9, 64, 64] + - [218, 449.0] + - - [10, 10, 448, 64, 10, 10, 64, 64] + - [235, 478.0] + - - [11, 11, 568, 64, 11, 11, 64, 64] + - [212, 611.0] + - - [12, 12, 480, 64, 12, 12, 64, 64] + - [229, 587.0] + - - [12, 12, 2520, 64, 12, 12, 64, 64] + - [229, 987.0] + - - [13, 13, 576, 64, 13, 13, 64, 64] + - [229, 710.0] + - - [13, 13, 2336, 64, 13, 13, 64, 64] + - [224, 1119.0] + - - [14, 14, 704, 64, 14, 14, 64, 64] + - [212, 824.0] + - - [14, 14, 2184, 64, 14, 14, 64, 64] + - [229, 1232.0] + - - [15, 15, 688, 64, 15, 15, 64, 64] + - [224, 880.0] + - - [15, 15, 2048, 64, 15, 15, 64, 64] + - [248, 1289.0] + - - [16, 16, 712, 64, 16, 16, 64, 64] + - [224, 1068.0] + - - [16, 16, 1920, 64, 16, 16, 64, 64] + - [248, 1409.0] + - - [17, 17, 688, 64, 17, 17, 64, 64] + - [207, 710.0] + - - [17, 17, 1816, 64, 17, 17, 64, 64] + - [207, 892.0] + - - [18, 18, 1720, 64, 18, 18, 64, 64] + - [253, 1008.0] + - - [19, 19, 680, 64, 19, 19, 64, 64] + - [219, 874.0] + - - [19, 19, 1632, 64, 19, 19, 64, 64] + - [253, 1077.0] + - - [21, 21, 1472, 64, 21, 21, 64, 64] + - [207, 1236.0] + - - [21, 21, 1488, 64, 21, 21, 64, 64] + - [194, 1269.0] + - - [23, 23, 64, 64, 23, 23, 64, 64] + - [222, 372.0] + - - [23, 23, 1360, 64, 23, 23, 64, 64] + - [230, 1432.0] + - - [25, 25, 176, 64, 25, 25, 64, 64] + - [246, 1029.0] + - - [25, 25, 1256, 64, 25, 25, 64, 64] + - [194, 1388.0] + - - [26, 26, 56, 64, 26, 26, 64, 64] + - [222, 767.0] + - - [26, 27, 56, 64, 26, 26, 64, 64] + - [212, 673.0] + - - [27, 27, 56, 64, 27, 27, 64, 64] + - [195, 487.0] + - - [27, 27, 1168, 64, 27, 27, 64, 64] + - [225, 1572.0] + - - [29, 29, 136, 64, 29, 29, 64, 64] + - [243, 953.0] + - - [29, 29, 1088, 64, 29, 29, 64, 64] + - [236, 1739.0] + - - [256, 1, 1, 4, 256, 256, 4, 4] + - [202, 0.26] + - - [2, 1, 1, 1024, 2, 2, 1024, 1024] + - [193, 0.16] + - - [1024, 1, 1, 1024, 1024, 1024, 1024, 1024] + - [212, 61.0] + - - [2, 6, 1, 1024, 2, 2, 1024, 1024] + - [193, 1.0] + - - [2, 8, 1, 1024, 2, 2, 1024, 1024] + - [193, 1.0] + - - [14, 14, 1, 64, 14, 14, 64, 64] + - [195, 3.0] + - - [15, 14, 1, 64, 15, 15, 64, 64] + - [193, 3.0] + - - [15, 15, 1, 64, 15, 15, 64, 64] + - [195, 6.0] + - - [17, 15, 1, 64, 17, 17, 64, 64] + - [193, 3.0] + - - [17, 17, 1, 64, 17, 17, 64, 64] + - [193, 4.0] + - - [30, 30, 1, 64, 30, 30, 64, 64] + - [233, 21.0] + - - [30, 31, 1, 64, 30, 30, 64, 64] + - [220, 22.0] + - - [31, 31, 1, 64, 31, 31, 64, 64] + - [232, 16.0] + - - [1024, 32, 1, 1024, 1024, 1024, 1024, 1024] + - [229, 1459.0] + - - [2, 32, 1, 1024, 2, 2, 1024, 1024] + - [193, 4.0] + - - [2, 4, 1, 1024, 2, 2, 1024, 1024] + - [193, 1.0] + - - [64, 512, 1, 512, 64, 64, 512, 512] + - [252, 1327.0] + - - [64, 960, 1, 1024, 64, 64, 1024, 1024] + - [209, 2296.0] + - - [200, 1, 1, 1024, 200, 200, 1024, 1024] + - [199, 14.0] + - - [512, 1, 1, 2048, 512, 512, 2048, 2048] + - [212, 40.0] + - - [64, 512, 1, 1024, 64, 64, 1024, 1024] + - [229, 1548.0] + - - [3, 3, 512, 64, 3, 3, 64, 64] + - [193, 38.0] + - - [5, 5, 512, 64, 5, 5, 64, 64] + - [206, 107.0] + - - [9, 9, 512, 64, 9, 9, 64, 64] + - [206, 474.0] + - - [128, 256, 1, 1444, 128, 128, 1444, 1444] + - [242, 1799.0] + - - [256, 128, 1, 25, 256, 256, 25, 25] + - [244, 190.0] + - - [256, 128, 1, 9, 256, 256, 9, 9] + - [197, 73.0] + - - [256, 256, 1, 1444, 256, 256, 1444, 1444] + - [255, 2652.0] + - - [512, 128, 1, 100, 512, 512, 100, 100] + - [232, 1489.0] + - - [64, 128, 1, 1444, 64, 64, 1444, 1444] + - [237, 648.0] + - - [1024, 77, 1, 1024, 1024, 1024, 1024, 1024] + - [209, 2205.0] + - - [2, 10, 1, 1024, 2, 2, 1024, 1024] + - [224, 2.0] + - - [1024, 10, 1, 1024, 1024, 1024, 1024, 1024] + - [229, 583.0] + - - [2, 39, 1, 1024, 2, 2, 1024, 1024] + - [204, 6.0] + - - [1024, 39, 1, 1024, 1024, 1024, 1024, 1024] + - [229, 1588.0] + - - [2, 40, 1, 1024, 2, 2, 1024, 1024] + - [195, 6.0] + - - [1024, 40, 1, 1024, 1024, 1024, 1024, 1024] + - [211, 1628.0] + - - [2, 41, 1, 1024, 2, 2, 1024, 1024] + - [217, 7.0] + - - [1024, 41, 1, 1024, 1024, 1024, 1024, 1024] + - [234, 1533.0] + - - [2, 5, 1, 1024, 2, 2, 1024, 1024] + - [193, 1.0] + - - [1024, 5, 1, 1024, 1024, 1024, 1024, 1024] + - [235, 297.0] + - - [1024, 8, 1, 1024, 1024, 1024, 1024, 1024] + - [212, 464.0] + - - [2, 9, 1, 1024, 2, 2, 1024, 1024] + - [193, 1.0] + - - [1024, 9, 1, 1024, 1024, 1024, 1024, 1024] + - [212, 524.0] + - - [4, 4, 32768, 64, 4, 4, 64, 64] + - [206, 143.0] + - - [4, 4, 38400, 64, 4, 4, 64, 64] + - [229, 139.0] + - - [14, 14, 10880, 64, 14, 14, 64, 64] + - [261, 566.0] + - - [15, 14, 10880, 64, 15, 15, 64, 64] + - [261, 540.0] + - - [15, 15, 7680, 64, 15, 15, 64, 64] + - [257, 1386.0] + - - [15, 15, 10880, 64, 15, 15, 64, 64] + - [235, 1594.0] + - - [17, 15, 7680, 64, 17, 17, 64, 64] + - [208, 747.0] + - - [17, 17, 6144, 64, 17, 17, 64, 64] + - [253, 999.0] + - - [17, 17, 7680, 64, 17, 17, 64, 64] + - [207, 703.0] + - - [21, 17, 6144, 64, 21, 21, 64, 64] + - [219, 942.0] + - - [21, 21, 6144, 64, 21, 21, 64, 64] + - [213, 860.0] + - - [24, 24, 4736, 64, 24, 24, 64, 64] + - [253, 1691.0] + - - [30, 30, 2048, 64, 30, 30, 64, 64] + - [200, 2014.0] + - - [30, 31, 2048, 64, 30, 30, 64, 64] + - [200, 2072.0] + - - [31, 31, 2048, 64, 31, 31, 64, 64] + - [200, 2111.0] + - - [34, 24, 4736, 64, 34, 34, 64, 64] + - [209, 1609.0] + - - [128, 128, 1, 64, 128, 128, 64, 64] + - [195, 328.0] + - - [2, 1024, 1, 1024, 2, 2, 1024, 1024] + - [206, 122.0] + - - [5, 5, 1, 64, 5, 5, 64, 64] + - [195, 0.31] + - - [33, 33, 1, 32, 33, 33, 32, 32] + - [195, 8.0] + - - [5, 5, 960, 64, 5, 5, 64, 64] + - [206, 142.0] + - - [27, 27, 32768, 128, 27, 27, 128, 128] + - [207, 845.0] + - - [960, 1, 1, 2048, 960, 960, 2048, 2048] + - [229, 57.0] + - - [2, 2, 1, 2048, 2, 2, 2048, 2048] + - [199, 0.33] + - - [1024, 16, 1, 1024, 1024, 1024, 1024, 1024] + - [206, 936.0] + - - [2, 16, 1, 1024, 2, 2, 1024, 1024] + - [193, 2.0] + - - [2, 4, 1, 2560, 2, 2, 2560, 2560] + - [193, 1.0] + - - [1024, 64, 1, 1024, 1024, 1024, 1024, 1024] + - [234, 2346.0] + - - [2, 64, 1, 1024, 2, 2, 1024, 1024] + - [193, 9.0] + - - [864, 1, 1, 256, 864, 864, 256, 256] + - [212, 31.0] + - - [2, 80, 1, 1024, 2, 2, 1024, 1024] + - [199, 11.0] + - - [1024, 82, 1, 1024, 1024, 1024, 1024, 1024] + - [207, 2239.0] + - - [2, 82, 1, 1024, 2, 2, 1024, 1024] + - [193, 11.0] + - - [1024, 12, 1, 1024, 1024, 1024, 1024, 1024] + - [229, 703.0] + - - [2, 12, 1, 1024, 2, 2, 1024, 1024] + - [193, 2.0] + - - [24, 24, 6816, 64, 24, 24, 64, 64] + - [207, 1518.0] + - - [26, 26, 6272, 64, 26, 26, 64, 64] + - [258, 1651.0] + - - [256, 128, 1, 3136, 256, 256, 3136, 3136] + - [194, 1883.0] + - - [2, 128, 1, 1024, 2, 2, 1024, 1024] + - [193, 16.0] + - - [2, 96, 1, 1024, 2, 2, 1024, 1024] + - [193, 13.0] + - - [768, 12, 1, 768, 768, 768, 768, 768] + - [257, 521.0] + - - [768, 4, 1, 768, 768, 768, 768, 768] + - [235, 176.0] + - - [256, 80, 1, 784, 256, 256, 784, 784] + - [199, 1126.0] + - - [256, 12, 1, 3800, 256, 256, 3800, 3800] + - [204, 282.0] + - - [256, 3, 1, 3800, 256, 256, 3800, 3800] + - [204, 70.0] + - - [256, 12, 1, 950, 256, 256, 950, 950] + - [212, 204.0] + - - [256, 3, 1, 950, 256, 256, 950, 950] + - [204, 51.0] + - - [256, 12, 1, 3220, 256, 256, 3220, 3220] + - [204, 275.0] + - - [256, 3, 1, 3220, 256, 256, 3220, 3220] + - [204, 69.0] + - - [256, 12, 1, 3072, 256, 256, 3072, 3072] + - [217, 272.0] + - - [256, 3, 1, 3072, 256, 256, 3072, 3072] + - [212, 69.0] + - - [256, 12, 1, 850, 256, 256, 850, 850] + - [248, 229.0] + - - [256, 3, 1, 850, 256, 256, 850, 850] + - [199, 50.0] + - - [256, 12, 1, 2852, 256, 256, 2852, 2852] + - [250, 272.0] + - - [256, 3, 1, 2852, 256, 256, 2852, 2852] + - [204, 67.0] + - - [256, 12, 1, 805, 256, 256, 805, 805] + - [212, 194.0] + - - [256, 3, 1, 805, 256, 256, 805, 805] + - [199, 48.0] + - - [256, 3, 1, 864, 256, 256, 864, 864] + - [248, 52.0] + - - [256, 3, 1, 768, 256, 256, 768, 768] + - [199, 56.0] + - - [256, 12, 1, 864, 256, 256, 864, 864] + - [257, 201.0] + - - [256, 12, 1, 768, 256, 256, 768, 768] + - [212, 194.0] + - - [256, 12, 1, 2904, 256, 256, 2904, 2904] + - [204, 269.0] + - - [256, 3, 1, 2904, 256, 256, 2904, 2904] + - [204, 71.0] + - - [256, 3, 1, 713, 256, 256, 713, 713] + - [212, 46.0] + - - [256, 12, 1, 888, 256, 256, 888, 888] + - [212, 203.0] + - - [256, 3, 1, 888, 256, 256, 888, 888] + - [212, 50.0] + - - [256, 12, 1, 713, 256, 256, 713, 713] + - [249, 197.0] + - - [256, 3, 1, 660, 256, 256, 660, 660] + - [224, 57.0] + - - [256, 3, 1, 672, 256, 256, 672, 672] + - [212, 46.0] + - - [256, 12, 1, 660, 256, 256, 660, 660] + - [212, 179.0] + - - [256, 3, 1, 726, 256, 256, 726, 726] + - [212, 47.0] + - - [256, 12, 1, 672, 256, 256, 672, 672] + - [212, 184.0] + - - [256, 3, 1, 247, 256, 256, 247, 247] + - [199, 27.0] + - - [256, 12, 1, 726, 256, 256, 726, 726] + - [235, 188.0] + - - [256, 3, 1, 216, 256, 256, 216, 216] + - [235, 25.0] + - - [256, 3, 1, 3400, 256, 256, 3400, 3400] + - [204, 69.0] + - - [256, 3, 1, 221, 256, 256, 221, 221] + - [212, 25.0] + - - [256, 12, 1, 3552, 256, 256, 3552, 3552] + - [204, 281.0] + - - [256, 3, 1, 3456, 256, 256, 3456, 3456] + - [204, 70.0] + - - [256, 3, 1, 204, 256, 256, 204, 204] + - [199, 23.0] + - - [256, 12, 1, 3400, 256, 256, 3400, 3400] + - [204, 277.0] + - - [256, 12, 1, 3456, 256, 256, 3456, 3456] + - [204, 279.0] + - - [256, 12, 1, 221, 256, 256, 221, 221] + - [257, 101.0] + - - [256, 3, 1, 3552, 256, 256, 3552, 3552] + - [204, 70.0] + - - [256, 3, 1, 228, 256, 256, 228, 228] + - [193, 33.0] + - - [256, 3, 1, 234, 256, 256, 234, 234] + - [199, 26.0] + - - [256, 12, 1, 234, 256, 256, 234, 234] + - [212, 105.0] + - - [81, 1024, 1, 1024, 81, 81, 1024, 1024] + - [253, 2200.0] + - - [81, 1000, 1, 1024, 81, 81, 1024, 1024] + - [253, 2156.0] + - - [256, 12, 1, 228, 256, 256, 228, 228] + - [248, 101.0] + - - [256, 3, 1, 252, 256, 256, 252, 252] + - [212, 28.0] + - - [256, 12, 1, 252, 256, 256, 252, 252] + - [201, 109.0] + - - [256, 12, 1, 247, 256, 256, 247, 247] + - [195, 117.0] + - - [1024, 6, 1, 2, 1024, 1024, 2, 2] + - [193, 3.0] + - - [2, 8, 1, 2048, 2, 2, 2048, 2048] + - [193, 1.0] + - - [2, 20, 1, 1024, 2, 2, 1024, 1024] + - [193, 3.0] + - - [2, 2, 1, 2560, 2, 2, 2560, 2560] + - [235, 0.35] +- null +- null +- DeviceEfficiency +... diff --git a/library/src/blas3/Tensile/Logic/asm_full/navi22_Cijk_Alik_Bljk_SB_GB.yaml b/library/src/blas3/Tensile/Logic/asm_full/navi22_Cijk_Alik_Bljk_SB_GB.yaml new file mode 100644 index 000000000..9147d7a7c --- /dev/null +++ b/library/src/blas3/Tensile/Logic/asm_full/navi22_Cijk_Alik_Bljk_SB_GB.yaml @@ -0,0 +1,78982 @@ +--- +- {MinimumRequiredVersion: 4.28.0} +- navi22 +- gfx1031 +- [Device 73df] +- AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] +- - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 2 + LVCB: 2 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 0 + SolutionNameMin: Cijk_Alik_Bljk_SB_GB_MT128x64x8_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 2 + LVCB: 2 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 1 + SolutionNameMin: Cijk_Alik_Bljk_SB_GB_MT128x128x8_SN_SU0_SUM0_TT8_16_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 128 + LSPB: 128 + LVCA: 2 + LVCB: 2 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 2 + SolutionNameMin: Cijk_Alik_Bljk_SB_GB_MT128x128x8_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 128 + LSPB: 128 + LVCA: 2 + LVCB: 2 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 256 + MacroTileA: 128 + MacroTileB: 256 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 3 + SolutionNameMin: Cijk_Alik_Bljk_SB_GB_MT128x256x8_SN_SU0_SUM0_TT8_16_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 4 + SolutionNameMin: Cijk_Alik_Bljk_SB_GB_MT128x64x16_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 5 + SolutionNameMin: Cijk_Alik_Bljk_SB_GB_MT128x128x16_SN_SU0_SUM0_TT8_16_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 6 + SolutionNameMin: Cijk_Alik_Bljk_SB_GB_MT128x128x16_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 256 + MacroTileA: 128 + MacroTileB: 256 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 7 + SolutionNameMin: Cijk_Alik_Bljk_SB_GB_MT128x256x16_SN_SU0_SUM0_TT8_16_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 8 + SolutionNameMin: Cijk_Alik_Bljk_SB_GB_MT128x64x32_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 9 + SolutionNameMin: Cijk_Alik_Bljk_SB_GB_MT128x128x32_SN_SU0_SUM0_TT8_16_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 10 + SolutionNameMin: Cijk_Alik_Bljk_SB_GB_MT128x128x32_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 2 + LVCB: 2 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 11 + SolutionNameMin: Cijk_Alik_Bljk_SB_GB_MT128x64x8_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 2 + LVCB: 2 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 12 + SolutionNameMin: Cijk_Alik_Bljk_SB_GB_MT128x128x8_SN_SU32_SUM3_TT8_16_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 128 + LSPB: 128 + LVCA: 2 + LVCB: 2 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 13 + SolutionNameMin: Cijk_Alik_Bljk_SB_GB_MT128x128x8_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 128 + LSPB: 128 + LVCA: 2 + LVCB: 2 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 256 + MacroTileA: 128 + MacroTileB: 256 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 14 + SolutionNameMin: Cijk_Alik_Bljk_SB_GB_MT128x256x8_SN_SU32_SUM3_TT8_16_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 15 + SolutionNameMin: Cijk_Alik_Bljk_SB_GB_MT128x64x16_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 16 + SolutionNameMin: Cijk_Alik_Bljk_SB_GB_MT128x128x16_SN_SU32_SUM3_TT8_16_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 17 + SolutionNameMin: Cijk_Alik_Bljk_SB_GB_MT128x128x16_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 256 + MacroTileA: 128 + MacroTileB: 256 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 18 + SolutionNameMin: Cijk_Alik_Bljk_SB_GB_MT128x256x16_SN_SU32_SUM3_TT8_16_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 19 + SolutionNameMin: Cijk_Alik_Bljk_SB_GB_MT128x64x32_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 20 + SolutionNameMin: Cijk_Alik_Bljk_SB_GB_MT128x128x32_SN_SU32_SUM3_TT8_16_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 21 + SolutionNameMin: Cijk_Alik_Bljk_SB_GB_MT128x128x32_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 2 + LVCB: 2 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 22 + SolutionNameMin: Cijk_Alik_Bljk_SB_GB_MT128x64x8_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 2 + LVCB: 2 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 23 + SolutionNameMin: Cijk_Alik_Bljk_SB_GB_MT128x128x8_SN_SU0_SUM0_TT8_16_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 128 + LSPB: 128 + LVCA: 2 + LVCB: 2 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 24 + SolutionNameMin: Cijk_Alik_Bljk_SB_GB_MT128x128x8_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 128 + LSPB: 128 + LVCA: 2 + LVCB: 2 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 256 + MacroTileA: 128 + MacroTileB: 256 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 25 + SolutionNameMin: Cijk_Alik_Bljk_SB_GB_MT128x256x8_SN_SU0_SUM0_TT8_16_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 26 + SolutionNameMin: Cijk_Alik_Bljk_SB_GB_MT128x64x16_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 27 + SolutionNameMin: Cijk_Alik_Bljk_SB_GB_MT128x128x16_SN_SU0_SUM0_TT8_16_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 28 + SolutionNameMin: Cijk_Alik_Bljk_SB_GB_MT128x128x16_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 256 + MacroTileA: 128 + MacroTileB: 256 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 29 + SolutionNameMin: Cijk_Alik_Bljk_SB_GB_MT128x256x16_SN_SU0_SUM0_TT8_16_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 30 + SolutionNameMin: Cijk_Alik_Bljk_SB_GB_MT128x64x32_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 31 + SolutionNameMin: Cijk_Alik_Bljk_SB_GB_MT128x128x32_SN_SU0_SUM0_TT8_16_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 32 + SolutionNameMin: Cijk_Alik_Bljk_SB_GB_MT128x128x32_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 2 + LVCB: 2 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 33 + SolutionNameMin: Cijk_Alik_Bljk_SB_GB_MT128x64x8_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 2 + LVCB: 2 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 34 + SolutionNameMin: Cijk_Alik_Bljk_SB_GB_MT128x128x8_SN_SU32_SUM3_TT8_16_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 128 + LSPB: 128 + LVCA: 2 + LVCB: 2 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 35 + SolutionNameMin: Cijk_Alik_Bljk_SB_GB_MT128x128x8_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 128 + LSPB: 128 + LVCA: 2 + LVCB: 2 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 256 + MacroTileA: 128 + MacroTileB: 256 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 36 + SolutionNameMin: Cijk_Alik_Bljk_SB_GB_MT128x256x8_SN_SU32_SUM3_TT8_16_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 37 + SolutionNameMin: Cijk_Alik_Bljk_SB_GB_MT128x64x16_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 38 + SolutionNameMin: Cijk_Alik_Bljk_SB_GB_MT128x128x16_SN_SU32_SUM3_TT8_16_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 39 + SolutionNameMin: Cijk_Alik_Bljk_SB_GB_MT128x128x16_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 256 + MacroTileA: 128 + MacroTileB: 256 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 40 + SolutionNameMin: Cijk_Alik_Bljk_SB_GB_MT128x256x16_SN_SU32_SUM3_TT8_16_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 41 + SolutionNameMin: Cijk_Alik_Bljk_SB_GB_MT128x64x32_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 42 + SolutionNameMin: Cijk_Alik_Bljk_SB_GB_MT128x128x32_SN_SU32_SUM3_TT8_16_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 43 + SolutionNameMin: Cijk_Alik_Bljk_SB_GB_MT128x128x32_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 2 + LVCB: 2 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 44 + SolutionNameMin: Cijk_Alik_Bljk_SB_GB_MT128x64x8_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 2 + LVCB: 2 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 45 + SolutionNameMin: Cijk_Alik_Bljk_SB_GB_MT128x128x8_SN_SU0_SUM0_TT8_16_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 128 + LSPB: 128 + LVCA: 2 + LVCB: 2 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 46 + SolutionNameMin: Cijk_Alik_Bljk_SB_GB_MT128x128x8_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 128 + LSPB: 128 + LVCA: 2 + LVCB: 2 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 256 + MacroTileA: 128 + MacroTileB: 256 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 47 + SolutionNameMin: Cijk_Alik_Bljk_SB_GB_MT128x256x8_SN_SU0_SUM0_TT8_16_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 48 + SolutionNameMin: Cijk_Alik_Bljk_SB_GB_MT128x64x16_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 49 + SolutionNameMin: Cijk_Alik_Bljk_SB_GB_MT128x128x16_SN_SU0_SUM0_TT8_16_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 50 + SolutionNameMin: Cijk_Alik_Bljk_SB_GB_MT128x128x16_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 256 + MacroTileA: 128 + MacroTileB: 256 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 51 + SolutionNameMin: Cijk_Alik_Bljk_SB_GB_MT128x256x16_SN_SU0_SUM0_TT8_16_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 52 + SolutionNameMin: Cijk_Alik_Bljk_SB_GB_MT128x128x32_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 2 + LVCB: 2 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 53 + SolutionNameMin: Cijk_Alik_Bljk_SB_GB_MT128x64x8_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 2 + LVCB: 2 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 54 + SolutionNameMin: Cijk_Alik_Bljk_SB_GB_MT128x128x8_SN_SU32_SUM3_TT8_16_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 128 + LSPB: 128 + LVCA: 2 + LVCB: 2 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 55 + SolutionNameMin: Cijk_Alik_Bljk_SB_GB_MT128x128x8_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 128 + LSPB: 128 + LVCA: 2 + LVCB: 2 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 256 + MacroTileA: 128 + MacroTileB: 256 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 56 + SolutionNameMin: Cijk_Alik_Bljk_SB_GB_MT128x256x8_SN_SU32_SUM3_TT8_16_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 57 + SolutionNameMin: Cijk_Alik_Bljk_SB_GB_MT128x64x16_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 58 + SolutionNameMin: Cijk_Alik_Bljk_SB_GB_MT128x128x16_SN_SU32_SUM3_TT8_16_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 59 + SolutionNameMin: Cijk_Alik_Bljk_SB_GB_MT128x128x16_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 256 + MacroTileA: 128 + MacroTileB: 256 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 60 + SolutionNameMin: Cijk_Alik_Bljk_SB_GB_MT128x256x16_SN_SU32_SUM3_TT8_16_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 61 + SolutionNameMin: Cijk_Alik_Bljk_SB_GB_MT128x64x32_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 62 + SolutionNameMin: Cijk_Alik_Bljk_SB_GB_MT128x128x32_SN_SU32_SUM3_TT8_16_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 63 + SolutionNameMin: Cijk_Alik_Bljk_SB_GB_MT128x128x32_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 64 + SolutionNameMin: Cijk_Alik_Bljk_SB_GB_MT128x128x16_SN_SU0_SUM0_TT8_16_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 256 + MacroTileA: 128 + MacroTileB: 256 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 65 + SolutionNameMin: Cijk_Alik_Bljk_SB_GB_MT128x256x16_SN_SU0_SUM0_TT8_16_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 66 + SolutionNameMin: Cijk_Alik_Bljk_SB_GB_MT128x64x32_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 67 + SolutionNameMin: Cijk_Alik_Bljk_SB_GB_MT128x128x32_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 68 + SolutionNameMin: Cijk_Alik_Bljk_SB_GB_MT128x128x16_SN_SU32_SUM3_TT8_16_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 256 + MacroTileA: 128 + MacroTileB: 256 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 69 + SolutionNameMin: Cijk_Alik_Bljk_SB_GB_MT128x256x16_SN_SU32_SUM3_TT8_16_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 70 + SolutionNameMin: Cijk_Alik_Bljk_SB_GB_MT128x64x32_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 71 + SolutionNameMin: Cijk_Alik_Bljk_SB_GB_MT128x128x32_SN_SU32_SUM3_TT8_16_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 72 + SolutionNameMin: Cijk_Alik_Bljk_SB_GB_MT128x128x32_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 73 + SolutionNameMin: Cijk_Alik_Bljk_SB_GB_MT128x64x16_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 74 + SolutionNameMin: Cijk_Alik_Bljk_SB_GB_MT128x128x16_SN_SU0_SUM0_TT8_16_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 75 + SolutionNameMin: Cijk_Alik_Bljk_SB_GB_MT128x128x32_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 76 + SolutionNameMin: Cijk_Alik_Bljk_SB_GB_MT128x128x32_SN_SU32_SUM3_TT8_16_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 77 + SolutionNameMin: Cijk_Alik_Bljk_SB_GB_MT128x128x32_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 78 + SolutionNameMin: Cijk_Alik_Bljk_SB_GB_MT128x128x16_SN_SU0_SUM0_TT8_16_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 79 + SolutionNameMin: Cijk_Alik_Bljk_SB_GB_MT128x128x16_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 80 + SolutionNameMin: Cijk_Alik_Bljk_SB_GB_MT128x128x32_SN_SU0_SUM0_TT8_16_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 81 + SolutionNameMin: Cijk_Alik_Bljk_SB_GB_MT128x128x32_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 128 + LSPB: 128 + LVCA: 2 + LVCB: 2 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 82 + SolutionNameMin: Cijk_Alik_Bljk_SB_GB_MT128x128x8_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 83 + SolutionNameMin: Cijk_Alik_Bljk_SB_GB_MT128x128x16_SN_SU32_SUM3_TT8_16_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 84 + SolutionNameMin: Cijk_Alik_Bljk_SB_GB_MT128x128x16_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 256 + MacroTileA: 128 + MacroTileB: 256 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 85 + SolutionNameMin: Cijk_Alik_Bljk_SB_GB_MT128x256x16_SN_SU32_SUM3_TT8_16_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 86 + SolutionNameMin: Cijk_Alik_Bljk_SB_GB_MT128x64x32_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 87 + SolutionNameMin: Cijk_Alik_Bljk_SB_GB_MT128x128x32_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 32 + LSPB: 32 + LVCA: 2 + LVCB: 2 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 88 + SolutionNameMin: Cijk_Alik_Bljk_SB_GB_MT32x32x8_SN_SU0_SUM0_TT4_4_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 32 + LSPB: 32 + LVCA: 2 + LVCB: 2 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 256 + LdsOffsetB_Blk: 1280 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 89 + SolutionNameMin: Cijk_Alik_Bljk_SB_GB_MT32x64x8_SN_SU0_SUM0_TT4_8_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 32 + LSPB: 32 + LVCA: 2 + LVCB: 2 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 90 + SolutionNameMin: Cijk_Alik_Bljk_SB_GB_MT64x64x8_SN_SU0_SUM0_TT8_8_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 32 + LVCA: 2 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 91 + SolutionNameMin: Cijk_Alik_Bljk_SB_GB_MT64x32x8_SN_SU0_SUM0_TT4_4_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 2 + LVCB: 2 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 92 + SolutionNameMin: Cijk_Alik_Bljk_SB_GB_MT64x64x8_SN_SU0_SUM0_TT4_8_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 2 + LVCB: 2 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 93 + SolutionNameMin: Cijk_Alik_Bljk_SB_GB_MT128x64x8_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 128 + LVCA: 4 + LVCB: 2 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 94 + SolutionNameMin: Cijk_Alik_Bljk_SB_GB_MT64x128x8_SN_SU0_SUM0_TT4_8_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 128 + LSPB: 128 + LVCA: 2 + LVCB: 2 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 95 + SolutionNameMin: Cijk_Alik_Bljk_SB_GB_MT128x128x8_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 4 + LVCB: 4 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 96 + SolutionNameMin: Cijk_Alik_Bljk_SB_GB_MT32x64x16_SN_SU0_SUM0_TT4_8_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 4 + LVCB: 4 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 97 + SolutionNameMin: Cijk_Alik_Bljk_SB_GB_MT64x64x16_SN_SU0_SUM0_TT8_8_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 98 + SolutionNameMin: Cijk_Alik_Bljk_SB_GB_MT64x64x16_SN_SU0_SUM0_TT4_8_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 99 + SolutionNameMin: Cijk_Alik_Bljk_SB_GB_MT128x64x16_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 100 + SolutionNameMin: Cijk_Alik_Bljk_SB_GB_MT128x128x16_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 101 + SolutionNameMin: Cijk_Alik_Bljk_SB_GB_MT64x64x32_SN_SU0_SUM0_TT8_8_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 102 + SolutionNameMin: Cijk_Alik_Bljk_SB_GB_MT64x64x32_SN_SU0_SUM0_TT4_8_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 103 + SolutionNameMin: Cijk_Alik_Bljk_SB_GB_MT128x64x32_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 104 + SolutionNameMin: Cijk_Alik_Bljk_SB_GB_MT128x128x32_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 32 + LSPB: 32 + LVCA: 2 + LVCB: 2 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 105 + SolutionNameMin: Cijk_Alik_Bljk_SB_GB_MT32x32x8_SN_SU32_SUM3_TT4_4_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 32 + LVCA: 2 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 106 + SolutionNameMin: Cijk_Alik_Bljk_SB_GB_MT64x32x8_SN_SU32_SUM3_TT4_4_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 2 + LVCB: 2 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 107 + SolutionNameMin: Cijk_Alik_Bljk_SB_GB_MT64x64x8_SN_SU32_SUM3_TT4_8_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 2 + LVCB: 2 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 108 + SolutionNameMin: Cijk_Alik_Bljk_SB_GB_MT128x64x8_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 128 + LSPB: 128 + LVCA: 2 + LVCB: 2 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 109 + SolutionNameMin: Cijk_Alik_Bljk_SB_GB_MT128x128x8_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 4 + LVCB: 4 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 110 + SolutionNameMin: Cijk_Alik_Bljk_SB_GB_MT64x64x16_SN_SU32_SUM3_TT8_8_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 111 + SolutionNameMin: Cijk_Alik_Bljk_SB_GB_MT64x32x16_SN_SU32_SUM3_TT4_4_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 112 + SolutionNameMin: Cijk_Alik_Bljk_SB_GB_MT64x64x16_SN_SU32_SUM3_TT4_8_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 113 + SolutionNameMin: Cijk_Alik_Bljk_SB_GB_MT128x64x16_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 114 + SolutionNameMin: Cijk_Alik_Bljk_SB_GB_MT64x128x16_SN_SU32_SUM3_TT4_8_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 115 + SolutionNameMin: Cijk_Alik_Bljk_SB_GB_MT128x128x16_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 116 + SolutionNameMin: Cijk_Alik_Bljk_SB_GB_MT64x64x32_SN_SU32_SUM3_TT8_8_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 117 + SolutionNameMin: Cijk_Alik_Bljk_SB_GB_MT64x64x32_SN_SU32_SUM3_TT4_8_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 118 + SolutionNameMin: Cijk_Alik_Bljk_SB_GB_MT128x64x32_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 119 + SolutionNameMin: Cijk_Alik_Bljk_SB_GB_MT64x128x32_SN_SU32_SUM3_TT4_8_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 120 + SolutionNameMin: Cijk_Alik_Bljk_SB_GB_MT128x128x32_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 32 + LSPB: 32 + LVCA: 2 + LVCB: 2 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 256 + LdsOffsetB_Blk: 1280 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 121 + SolutionNameMin: Cijk_Alik_Bljk_SB_GB_MT32x64x8_SN_SU0_SUM0_TT4_8_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 32 + LSPB: 32 + LVCA: 2 + LVCB: 2 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 122 + SolutionNameMin: Cijk_Alik_Bljk_SB_GB_MT64x64x8_SN_SU0_SUM0_TT8_8_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 32 + LVCA: 2 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 123 + SolutionNameMin: Cijk_Alik_Bljk_SB_GB_MT64x32x8_SN_SU0_SUM0_TT4_4_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 2 + LVCB: 2 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 124 + SolutionNameMin: Cijk_Alik_Bljk_SB_GB_MT64x64x8_SN_SU0_SUM0_TT4_8_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 2 + LVCB: 2 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 125 + SolutionNameMin: Cijk_Alik_Bljk_SB_GB_MT128x64x8_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 126 + SolutionNameMin: Cijk_Alik_Bljk_SB_GB_MT64x64x8_SN_SU0_SUM0_TT4_4_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 128 + LVCA: 4 + LVCB: 2 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 127 + SolutionNameMin: Cijk_Alik_Bljk_SB_GB_MT64x128x8_SN_SU0_SUM0_TT4_8_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 128 + LSPB: 128 + LVCA: 2 + LVCB: 2 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 128 + SolutionNameMin: Cijk_Alik_Bljk_SB_GB_MT128x128x8_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 4 + LVCB: 4 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 129 + SolutionNameMin: Cijk_Alik_Bljk_SB_GB_MT64x64x16_SN_SU0_SUM0_TT8_8_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 130 + SolutionNameMin: Cijk_Alik_Bljk_SB_GB_MT64x64x16_SN_SU0_SUM0_TT4_8_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 131 + SolutionNameMin: Cijk_Alik_Bljk_SB_GB_MT128x64x16_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 132 + SolutionNameMin: Cijk_Alik_Bljk_SB_GB_MT128x128x16_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 8 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 133 + SolutionNameMin: Cijk_Alik_Bljk_SB_GB_MT32x64x32_SN_SU0_SUM0_TT4_8_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 134 + SolutionNameMin: Cijk_Alik_Bljk_SB_GB_MT64x64x32_SN_SU0_SUM0_TT8_8_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 135 + SolutionNameMin: Cijk_Alik_Bljk_SB_GB_MT64x64x32_SN_SU0_SUM0_TT4_4_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 136 + SolutionNameMin: Cijk_Alik_Bljk_SB_GB_MT128x128x32_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 32 + LSPB: 32 + LVCA: 2 + LVCB: 2 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 256 + LdsOffsetB_Blk: 1280 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 137 + SolutionNameMin: Cijk_Alik_Bljk_SB_GB_MT32x64x8_SN_SU32_SUM3_TT4_8_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 32 + LSPB: 32 + LVCA: 2 + LVCB: 2 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 138 + SolutionNameMin: Cijk_Alik_Bljk_SB_GB_MT64x64x8_SN_SU32_SUM3_TT8_8_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 2 + LVCB: 2 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 139 + SolutionNameMin: Cijk_Alik_Bljk_SB_GB_MT64x64x8_SN_SU32_SUM3_TT4_8_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 2 + LVCB: 2 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 140 + SolutionNameMin: Cijk_Alik_Bljk_SB_GB_MT128x64x8_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 141 + SolutionNameMin: Cijk_Alik_Bljk_SB_GB_MT64x64x8_SN_SU32_SUM3_TT4_4_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 128 + LVCA: 4 + LVCB: 2 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 142 + SolutionNameMin: Cijk_Alik_Bljk_SB_GB_MT64x128x8_SN_SU32_SUM3_TT4_8_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 128 + LSPB: 128 + LVCA: 2 + LVCB: 2 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 143 + SolutionNameMin: Cijk_Alik_Bljk_SB_GB_MT128x128x8_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 4 + LVCB: 4 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 144 + SolutionNameMin: Cijk_Alik_Bljk_SB_GB_MT64x64x16_SN_SU32_SUM3_TT8_8_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 145 + SolutionNameMin: Cijk_Alik_Bljk_SB_GB_MT64x64x16_SN_SU32_SUM3_TT4_8_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 146 + SolutionNameMin: Cijk_Alik_Bljk_SB_GB_MT128x64x16_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 147 + SolutionNameMin: Cijk_Alik_Bljk_SB_GB_MT64x128x16_SN_SU32_SUM3_TT4_8_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 148 + SolutionNameMin: Cijk_Alik_Bljk_SB_GB_MT128x128x16_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 149 + SolutionNameMin: Cijk_Alik_Bljk_SB_GB_MT64x64x32_SN_SU32_SUM3_TT8_8_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 150 + SolutionNameMin: Cijk_Alik_Bljk_SB_GB_MT64x64x32_SN_SU32_SUM3_TT4_8_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 151 + SolutionNameMin: Cijk_Alik_Bljk_SB_GB_MT128x64x32_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 152 + SolutionNameMin: Cijk_Alik_Bljk_SB_GB_MT128x128x32_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 32 + LSPB: 32 + LVCA: 2 + LVCB: 2 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 153 + SolutionNameMin: Cijk_Alik_Bljk_SB_GB_MT32x32x8_SN_SU0_SUM0_TT4_4_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 32 + LSPB: 32 + LVCA: 2 + LVCB: 2 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 256 + LdsOffsetB_Blk: 1280 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 154 + SolutionNameMin: Cijk_Alik_Bljk_SB_GB_MT32x64x8_SN_SU0_SUM0_TT4_8_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 32 + LSPB: 32 + LVCA: 2 + LVCB: 2 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 155 + SolutionNameMin: Cijk_Alik_Bljk_SB_GB_MT64x64x8_SN_SU0_SUM0_TT8_8_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 2 + LVCB: 2 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 156 + SolutionNameMin: Cijk_Alik_Bljk_SB_GB_MT64x64x8_SN_SU0_SUM0_TT4_8_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 2 + LVCB: 2 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 157 + SolutionNameMin: Cijk_Alik_Bljk_SB_GB_MT128x64x8_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 128 + LVCA: 4 + LVCB: 2 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 158 + SolutionNameMin: Cijk_Alik_Bljk_SB_GB_MT64x128x8_SN_SU0_SUM0_TT4_8_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 128 + LSPB: 128 + LVCA: 2 + LVCB: 2 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 159 + SolutionNameMin: Cijk_Alik_Bljk_SB_GB_MT128x128x8_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 4 + LVCB: 4 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 160 + SolutionNameMin: Cijk_Alik_Bljk_SB_GB_MT32x32x16_SN_SU0_SUM0_TT4_4_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 4 + LVCB: 4 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 161 + SolutionNameMin: Cijk_Alik_Bljk_SB_GB_MT64x64x16_SN_SU0_SUM0_TT8_8_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 162 + SolutionNameMin: Cijk_Alik_Bljk_SB_GB_MT64x64x16_SN_SU0_SUM0_TT4_8_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 163 + SolutionNameMin: Cijk_Alik_Bljk_SB_GB_MT128x64x16_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 164 + SolutionNameMin: Cijk_Alik_Bljk_SB_GB_MT64x64x16_SN_SU0_SUM0_TT4_4_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 165 + SolutionNameMin: Cijk_Alik_Bljk_SB_GB_MT128x128x16_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 166 + SolutionNameMin: Cijk_Alik_Bljk_SB_GB_MT64x128x32_SN_SU0_SUM0_TT4_8_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 32 + LSPB: 32 + LVCA: 2 + LVCB: 2 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 167 + SolutionNameMin: Cijk_Alik_Bljk_SB_GB_MT32x32x8_SN_SU32_SUM3_TT4_4_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 32 + LSPB: 32 + LVCA: 2 + LVCB: 2 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 256 + LdsOffsetB_Blk: 1280 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 168 + SolutionNameMin: Cijk_Alik_Bljk_SB_GB_MT32x64x8_SN_SU32_SUM3_TT4_8_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 32 + LSPB: 32 + LVCA: 2 + LVCB: 2 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 169 + SolutionNameMin: Cijk_Alik_Bljk_SB_GB_MT64x64x8_SN_SU32_SUM3_TT8_8_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 32 + LVCA: 2 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 170 + SolutionNameMin: Cijk_Alik_Bljk_SB_GB_MT64x32x8_SN_SU32_SUM3_TT4_4_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 2 + LVCB: 2 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 171 + SolutionNameMin: Cijk_Alik_Bljk_SB_GB_MT64x64x8_SN_SU32_SUM3_TT4_8_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 64 + LVCA: 2 + LVCB: 2 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 172 + SolutionNameMin: Cijk_Alik_Bljk_SB_GB_MT128x64x8_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 64 + LSPB: 128 + LVCA: 4 + LVCB: 2 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 173 + SolutionNameMin: Cijk_Alik_Bljk_SB_GB_MT64x128x8_SN_SU32_SUM3_TT4_8_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 128 + LSPB: 128 + LVCA: 2 + LVCB: 2 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 174 + SolutionNameMin: Cijk_Alik_Bljk_SB_GB_MT128x128x8_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 4 + LVCB: 4 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 175 + SolutionNameMin: Cijk_Alik_Bljk_SB_GB_MT64x64x16_SN_SU32_SUM3_TT8_8_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 176 + SolutionNameMin: Cijk_Alik_Bljk_SB_GB_MT64x32x16_SN_SU32_SUM3_TT4_4_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 177 + SolutionNameMin: Cijk_Alik_Bljk_SB_GB_MT64x64x16_SN_SU32_SUM3_TT4_8_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 178 + SolutionNameMin: Cijk_Alik_Bljk_SB_GB_MT128x64x16_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 179 + SolutionNameMin: Cijk_Alik_Bljk_SB_GB_MT64x128x16_SN_SU32_SUM3_TT4_8_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 180 + SolutionNameMin: Cijk_Alik_Bljk_SB_GB_MT128x128x16_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 181 + SolutionNameMin: Cijk_Alik_Bljk_SB_GB_MT64x64x32_SN_SU32_SUM3_TT8_8_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 182 + SolutionNameMin: Cijk_Alik_Bljk_SB_GB_MT64x32x32_SN_SU32_SUM3_TT4_4_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 183 + SolutionNameMin: Cijk_Alik_Bljk_SB_GB_MT64x64x32_SN_SU32_SUM3_TT4_8_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 184 + SolutionNameMin: Cijk_Alik_Bljk_SB_GB_MT128x64x32_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 185 + SolutionNameMin: Cijk_Alik_Bljk_SB_GB_MT64x64x32_SN_SU32_SUM3_TT4_4_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 186 + SolutionNameMin: Cijk_Alik_Bljk_SB_GB_MT64x128x32_SN_SU32_SUM3_TT4_8_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 187 + SolutionNameMin: Cijk_Alik_Bljk_SB_GB_MT128x128x32_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 128 + LSPB: 128 + LVCA: 2 + LVCB: 2 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 188 + SolutionNameMin: Cijk_Alik_Bljk_SB_GB_MT128x128x8_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 189 + SolutionNameMin: Cijk_Alik_Bljk_SB_GB_MT64x64x16_SN_SU0_SUM0_TT4_8_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 190 + SolutionNameMin: Cijk_Alik_Bljk_SB_GB_MT128x64x16_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 191 + SolutionNameMin: Cijk_Alik_Bljk_SB_GB_MT128x128x16_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 192 + SolutionNameMin: Cijk_Alik_Bljk_SB_GB_MT64x128x32_SN_SU0_SUM0_TT4_8_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 193 + SolutionNameMin: Cijk_Alik_Bljk_SB_GB_MT128x64x16_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 194 + SolutionNameMin: Cijk_Alik_Bljk_SB_GB_MT64x64x16_SN_SU32_SUM3_TT4_4_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 195 + SolutionNameMin: Cijk_Alik_Bljk_SB_GB_MT128x128x16_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 196 + SolutionNameMin: Cijk_Alik_Bljk_SB_GB_MT128x128x32_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 197 + SolutionNameMin: Cijk_Alik_Bljk_SB_GB_MT64x64x16_SN_SU0_SUM0_TT4_8_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 198 + SolutionNameMin: Cijk_Alik_Bljk_SB_GB_MT128x64x16_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 199 + SolutionNameMin: Cijk_Alik_Bljk_SB_GB_MT128x128x16_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 200 + SolutionNameMin: Cijk_Alik_Bljk_SB_GB_MT64x64x32_SN_SU0_SUM0_TT8_8_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 128 + LSPB: 128 + LVCA: 2 + LVCB: 2 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 201 + SolutionNameMin: Cijk_Alik_Bljk_SB_GB_MT128x128x8_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 202 + SolutionNameMin: Cijk_Alik_Bljk_SB_GB_MT64x64x16_SN_SU32_SUM3_TT4_8_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 203 + SolutionNameMin: Cijk_Alik_Bljk_SB_GB_MT128x128x16_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 204 + SolutionNameMin: Cijk_Alik_Bljk_SB_GB_MT64x64x32_SN_SU32_SUM3_TT8_8_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 205 + SolutionNameMin: Cijk_Alik_Bljk_SB_GB_MT128x64x32_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 206 + SolutionNameMin: Cijk_Alik_Bljk_SB_GB_MT128x128x32_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 128 + LSPB: 128 + LVCA: 2 + LVCB: 2 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 207 + SolutionNameMin: Cijk_Alik_Bljk_SB_GB_MT128x128x8_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 4 + LVCB: 4 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 208 + SolutionNameMin: Cijk_Alik_Bljk_SB_GB_MT64x64x16_SN_SU0_SUM0_TT8_8_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 209 + SolutionNameMin: Cijk_Alik_Bljk_SB_GB_MT128x64x16_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 210 + SolutionNameMin: Cijk_Alik_Bljk_SB_GB_MT128x128x16_SN_SU0_SUM0_TT8_8_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 211 + SolutionNameMin: Cijk_Alik_Bljk_SB_GB_MT64x32x32_SN_SU0_SUM0_TT4_4_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 14336 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 212 + SolutionNameMin: Cijk_Alik_Bljk_SB_GB_MT128x64x32_SN_SU0_SUM0_TT8_8_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 213 + SolutionNameMin: Cijk_Alik_Bljk_SB_GB_MT128x64x16_SN_SU32_SUM3_TT8_8_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 214 + SolutionNameMin: Cijk_Alik_Bljk_SB_GB_MT64x128x16_SN_SU32_SUM3_TT4_8_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 64 + LVCA: 4 + LVCB: 4 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 215 + SolutionNameMin: Cijk_Alik_Bljk_SB_GB_MT128x128x16_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 4 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 216 + SolutionNameMin: Cijk_Alik_Bljk_SB_GB_MT128x128x32_SN_SU32_SUM3_TT8_8_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 512 + LdsNumElementsAlignedA: 128 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 256 + LdsOffsetB: 128 + LdsOffsetB_Blk: 384 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 217 + SolutionNameMin: Cijk_Alik_Bljk_SB_GB_MT16x16x8_SN_SU0_SUM0_TT2_2_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 218 + SolutionNameMin: Cijk_Alik_Bljk_SB_GB_MT32x32x8_SN_SU0_SUM0_TT4_4_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 896 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 219 + SolutionNameMin: Cijk_Alik_Bljk_SB_GB_MT32x16x8_SN_SU0_SUM0_TT2_2_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 220 + SolutionNameMin: Cijk_Alik_Bljk_SB_GB_MT64x32x8_SN_SU0_SUM0_TT4_4_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 221 + SolutionNameMin: Cijk_Alik_Bljk_SB_GB_MT32x32x8_SN_SU0_SUM0_TT2_2_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 222 + SolutionNameMin: Cijk_Alik_Bljk_SB_GB_MT64x64x8_SN_SU0_SUM0_TT4_4_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 223 + SolutionNameMin: Cijk_Alik_Bljk_SB_GB_MT16x16x16_SN_SU0_SUM0_TT2_2_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 224 + SolutionNameMin: Cijk_Alik_Bljk_SB_GB_MT32x32x16_SN_SU0_SUM0_TT4_4_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 225 + SolutionNameMin: Cijk_Alik_Bljk_SB_GB_MT64x32x16_SN_SU0_SUM0_TT4_4_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 2 + LSPB: 2 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 226 + SolutionNameMin: Cijk_Alik_Bljk_SB_GB_MT16x16x32_SN_SU0_SUM0_TT2_2_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 512 + LdsNumElementsAlignedA: 128 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 256 + LdsOffsetB: 128 + LdsOffsetB_Blk: 384 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 227 + SolutionNameMin: Cijk_Alik_Bljk_SB_GB_MT16x16x8_SN_SU32_SUM3_TT2_2_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 228 + SolutionNameMin: Cijk_Alik_Bljk_SB_GB_MT32x32x8_SN_SU32_SUM3_TT4_4_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 896 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 229 + SolutionNameMin: Cijk_Alik_Bljk_SB_GB_MT32x16x8_SN_SU32_SUM3_TT2_2_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 230 + SolutionNameMin: Cijk_Alik_Bljk_SB_GB_MT64x32x8_SN_SU32_SUM3_TT4_4_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 231 + SolutionNameMin: Cijk_Alik_Bljk_SB_GB_MT32x32x8_SN_SU32_SUM3_TT2_2_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 232 + SolutionNameMin: Cijk_Alik_Bljk_SB_GB_MT64x64x8_SN_SU32_SUM3_TT4_4_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 233 + SolutionNameMin: Cijk_Alik_Bljk_SB_GB_MT16x16x16_SN_SU32_SUM3_TT2_2_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 234 + SolutionNameMin: Cijk_Alik_Bljk_SB_GB_MT32x32x16_SN_SU32_SUM3_TT4_4_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 235 + SolutionNameMin: Cijk_Alik_Bljk_SB_GB_MT64x32x16_SN_SU32_SUM3_TT4_4_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 2 + LSPB: 2 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 236 + SolutionNameMin: Cijk_Alik_Bljk_SB_GB_MT16x16x32_SN_SU32_SUM3_TT2_2_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 512 + LdsNumElementsAlignedA: 128 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 256 + LdsOffsetB: 128 + LdsOffsetB_Blk: 384 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 237 + SolutionNameMin: Cijk_Alik_Bljk_SB_GB_MT16x16x8_SN_SU0_SUM0_TT2_2_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 238 + SolutionNameMin: Cijk_Alik_Bljk_SB_GB_MT32x32x8_SN_SU0_SUM0_TT4_4_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 896 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 239 + SolutionNameMin: Cijk_Alik_Bljk_SB_GB_MT32x16x8_SN_SU0_SUM0_TT2_2_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 240 + SolutionNameMin: Cijk_Alik_Bljk_SB_GB_MT64x32x8_SN_SU0_SUM0_TT4_4_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 241 + SolutionNameMin: Cijk_Alik_Bljk_SB_GB_MT64x64x8_SN_SU0_SUM0_TT4_4_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 242 + SolutionNameMin: Cijk_Alik_Bljk_SB_GB_MT16x16x16_SN_SU0_SUM0_TT2_2_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 2 + LSPB: 2 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 243 + SolutionNameMin: Cijk_Alik_Bljk_SB_GB_MT16x16x32_SN_SU0_SUM0_TT2_2_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 512 + LdsNumElementsAlignedA: 128 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 256 + LdsOffsetB: 128 + LdsOffsetB_Blk: 384 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 244 + SolutionNameMin: Cijk_Alik_Bljk_SB_GB_MT16x16x8_SN_SU32_SUM3_TT2_2_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 245 + SolutionNameMin: Cijk_Alik_Bljk_SB_GB_MT32x32x8_SN_SU32_SUM3_TT4_4_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 896 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 246 + SolutionNameMin: Cijk_Alik_Bljk_SB_GB_MT32x16x8_SN_SU32_SUM3_TT2_2_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 247 + SolutionNameMin: Cijk_Alik_Bljk_SB_GB_MT64x32x8_SN_SU32_SUM3_TT4_4_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 248 + SolutionNameMin: Cijk_Alik_Bljk_SB_GB_MT64x64x8_SN_SU32_SUM3_TT4_4_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 249 + SolutionNameMin: Cijk_Alik_Bljk_SB_GB_MT16x16x16_SN_SU32_SUM3_TT2_2_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 250 + SolutionNameMin: Cijk_Alik_Bljk_SB_GB_MT64x32x16_SN_SU32_SUM3_TT4_4_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 512 + LdsNumElementsAlignedA: 128 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 256 + LdsOffsetB: 128 + LdsOffsetB_Blk: 384 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 251 + SolutionNameMin: Cijk_Alik_Bljk_SB_GB_MT16x16x8_SN_SU0_SUM0_TT2_2_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 252 + SolutionNameMin: Cijk_Alik_Bljk_SB_GB_MT32x32x8_SN_SU0_SUM0_TT4_4_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 253 + SolutionNameMin: Cijk_Alik_Bljk_SB_GB_MT64x32x8_SN_SU0_SUM0_TT4_4_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 254 + SolutionNameMin: Cijk_Alik_Bljk_SB_GB_MT32x32x8_SN_SU0_SUM0_TT2_2_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 255 + SolutionNameMin: Cijk_Alik_Bljk_SB_GB_MT64x64x8_SN_SU0_SUM0_TT4_4_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 256 + SolutionNameMin: Cijk_Alik_Bljk_SB_GB_MT16x16x16_SN_SU0_SUM0_TT2_2_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 257 + SolutionNameMin: Cijk_Alik_Bljk_SB_GB_MT32x32x16_SN_SU0_SUM0_TT4_4_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 2 + LSPB: 2 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 258 + SolutionNameMin: Cijk_Alik_Bljk_SB_GB_MT16x16x32_SN_SU0_SUM0_TT2_2_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 512 + LdsNumElementsAlignedA: 128 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 256 + LdsOffsetB: 128 + LdsOffsetB_Blk: 384 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 259 + SolutionNameMin: Cijk_Alik_Bljk_SB_GB_MT16x16x8_SN_SU32_SUM3_TT2_2_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 260 + SolutionNameMin: Cijk_Alik_Bljk_SB_GB_MT32x32x8_SN_SU32_SUM3_TT4_4_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 896 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 261 + SolutionNameMin: Cijk_Alik_Bljk_SB_GB_MT32x16x8_SN_SU32_SUM3_TT2_2_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 262 + SolutionNameMin: Cijk_Alik_Bljk_SB_GB_MT64x32x8_SN_SU32_SUM3_TT4_4_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 263 + SolutionNameMin: Cijk_Alik_Bljk_SB_GB_MT64x64x8_SN_SU32_SUM3_TT4_4_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 264 + SolutionNameMin: Cijk_Alik_Bljk_SB_GB_MT16x16x16_SN_SU32_SUM3_TT2_2_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 265 + SolutionNameMin: Cijk_Alik_Bljk_SB_GB_MT64x32x16_SN_SU32_SUM3_TT4_4_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 266 + SolutionNameMin: Cijk_Alik_Bljk_SB_GB_MT64x32x8_SN_SU0_SUM0_TT4_4_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 267 + SolutionNameMin: Cijk_Alik_Bljk_SB_GB_MT64x64x8_SN_SU0_SUM0_TT4_4_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 268 + SolutionNameMin: Cijk_Alik_Bljk_SB_GB_MT16x16x16_SN_SU0_SUM0_TT2_2_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 2 + LSPB: 2 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 269 + SolutionNameMin: Cijk_Alik_Bljk_SB_GB_MT16x16x32_SN_SU0_SUM0_TT2_2_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 270 + SolutionNameMin: Cijk_Alik_Bljk_SB_GB_MT32x32x8_SN_SU32_SUM3_TT4_4_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 896 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 271 + SolutionNameMin: Cijk_Alik_Bljk_SB_GB_MT32x16x8_SN_SU32_SUM3_TT2_2_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 272 + SolutionNameMin: Cijk_Alik_Bljk_SB_GB_MT64x64x8_SN_SU32_SUM3_TT4_4_WG16_16_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 273 + SolutionNameMin: Cijk_Alik_Bljk_SB_GB_MT16x16x16_SN_SU32_SUM3_TT2_2_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 274 + SolutionNameMin: Cijk_Alik_Bljk_SB_GB_MT32x32x16_SN_SU32_SUM3_TT4_4_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 512 + LdsNumElementsAlignedA: 128 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 256 + LdsOffsetB: 128 + LdsOffsetB_Blk: 384 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 275 + SolutionNameMin: Cijk_Alik_Bljk_SB_GB_MT16x16x8_SN_SU0_SUM0_TT2_2_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 276 + SolutionNameMin: Cijk_Alik_Bljk_SB_GB_MT64x32x8_SN_SU0_SUM0_TT4_4_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 277 + SolutionNameMin: Cijk_Alik_Bljk_SB_GB_MT64x64x8_SN_SU0_SUM0_TT4_4_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 278 + SolutionNameMin: Cijk_Alik_Bljk_SB_GB_MT32x32x16_SN_SU0_SUM0_TT4_4_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 2 + LSPB: 2 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 279 + SolutionNameMin: Cijk_Alik_Bljk_SB_GB_MT16x16x32_SN_SU0_SUM0_TT2_2_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 280 + SolutionNameMin: Cijk_Alik_Bljk_SB_GB_MT32x32x8_SN_SU32_SUM3_TT4_4_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 281 + SolutionNameMin: Cijk_Alik_Bljk_SB_GB_MT64x64x8_SN_SU32_SUM3_TT4_4_WG16_16_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 282 + SolutionNameMin: Cijk_Alik_Bljk_SB_GB_MT16x16x16_SN_SU32_SUM3_TT2_2_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 283 + SolutionNameMin: Cijk_Alik_Bljk_SB_GB_MT32x32x16_SN_SU32_SUM3_TT4_4_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 284 + SolutionNameMin: Cijk_Alik_Bljk_SB_GB_MT64x32x8_SN_SU0_SUM0_TT4_4_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 285 + SolutionNameMin: Cijk_Alik_Bljk_SB_GB_MT64x64x8_SN_SU0_SUM0_TT4_4_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 286 + SolutionNameMin: Cijk_Alik_Bljk_SB_GB_MT16x16x16_SN_SU0_SUM0_TT2_2_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 2 + LSPB: 2 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 287 + SolutionNameMin: Cijk_Alik_Bljk_SB_GB_MT16x16x32_SN_SU0_SUM0_TT2_2_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 512 + LdsNumElementsAlignedA: 128 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 256 + LdsOffsetB: 128 + LdsOffsetB_Blk: 384 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 288 + SolutionNameMin: Cijk_Alik_Bljk_SB_GB_MT16x16x8_SN_SU32_SUM3_TT2_2_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 896 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 289 + SolutionNameMin: Cijk_Alik_Bljk_SB_GB_MT32x16x8_SN_SU32_SUM3_TT2_2_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 290 + SolutionNameMin: Cijk_Alik_Bljk_SB_GB_MT64x64x8_SN_SU32_SUM3_TT4_4_WG16_16_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 291 + SolutionNameMin: Cijk_Alik_Bljk_SB_GB_MT16x16x16_SN_SU32_SUM3_TT2_2_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 4 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 2 + LSPB: 2 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 32 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 292 + SolutionNameMin: Cijk_Alik_Bljk_SB_GB_MT16x16x32_SN_SU32_SUM3_TT2_2_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 832 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 64 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 293 + SolutionNameMin: Cijk_Alik_Bljk_SB_GB_MT32x8x8_SN_SU32_SUM3_TT2_2_WG16_4_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1600 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 64 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 8 + MacroTileA: 64 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 8 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 294 + SolutionNameMin: Cijk_Alik_Bljk_SB_GB_MT64x8x8_SN_SU32_SUM3_TT4_2_WG16_4_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1664 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 295 + SolutionNameMin: Cijk_Alik_Bljk_SB_GB_MT64x16x8_SN_SU32_SUM3_TT4_2_WG16_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 3136 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 64 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 16 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 4 + MacroTileA: 64 + MacroTileB: 4 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 16 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 296 + SolutionNameMin: Cijk_Alik_Bljk_SB_GB_MT64x4x16_SN_SU32_SUM3_TT4_1_WG16_4_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 1] + ThreadTile0: 4 + ThreadTile1: 1 + ThreadTileA: 4 + ThreadTileB: 1 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 1 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1664 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 297 + SolutionNameMin: Cijk_Alik_Bljk_SB_GB_MT64x16x8_SN_SU0_SUM0_TT4_2_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 832 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 64 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 298 + SolutionNameMin: Cijk_Alik_Bljk_SB_GB_MT32x8x8_SN_SU32_SUM3_TT2_2_WG16_4_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1600 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 64 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 8 + MacroTileA: 64 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 8 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 299 + SolutionNameMin: Cijk_Alik_Bljk_SB_GB_MT64x8x8_SN_SU32_SUM3_TT4_2_WG16_4_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1664 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 300 + SolutionNameMin: Cijk_Alik_Bljk_SB_GB_MT64x16x8_SN_SU0_SUM0_TT4_2_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 832 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 64 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 301 + SolutionNameMin: Cijk_Alik_Bljk_SB_GB_MT32x8x8_SN_SU32_SUM3_TT2_2_WG16_4_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1600 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 64 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 8 + MacroTileA: 64 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 8 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 302 + SolutionNameMin: Cijk_Alik_Bljk_SB_GB_MT64x8x8_SN_SU32_SUM3_TT4_2_WG16_4_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1664 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 303 + SolutionNameMin: Cijk_Alik_Bljk_SB_GB_MT64x16x8_SN_SU32_SUM3_TT4_2_WG16_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 832 + LdsNumElementsAlignedA: 64 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 64 + LdsOffsetB_Blk: 576 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 8 + MacroTile1: 32 + MacroTileA: 8 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 304 + SolutionNameMin: Cijk_Alik_Bljk_SB_GB_MT8x32x8_SN_SU32_SUM3_TT1_4_WG8_8_1_WGM1 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [1, 4] + ThreadTile0: 1 + ThreadTile1: 4 + ThreadTileA: 1 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 832 + LdsNumElementsAlignedA: 64 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 64 + LdsOffsetB_Blk: 576 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 8 + MacroTile1: 32 + MacroTileA: 8 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 305 + SolutionNameMin: Cijk_Alik_Bljk_SB_GB_MT8x32x8_SN_SU32_SUM3_TT1_4_WG8_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [1, 4] + ThreadTile0: 1 + ThreadTile1: 4 + ThreadTileA: 1 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 306 + SolutionNameMin: Cijk_Alik_Bljk_SB_GB_MT32x32x8_SN_SU32_SUM3_TT2_4_WG16_8_1_WGM4 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: V3 + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [10, 3, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsNumElements: 832 + LdsNumElementsAlignedA: 64 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 64 + LdsOffsetB_Blk: 576 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MacroTile0: 8 + MacroTile1: 32 + MacroTileA: 8 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 307 + SolutionNameMin: Cijk_Alik_Bljk_SB_GB_MT8x32x8_SN_SU32_SUM3_TT1_4_WG8_8_1_WGM8 + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 3 + StaggerUStride: 128 + StoreRemapVectorWidth: 0 + StoreVectorWidth: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [1, 4] + ThreadTile0: 1 + ThreadTile1: 4 + ThreadTileA: 1 + ThreadTileB: 4 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 +- [2, 3, 0, 1] +- - - [1024, 4096, 1, 1024, 1024, 1024, 1024, 1024] + - [16, 10800.0] + - - [4096, 4096, 1, 1024, 4096, 4096, 1024, 1024] + - [16, 8798.0] + - - [1024, 4096, 1, 4096, 1024, 1024, 4096, 4096] + - [62, 7704.0] + - - [30528, 4096, 1, 1024, 30528, 30528, 1024, 1024] + - [60, 10973.0] + - - [1024, 2048, 1, 1024, 1024, 1024, 1024, 1024] + - [38, 9500.0] + - - [4096, 2048, 1, 1024, 4096, 4096, 1024, 1024] + - [16, 11431.0] + - - [1024, 2048, 1, 4096, 1024, 1024, 4096, 4096] + - [38, 10175.0] + - - [30528, 2048, 1, 1024, 30528, 30528, 1024, 1024] + - [60, 10252.0] + - - [30522, 320, 1, 768, 30522, 30522, 768, 768] + - [58, 9801.0] + - - [3072, 4096, 1, 768, 3072, 3072, 768, 768] + - [16, 11748.0] + - - [768, 4096, 1, 3072, 768, 768, 3072, 3072] + - [40, 11400.0] + - - [768, 4096, 1, 768, 768, 768, 768, 768] + - [40, 10903.0] + - - [30522, 160, 1, 768, 30522, 30522, 768, 768] + - [57, 8104.0] + - - [30522, 640, 1, 768, 30522, 30522, 768, 768] + - [16, 11743.0] + - - [30522, 1280, 1, 768, 30522, 30522, 768, 768] + - [46, 9655.0] + - - [1024, 3072, 1, 1024, 1024, 1024, 1024, 1024] + - [40, 10278.0] + - - [1024, 2048, 1, 3072, 1024, 1024, 3072, 3072] + - [16, 10523.0] + - - [1024, 3072, 1, 3072, 1024, 1024, 3072, 3072] + - [40, 11377.0] + - - [3072, 2048, 1, 1024, 3072, 3072, 1024, 1024] + - [38, 11225.0] + - - [3072, 3072, 1, 1024, 3072, 3072, 1024, 1024] + - [38, 11447.0] + - - [3072, 512, 1, 1024, 3072, 3072, 1024, 1024] + - [58, 10135.0] + - - [30522, 160, 1, 1024, 30522, 30522, 1024, 1024] + - [58, 7355.0] + - - [128, 128, 512, 64, 128, 128, 64, 64] + - [59, 9168.0] + - - [512, 512, 64, 64, 512, 512, 64, 64] + - [5, 10616.0] + - - [256, 256, 192, 64, 256, 256, 64, 64] + - [46, 10303.0] + - - [256, 256, 96, 64, 256, 256, 64, 64] + - [46, 9806.0] + - - [128, 128, 384, 64, 128, 128, 64, 64] + - [6, 9222.0] + - - [128, 128, 96, 64, 128, 128, 64, 64] + - [2, 7252.0] + - - [512, 512, 16, 64, 512, 512, 64, 64] + - [17, 7354.0] + - - [512, 512, 96, 64, 512, 512, 64, 64] + - [0, 8094.0] + - - [512, 512, 128, 64, 512, 512, 64, 64] + - [4, 7621.0] + - - [2944, 4288, 1, 1280, 2944, 2944, 1280, 1280] + - [49, 9890.0] + - - [2368, 5888, 1, 256, 2368, 2368, 256, 256] + - [49, 11075.0] + - - [5888, 1856, 1, 256, 5888, 5888, 256, 256] + - [16, 11052.0] + - - [512, 24000, 1, 1536, 512, 512, 1536, 1536] + - [41, 7598.0] + - - [5888, 1408, 1, 256, 5888, 5888, 256, 256] + - [16, 10019.0] + - - [5888, 1856, 1, 3328, 5888, 5888, 3328, 3328] + - [16, 9555.0] + - - [5056, 704, 1, 256, 5056, 5056, 256, 256] + - [13, 8358.0] + - - [5888, 2944, 1, 3328, 5888, 5888, 3328, 3328] + - [38, 10478.0] + - - [1856, 4288, 1, 256, 1856, 1856, 256, 256] + - [14, 9516.0] + - - [1024, 5056, 1, 128, 1024, 1024, 128, 128] + - [38, 9076.0] + - - [5056, 5056, 1, 3328, 5056, 5056, 3328, 3328] + - [51, 11685.0] + - - [1408, 5888, 1, 1280, 1408, 1408, 1280, 1280] + - [18, 11604.0] + - - [2368, 6784, 1, 128, 2368, 2368, 128, 128] + - [34, 9359.0] + - - [1024, 3584, 1, 3328, 1024, 1024, 3328, 3328] + - [29, 10948.0] + - - [512, 48000, 1, 2048, 512, 512, 2048, 2048] + - [49, 9808.0] + - - [5888, 1408, 1, 1280, 5888, 5888, 1280, 1280] + - [58, 11414.0] + - - [1024, 2368, 1, 256, 1024, 1024, 256, 256] + - [37, 8901.0] + - - [1408, 1856, 1, 1280, 1408, 1408, 1280, 1280] + - [16, 10141.0] + - - [6144, 24000, 1, 2048, 6144, 6144, 2048, 2048] + - [38, 11123.0] + - - [5056, 5056, 1, 1280, 5056, 5056, 1280, 1280] + - [18, 9817.0] + - - [448, 5056, 1, 256, 448, 448, 256, 256] + - [39, 8530.0] + - - [1760, 6400, 1, 1760, 1760, 1760, 1760, 1760] + - [51, 10062.0] + - - [1856, 1408, 1, 128, 1856, 1856, 128, 128] + - [35, 8460.0] + - - [6784, 256, 1, 3328, 6784, 6784, 3328, 3328] + - [58, 10079.0] + - - [6784, 4288, 1, 3328, 6784, 6784, 3328, 3328] + - [58, 11003.0] + - - [4288, 448, 1, 256, 4288, 4288, 256, 256] + - [13, 7202.0] + - - [1856, 2368, 1, 3328, 1856, 1856, 3328, 3328] + - [16, 10515.0] + - - [4288, 2944, 1, 1280, 4288, 4288, 1280, 1280] + - [16, 10492.0] + - - [704, 5056, 1, 1280, 704, 704, 1280, 1280] + - [60, 10440.0] + - - [2368, 704, 1, 3328, 2368, 2368, 3328, 3328] + - [16, 9652.0] + - - [256, 5888, 1, 256, 256, 256, 256, 256] + - [38, 8652.0] + - - [1856, 4288, 1, 3328, 1856, 1856, 3328, 3328] + - [28, 9926.0] + - - [5888, 1024, 1, 256, 5888, 5888, 256, 256] + - [38, 10850.0] + - - [16384, 3200, 1, 4096, 16384, 16384, 4096, 4096] + - [18, 10127.0] + - - [1408, 2944, 1, 256, 1408, 1408, 256, 256] + - [35, 9443.0] + - - [6784, 5056, 1, 3328, 6784, 6784, 3328, 3328] + - [40, 11861.0] + - - [5056, 5056, 1, 256, 5056, 5056, 256, 256] + - [16, 11535.0] + - - [1408, 6784, 1, 128, 1408, 1408, 128, 128] + - [23, 11051.0] + - - [704, 5056, 1, 128, 704, 704, 128, 128] + - [34, 8958.0] + - - [2368, 2944, 1, 1280, 2368, 2368, 1280, 1280] + - [16, 11545.0] + - - [6784, 6784, 1, 1280, 6784, 6784, 1280, 1280] + - [16, 10927.0] + - - [1408, 4288, 1, 1280, 1408, 1408, 1280, 1280] + - [38, 11124.0] + - - [3584, 4288, 1, 1280, 3584, 3584, 1280, 1280] + - [40, 11816.0] + - - [2368, 704, 1, 1280, 2368, 2368, 1280, 1280] + - [16, 9409.0] + - - [5056, 4288, 1, 3328, 5056, 5056, 3328, 3328] + - [16, 10742.0] + - - [3584, 2368, 1, 3328, 3584, 3584, 3328, 3328] + - [40, 9398.0] + - - [6784, 448, 1, 1280, 6784, 6784, 1280, 1280] + - [16, 9364.0] + - - [1408, 2944, 1, 128, 1408, 1408, 128, 128] + - [34, 9689.0] + - - [4288, 2944, 1, 256, 4288, 4288, 256, 256] + - [27, 11149.0] + - - [5888, 704, 1, 1280, 5888, 5888, 1280, 1280] + - [29, 10649.0] + - - [448, 5888, 1, 128, 448, 448, 128, 128] + - [46, 7823.0] + - - [5056, 2368, 1, 1280, 5056, 5056, 1280, 1280] + - [27, 10203.0] + - - [448, 3584, 1, 1280, 448, 448, 1280, 1280] + - [40, 9207.0] + - - [6784, 5888, 1, 256, 6784, 6784, 256, 256] + - [60, 11946.0] + - - [1024, 1408, 1, 256, 1024, 1024, 256, 256] + - [28, 8254.0] + - - [2368, 2368, 1, 3328, 2368, 2368, 3328, 3328] + - [16, 10590.0] + - - [1856, 6784, 1, 128, 1856, 1856, 128, 128] + - [35, 9512.0] + - - [5056, 704, 1, 3328, 5056, 5056, 3328, 3328] + - [51, 9099.0] + - - [1408, 1856, 1, 256, 1408, 1408, 256, 256] + - [49, 8492.0] + - - [2368, 5056, 1, 256, 2368, 2368, 256, 256] + - [27, 10985.0] + - - [3584, 2368, 1, 1280, 3584, 3584, 1280, 1280] + - [38, 11345.0] + - - [704, 5888, 1, 256, 704, 704, 256, 256] + - [58, 9236.0] + - - [6784, 2944, 1, 128, 6784, 6784, 128, 128] + - [58, 9355.0] + - - [2560, 1600, 1, 2560, 2560, 2560, 2560, 2560] + - [58, 11058.0] + - - [4288, 6784, 1, 3328, 4288, 4288, 3328, 3328] + - [58, 11191.0] + - - [2944, 6784, 1, 3328, 2944, 2944, 3328, 3328] + - [16, 10964.0] + - - [6144, 5984, 1, 2048, 6144, 6144, 2048, 2048] + - [58, 9364.0] + - - [3584, 704, 1, 3328, 3584, 3584, 3328, 3328] + - [16, 9852.0] + - - [2048, 1600, 1, 512, 2048, 2048, 512, 512] + - [38, 9761.0] + - - [448, 4288, 1, 256, 448, 448, 256, 256] + - [38, 8307.0] + - - [1856, 4288, 1, 128, 1856, 1856, 128, 128] + - [27, 9842.0] + - - [704, 2368, 1, 1280, 704, 704, 1280, 1280] + - [60, 9421.0] + - - [1856, 2368, 1, 1280, 1856, 1856, 1280, 1280] + - [38, 10449.0] + - - [1856, 4288, 1, 1280, 1856, 1856, 1280, 1280] + - [18, 11088.0] + - - [704, 2944, 1, 128, 704, 704, 128, 128] + - [53, 8029.0] + - - [1408, 1024, 1, 1280, 1408, 1408, 1280, 1280] + - [58, 9666.0] + - - [704, 6784, 1, 256, 704, 704, 256, 256] + - [58, 9462.0] + - - [6784, 704, 1, 256, 6784, 6784, 256, 256] + - [16, 9794.0] + - - [5056, 1408, 1, 128, 5056, 5056, 128, 128] + - [34, 10051.0] + - - [2048, 7000, 1, 2048, 2048, 2048, 2048, 2048] + - [35, 8473.0] + - - [3584, 4288, 1, 3328, 3584, 3584, 3328, 3328] + - [38, 10297.0] + - - [5888, 1856, 1, 1280, 5888, 5888, 1280, 1280] + - [16, 10834.0] + - - [2368, 3584, 1, 1280, 2368, 2368, 1280, 1280] + - [7, 11239.0] + - - [2368, 6784, 1, 1280, 2368, 2368, 1280, 1280] + - [18, 10536.0] + - - [2944, 3584, 1, 3328, 2944, 2944, 3328, 3328] + - [16, 9743.0] + - - [6784, 2944, 1, 256, 6784, 6784, 256, 256] + - [16, 11699.0] + - - [4288, 2368, 1, 3328, 4288, 4288, 3328, 3328] + - [58, 9749.0] + - - [1856, 2368, 1, 256, 1856, 1856, 256, 256] + - [35, 8769.0] + - - [3584, 6784, 1, 3328, 3584, 3584, 3328, 3328] + - [38, 11008.0] + - - [1024, 5888, 1, 3328, 1024, 1024, 3328, 3328] + - [52, 8860.0] + - - [6144, 24000, 1, 2560, 6144, 6144, 2560, 2560] + - [49, 11207.0] + - - [5056, 4288, 1, 1280, 5056, 5056, 1280, 1280] + - [60, 11430.0] + - - [6784, 1856, 1, 3328, 6784, 6784, 3328, 3328] + - [47, 9651.0] + - - [1408, 5056, 1, 1280, 1408, 1408, 1280, 1280] + - [38, 11698.0] + - - [2368, 2368, 1, 1280, 2368, 2368, 1280, 1280] + - [58, 10772.0] + - - [2944, 5888, 1, 128, 2944, 2944, 128, 128] + - [23, 11178.0] + - - [704, 5888, 1, 1280, 704, 704, 1280, 1280] + - [18, 10598.0] + - - [2368, 3584, 1, 128, 2368, 2368, 128, 128] + - [23, 10204.0] + - - [1856, 5056, 1, 128, 1856, 1856, 128, 128] + - [12, 10068.0] + - - [8192, 3200, 1, 2048, 8192, 8192, 2048, 2048] + - [48, 9040.0] + - - [1024, 5056, 1, 1280, 1024, 1024, 1280, 1280] + - [40, 11019.0] + - - [4288, 1024, 1, 256, 4288, 4288, 256, 256] + - [27, 10277.0] + - - [2944, 2368, 1, 128, 2944, 2944, 128, 128] + - [54, 10065.0] + - - [5888, 448, 1, 1280, 5888, 5888, 1280, 1280] + - [60, 8999.0] + - - [704, 5888, 1, 3328, 704, 704, 3328, 3328] + - [21, 8241.0] + - - [3584, 2944, 1, 256, 3584, 3584, 256, 256] + - [58, 10727.0] + - - [512, 24000, 1, 2048, 512, 512, 2048, 2048] + - [15, 8397.0] + - - [1408, 5056, 1, 3328, 1408, 1408, 3328, 3328] + - [16, 8712.0] + - - [1856, 1856, 1, 3328, 1856, 1856, 3328, 3328] + - [16, 10303.0] + - - [2560, 800, 1, 2560, 2560, 2560, 2560, 2560] + - [38, 9976.0] + - - [2368, 2368, 1, 256, 2368, 2368, 256, 256] + - [27, 10109.0] + - - [4288, 4288, 1, 1280, 4288, 4288, 1280, 1280] + - [29, 11076.0] + - - [5888, 1024, 1, 1280, 5888, 5888, 1280, 1280] + - [49, 11395.0] + - - [1408, 4288, 1, 256, 1408, 1408, 256, 256] + - [48, 9223.0] + - - [5888, 448, 1, 128, 5888, 5888, 128, 128] + - [53, 8089.0] + - - [512, 48000, 1, 2560, 512, 512, 2560, 2560] + - [29, 10852.0] + - - [704, 6784, 1, 3328, 704, 704, 3328, 3328] + - [58, 7571.0] + - - [2560, 6400, 1, 2560, 2560, 2560, 2560, 2560] + - [58, 9684.0] + - - [5056, 1024, 1, 1280, 5056, 5056, 1280, 1280] + - [18, 11317.0] + - - [448, 5888, 1, 3328, 448, 448, 3328, 3328] + - [18, 9467.0] + - - [1024, 2944, 1, 1280, 1024, 1024, 1280, 1280] + - [16, 10620.0] + - - [5056, 5888, 1, 1280, 5056, 5056, 1280, 1280] + - [18, 10254.0] + - - [4288, 5888, 1, 128, 4288, 4288, 128, 128] + - [54, 11224.0] + - - [1408, 3584, 1, 128, 1408, 1408, 128, 128] + - [34, 10322.0] + - - [448, 3584, 1, 128, 448, 448, 128, 128] + - [28, 7862.0] + - - [5888, 2944, 1, 1280, 5888, 5888, 1280, 1280] + - [18, 9296.0] + - - [2368, 5888, 1, 128, 2368, 2368, 128, 128] + - [55, 9700.0] + - - [3584, 5888, 1, 256, 3584, 3584, 256, 256] + - [16, 11600.0] + - - [2368, 1024, 1, 128, 2368, 2368, 128, 128] + - [35, 8699.0] + - - [2368, 704, 1, 128, 2368, 2368, 128, 128] + - [53, 8114.0] + - - [3584, 2944, 1, 1280, 3584, 3584, 1280, 1280] + - [5, 11417.0] + - - [3584, 2368, 1, 128, 3584, 3584, 128, 128] + - [53, 8948.0] + - - [5056, 704, 1, 128, 5056, 5056, 128, 128] + - [55, 8507.0] + - - [5056, 1408, 1, 3328, 5056, 5056, 3328, 3328] + - [16, 9063.0] + - - [6784, 1024, 1, 3328, 6784, 6784, 3328, 3328] + - [58, 8653.0] + - - [6784, 2944, 1, 3328, 6784, 6784, 3328, 3328] + - [58, 10461.0] + - - [2944, 5056, 1, 3328, 2944, 2944, 3328, 3328] + - [18, 10068.0] + - - [1856, 1856, 1, 256, 1856, 1856, 256, 256] + - [35, 8465.0] + - - [1024, 5888, 1, 128, 1024, 1024, 128, 128] + - [16, 9686.0] + - - [6784, 2368, 1, 1280, 6784, 6784, 1280, 1280] + - [60, 11111.0] + - - [4288, 5888, 1, 1280, 4288, 4288, 1280, 1280] + - [17, 10658.0] + - - [4288, 4288, 1, 256, 4288, 4288, 256, 256] + - [27, 11251.0] + - - [4288, 1856, 1, 1280, 4288, 4288, 1280, 1280] + - [16, 11198.0] + - - [1856, 2944, 1, 3328, 1856, 1856, 3328, 3328] + - [58, 10371.0] + - - [256, 6784, 1, 3328, 256, 256, 3328, 3328] + - [60, 10403.0] + - - [256, 5056, 1, 128, 256, 256, 128, 128] + - [0, 6886.0] + - - [5056, 1024, 1, 256, 5056, 5056, 256, 256] + - [38, 10821.0] + - - [5056, 1856, 1, 3328, 5056, 5056, 3328, 3328] + - [16, 9321.0] + - - [1856, 1408, 1, 256, 1856, 1856, 256, 256] + - [35, 7989.0] + - - [8448, 12000, 1, 2816, 8448, 8448, 2816, 2816] + - [7, 11770.0] + - - [4288, 1408, 1, 128, 4288, 4288, 128, 128] + - [35, 8532.0] + - - [1856, 5888, 1, 3328, 1856, 1856, 3328, 3328] + - [16, 9603.0] + - - [4288, 5056, 1, 256, 4288, 4288, 256, 256] + - [16, 11390.0] + - - [4096, 800, 1, 1024, 4096, 4096, 1024, 1024] + - [16, 9264.0] + - - [5056, 256, 1, 3328, 5056, 5056, 3328, 3328] + - [40, 10629.0] + - - [1024, 5888, 1, 1280, 1024, 1024, 1280, 1280] + - [5, 11177.0] + - - [6784, 2368, 1, 128, 6784, 6784, 128, 128] + - [34, 9330.0] + - - [1856, 1024, 1, 1280, 1856, 1856, 1280, 1280] + - [14, 10762.0] + - - [6784, 4288, 1, 1280, 6784, 6784, 1280, 1280] + - [60, 10444.0] + - - [1856, 1856, 1, 1280, 1856, 1856, 1280, 1280] + - [16, 9897.0] + - - [4096, 400, 1, 1024, 4096, 4096, 1024, 1024] + - [38, 8004.0] + - - [3072, 24000, 1, 1024, 3072, 3072, 1024, 1024] + - [58, 11801.0] + - - [5888, 1856, 1, 128, 5888, 5888, 128, 128] + - [55, 9208.0] + - - [5056, 3584, 1, 128, 5056, 5056, 128, 128] + - [45, 11126.0] + - - [5888, 5888, 1, 3328, 5888, 5888, 3328, 3328] + - [18, 11461.0] + - - [6784, 1024, 1, 256, 6784, 6784, 256, 256] + - [13, 9853.0] + - - [2944, 2368, 1, 256, 2944, 2944, 256, 256] + - [35, 9651.0] + - - [5056, 5888, 1, 3328, 5056, 5056, 3328, 3328] + - [16, 11070.0] + - - [1856, 1024, 1, 256, 1856, 1856, 256, 256] + - [17, 8964.0] + - - [512, 48000, 1, 1536, 512, 512, 1536, 1536] + - [60, 11235.0] + - - [3584, 448, 1, 1280, 3584, 3584, 1280, 1280] + - [37, 8809.0] + - - [8448, 5984, 1, 2816, 8448, 8448, 2816, 2816] + - [40, 11253.0] + - - [448, 5888, 1, 256, 448, 448, 256, 256] + - [13, 7814.0] + - - [1408, 6784, 1, 3328, 1408, 1408, 3328, 3328] + - [17, 10591.0] + - - [4288, 704, 1, 128, 4288, 4288, 128, 128] + - [46, 6692.0] + - - [5056, 2944, 1, 256, 5056, 5056, 256, 256] + - [38, 11387.0] + - - [6784, 5888, 1, 128, 6784, 6784, 128, 128] + - [16, 11721.0] + - - [2944, 704, 1, 128, 2944, 2944, 128, 128] + - [53, 8525.0] + - - [1408, 3584, 1, 3328, 1408, 1408, 3328, 3328] + - [60, 10993.0] + - - [2368, 6784, 1, 256, 2368, 2368, 256, 256] + - [38, 11154.0] + - - [5056, 1408, 1, 1280, 5056, 5056, 1280, 1280] + - [16, 11689.0] + - - [5056, 4288, 1, 128, 5056, 5056, 128, 128] + - [27, 10898.0] + - - [1408, 1856, 1, 128, 1408, 1408, 128, 128] + - [46, 8973.0] + - - [1408, 5888, 1, 3328, 1408, 1408, 3328, 3328] + - [40, 9249.0] + - - [6784, 6784, 1, 256, 6784, 6784, 256, 256] + - [38, 11970.0] + - - [4288, 2368, 1, 128, 4288, 4288, 128, 128] + - [34, 10149.0] + - - [2368, 2944, 1, 256, 2368, 2368, 256, 256] + - [16, 10971.0] + - - [3584, 1856, 1, 1280, 3584, 3584, 1280, 1280] + - [16, 11365.0] + - - [6784, 6784, 1, 128, 6784, 6784, 128, 128] + - [23, 11769.0] + - - [5888, 5056, 1, 256, 5888, 5888, 256, 256] + - [58, 11702.0] + - - [8448, 48000, 1, 2816, 8448, 8448, 2816, 2816] + - [60, 12210.0] + - - [3584, 448, 1, 256, 3584, 3584, 256, 256] + - [57, 6545.0] + - - [448, 4288, 1, 128, 448, 448, 128, 128] + - [34, 7456.0] + - - [256, 6784, 1, 256, 256, 256, 256, 256] + - [16, 8643.0] + - - [1408, 4288, 1, 128, 1408, 1408, 128, 128] + - [34, 10008.0] + - - [2944, 704, 1, 3328, 2944, 2944, 3328, 3328] + - [16, 10433.0] + - - [5056, 256, 1, 1280, 5056, 5056, 1280, 1280] + - [36, 10831.0] + - - [3584, 3584, 1, 256, 3584, 3584, 256, 256] + - [58, 11457.0] + - - [3584, 5056, 1, 256, 3584, 3584, 256, 256] + - [38, 11504.0] + - - [2944, 2368, 1, 1280, 2944, 2944, 1280, 1280] + - [5, 11303.0] + - - [1408, 3584, 1, 256, 1408, 1408, 256, 256] + - [49, 10634.0] + - - [6784, 3584, 1, 256, 6784, 6784, 256, 256] + - [27, 11832.0] + - - [5056, 2368, 1, 128, 5056, 5056, 128, 128] + - [54, 10445.0] + - - [2944, 2944, 1, 3328, 2944, 2944, 3328, 3328] + - [25, 9896.0] + - - [5056, 6784, 1, 256, 5056, 5056, 256, 256] + - [38, 11715.0] + - - [1856, 3584, 1, 128, 1856, 1856, 128, 128] + - [35, 8552.0] + - - [6784, 448, 1, 256, 6784, 6784, 256, 256] + - [38, 8902.0] + - - [3584, 6784, 1, 128, 3584, 3584, 128, 128] + - [49, 11226.0] + - - [5056, 1856, 1, 256, 5056, 5056, 256, 256] + - [16, 10670.0] + - - [4608, 5984, 1, 1536, 4608, 4608, 1536, 1536] + - [38, 10169.0] + - - [1760, 3200, 1, 1760, 1760, 1760, 1760, 1760] + - [23, 11559.0] + - - [1024, 1856, 1, 256, 1024, 1024, 256, 256] + - [39, 8954.0] + - - [4096, 1600, 1, 1024, 4096, 4096, 1024, 1024] + - [16, 11118.0] + - - [1408, 6784, 1, 1280, 1408, 1408, 1280, 1280] + - [51, 11493.0] + - - [3584, 3584, 1, 1280, 3584, 3584, 1280, 1280] + - [14, 10889.0] + - - [7680, 24000, 1, 2560, 7680, 7680, 2560, 2560] + - [58, 12074.0] + - - [4608, 48000, 1, 1536, 4608, 4608, 1536, 1536] + - [40, 11958.0] + - - [5888, 5888, 1, 128, 5888, 5888, 128, 128] + - [16, 11644.0] + - - [5056, 2368, 1, 3328, 5056, 5056, 3328, 3328] + - [58, 11329.0] + - - [2944, 4288, 1, 256, 2944, 2944, 256, 256] + - [49, 11133.0] + - - [1408, 3584, 1, 1280, 1408, 1408, 1280, 1280] + - [7, 11309.0] + - - [8192, 1600, 1, 2048, 8192, 8192, 2048, 2048] + - [18, 9251.0] + - - [512, 24000, 1, 2560, 512, 512, 2560, 2560] + - [48, 8716.0] + - - [2368, 6784, 1, 3328, 2368, 2368, 3328, 3328] + - [27, 10324.0] + - - [1856, 1408, 1, 1280, 1856, 1856, 1280, 1280] + - [38, 10179.0] + - - [6784, 704, 1, 128, 6784, 6784, 128, 128] + - [46, 8974.0] + - - [1408, 5888, 1, 256, 1408, 1408, 256, 256] + - [38, 11065.0] + - - [704, 2944, 1, 1280, 704, 704, 1280, 1280] + - [58, 10072.0] + - - [704, 6784, 1, 128, 704, 704, 128, 128] + - [13, 9041.0] + - - [3584, 704, 1, 1280, 3584, 3584, 1280, 1280] + - [5, 9660.0] + - - [5888, 2368, 1, 256, 5888, 5888, 256, 256] + - [27, 11295.0] + - - [2944, 6784, 1, 128, 2944, 2944, 128, 128] + - [58, 11300.0] + - - [3584, 448, 1, 3328, 3584, 3584, 3328, 3328] + - [15, 9267.0] + - - [704, 2368, 1, 3328, 704, 704, 3328, 3328] + - [18, 9970.0] + - - [256, 5888, 1, 128, 256, 256, 128, 128] + - [28, 7467.0] + - - [2048, 3200, 1, 512, 2048, 2048, 512, 512] + - [16, 11050.0] + - - [2944, 2944, 1, 1280, 2944, 2944, 1280, 1280] + - [16, 11508.0] + - - [5056, 448, 1, 3328, 5056, 5056, 3328, 3328] + - [51, 9161.0] + - - [6784, 704, 1, 3328, 6784, 6784, 3328, 3328] + - [27, 10698.0] + - - [5888, 4288, 1, 128, 5888, 5888, 128, 128] + - [49, 11143.0] + - - [1408, 2944, 1, 3328, 1408, 1408, 3328, 3328] + - [58, 11151.0] + - - [3584, 704, 1, 128, 3584, 3584, 128, 128] + - [53, 8256.0] + - - [4608, 12000, 1, 1536, 4608, 4608, 1536, 1536] + - [40, 11306.0] + - - [5056, 5056, 1, 128, 5056, 5056, 128, 128] + - [58, 11020.0] + - - [8192, 800, 1, 2048, 8192, 8192, 2048, 2048] + - [50, 8830.0] + - - [448, 5056, 1, 128, 448, 448, 128, 128] + - [55, 6437.0] + - - [5056, 3584, 1, 256, 5056, 5056, 256, 256] + - [49, 11474.0] + - - [1408, 5056, 1, 128, 1408, 1408, 128, 128] + - [58, 10131.0] + - - [2944, 3584, 1, 128, 2944, 2944, 128, 128] + - [54, 11136.0] + - - [3584, 2368, 1, 256, 3584, 3584, 256, 256] + - [38, 11078.0] + - - [8448, 24000, 1, 2816, 8448, 8448, 2816, 2816] + - [18, 12083.0] + - - [3584, 3584, 1, 3328, 3584, 3584, 3328, 3328] + - [16, 10109.0] + - - [5888, 6784, 1, 256, 5888, 5888, 256, 256] + - [58, 11954.0] + - - [4288, 2944, 1, 3328, 4288, 4288, 3328, 3328] + - [38, 10227.0] + - - [256, 5056, 1, 1280, 256, 256, 1280, 1280] + - [14, 8892.0] + - - [2944, 5888, 1, 3328, 2944, 2944, 3328, 3328] + - [16, 10737.0] + - - [6784, 5888, 1, 1280, 6784, 6784, 1280, 1280] + - [16, 10810.0] + - - [2048, 800, 1, 512, 2048, 2048, 512, 512] + - [13, 7814.0] + - - [5888, 4288, 1, 1280, 5888, 5888, 1280, 1280] + - [18, 10063.0] + - - [1024, 24000, 1, 2048, 1024, 1024, 2048, 2048] + - [38, 9336.0] + - - [5888, 3584, 1, 128, 5888, 5888, 128, 128] + - [58, 11147.0] + - - [1024, 2944, 1, 128, 1024, 1024, 128, 128] + - [11, 8830.0] + - - [704, 3584, 1, 128, 704, 704, 128, 128] + - [12, 8715.0] + - - [5888, 448, 1, 3328, 5888, 5888, 3328, 3328] + - [25, 8444.0] + - - [2368, 4288, 1, 1280, 2368, 2368, 1280, 1280] + - [60, 11149.0] + - - [4288, 2944, 1, 128, 4288, 4288, 128, 128] + - [13, 9566.0] + - - [1024, 6784, 1, 3328, 1024, 1024, 3328, 3328] + - [16, 11522.0] + - - [5056, 2944, 1, 3328, 5056, 5056, 3328, 3328] + - [59, 10401.0] + - - [2944, 3584, 1, 256, 2944, 2944, 256, 256] + - [27, 10381.0] + - - [1408, 1408, 1, 3328, 1408, 1408, 3328, 3328] + - [16, 10119.0] + - - [3584, 3584, 1, 128, 3584, 3584, 128, 128] + - [49, 10382.0] + - - [3584, 704, 1, 256, 3584, 3584, 256, 256] + - [57, 8686.0] + - - [3584, 1408, 1, 3328, 3584, 3584, 3328, 3328] + - [49, 10341.0] + - - [704, 3584, 1, 1280, 704, 704, 1280, 1280] + - [38, 9824.0] + - - [2944, 6784, 1, 1280, 2944, 2944, 1280, 1280] + - [40, 11742.0] + - - [1856, 6784, 1, 256, 1856, 1856, 256, 256] + - [5, 10215.0] + - - [4288, 448, 1, 3328, 4288, 4288, 3328, 3328] + - [16, 9566.0] + - - [6784, 4288, 1, 128, 6784, 6784, 128, 128] + - [49, 11152.0] + - - [6784, 704, 1, 1280, 6784, 6784, 1280, 1280] + - [5, 10686.0] + - - [3584, 6784, 1, 256, 3584, 3584, 256, 256] + - [38, 11733.0] + - - [6144, 12000, 1, 2048, 6144, 6144, 2048, 2048] + - [38, 11767.0] + - - [5888, 1024, 1, 3328, 5888, 5888, 3328, 3328] + - [27, 8319.0] + - - [704, 6784, 1, 1280, 704, 704, 1280, 1280] + - [5, 10313.0] + - - [1856, 5056, 1, 3328, 1856, 1856, 3328, 3328] + - [58, 9414.0] + - - [1024, 3584, 1, 128, 1024, 1024, 128, 128] + - [11, 7981.0] + - - [1024, 1408, 1, 128, 1024, 1024, 128, 128] + - [28, 7364.0] + - - [2368, 2944, 1, 128, 2368, 2368, 128, 128] + - [46, 9981.0] + - - [5056, 2944, 1, 128, 5056, 5056, 128, 128] + - [54, 11151.0] + - - [5888, 5056, 1, 3328, 5888, 5888, 3328, 3328] + - [49, 11651.0] + - - [5888, 2368, 1, 128, 5888, 5888, 128, 128] + - [13, 9199.0] + - - [3584, 6784, 1, 1280, 3584, 3584, 1280, 1280] + - [6, 10314.0] + - - [1856, 5888, 1, 256, 1856, 1856, 256, 256] + - [27, 9945.0] + - - [4288, 4288, 1, 3328, 4288, 4288, 3328, 3328] + - [58, 10722.0] + - - [4288, 1408, 1, 1280, 4288, 4288, 1280, 1280] + - [16, 11139.0] + - - [3584, 5056, 1, 128, 3584, 3584, 128, 128] + - [58, 10798.0] + - - [4288, 2368, 1, 256, 4288, 4288, 256, 256] + - [58, 10875.0] + - - [2944, 5056, 1, 1280, 2944, 2944, 1280, 1280] + - [20, 9626.0] + - - [448, 6784, 1, 256, 448, 448, 256, 256] + - [55, 8195.0] + - - [1856, 2368, 1, 128, 1856, 1856, 128, 128] + - [54, 8981.0] + - - [6784, 2368, 1, 3328, 6784, 6784, 3328, 3328] + - [58, 10037.0] + - - [4288, 1856, 1, 3328, 4288, 4288, 3328, 3328] + - [27, 11107.0] + - - [3584, 448, 1, 128, 3584, 3584, 128, 128] + - [53, 6172.0] + - - [2048, 1600, 1, 2048, 2048, 2048, 2048, 2048] + - [16, 10665.0] + - - [3584, 1024, 1, 1280, 3584, 3584, 1280, 1280] + - [60, 10846.0] + - - [1856, 5056, 1, 256, 1856, 1856, 256, 256] + - [38, 10605.0] + - - [1024, 4288, 1, 256, 1024, 1024, 256, 256] + - [39, 9576.0] + - - [5888, 3584, 1, 3328, 5888, 5888, 3328, 3328] + - [58, 10745.0] + - - [5056, 3584, 1, 3328, 5056, 5056, 3328, 3328] + - [18, 11972.0] + - - [2368, 1408, 1, 1280, 2368, 2368, 1280, 1280] + - [38, 10271.0] + - - [5056, 2944, 1, 1280, 5056, 5056, 1280, 1280] + - [13, 8991.0] + - - [1024, 6784, 1, 256, 1024, 1024, 256, 256] + - [38, 9614.0] + - - [5124, 9124, 1, 2048, 5124, 5124, 2048, 2048] + - [58, 10652.0] + - - [2944, 1408, 1, 128, 2944, 2944, 128, 128] + - [55, 8727.0] + - - [3584, 1408, 1, 1280, 3584, 3584, 1280, 1280] + - [16, 11330.0] + - - [5056, 6784, 1, 3328, 5056, 5056, 3328, 3328] + - [16, 11280.0] + - - [3584, 4288, 1, 256, 3584, 3584, 256, 256] + - [38, 11303.0] + - - [1856, 6784, 1, 3328, 1856, 1856, 3328, 3328] + - [56, 10376.0] + - - [5888, 4288, 1, 256, 5888, 5888, 256, 256] + - [16, 11534.0] + - - [5056, 1408, 1, 256, 5056, 5056, 256, 256] + - [38, 10827.0] + - - [3584, 1024, 1, 256, 3584, 3584, 256, 256] + - [16, 10432.0] + - - [5888, 5888, 1, 256, 5888, 5888, 256, 256] + - [60, 11900.0] + - - [4288, 1024, 1, 1280, 4288, 4288, 1280, 1280] + - [18, 10821.0] + - - [448, 6784, 1, 3328, 448, 448, 3328, 3328] + - [48, 8219.0] + - - [2944, 1408, 1, 1280, 2944, 2944, 1280, 1280] + - [58, 11361.0] + - - [2944, 1856, 1, 3328, 2944, 2944, 3328, 3328] + - [16, 10590.0] + - - [3584, 5888, 1, 1280, 3584, 3584, 1280, 1280] + - [60, 11772.0] + - - [6784, 1856, 1, 1280, 6784, 6784, 1280, 1280] + - [28, 10130.0] + - - [2944, 5056, 1, 256, 2944, 2944, 256, 256] + - [58, 11355.0] + - - [5888, 256, 1, 3328, 5888, 5888, 3328, 3328] + - [49, 8434.0] + - - [2944, 4288, 1, 128, 2944, 2944, 128, 128] + - [55, 9463.0] + - - [3584, 1408, 1, 256, 3584, 3584, 256, 256] + - [38, 10539.0] + - - [704, 3584, 1, 3328, 704, 704, 3328, 3328] + - [16, 9993.0] + - - [4096, 3200, 1, 1024, 4096, 4096, 1024, 1024] + - [16, 11736.0] + - - [5056, 448, 1, 1280, 5056, 5056, 1280, 1280] + - [60, 9649.0] + - - [3584, 1856, 1, 3328, 3584, 3584, 3328, 3328] + - [27, 9595.0] + - - [4288, 6784, 1, 1280, 4288, 4288, 1280, 1280] + - [29, 11258.0] + - - [2560, 7000, 1, 2560, 2560, 2560, 2560, 2560] + - [50, 10020.0] + - - [2944, 1024, 1, 256, 2944, 2944, 256, 256] + - [35, 8754.0] + - - [2368, 4288, 1, 3328, 2368, 2368, 3328, 3328] + - [16, 9766.0] + - - [1024, 1408, 1, 1280, 1024, 1024, 1280, 1280] + - [16, 9003.0] + - - [6784, 5056, 1, 256, 6784, 6784, 256, 256] + - [16, 11759.0] + - - [1856, 1856, 1, 128, 1856, 1856, 128, 128] + - [33, 8940.0] + - - [3584, 5056, 1, 3328, 3584, 3584, 3328, 3328] + - [38, 10640.0] + - - [448, 6784, 1, 128, 448, 448, 128, 128] + - [22, 6691.0] + - - [2944, 6784, 1, 256, 2944, 2944, 256, 256] + - [16, 11703.0] + - - [2944, 2944, 1, 128, 2944, 2944, 128, 128] + - [58, 10637.0] + - - [1856, 3584, 1, 1280, 1856, 1856, 1280, 1280] + - [16, 11510.0] + - - [4288, 448, 1, 128, 4288, 4288, 128, 128] + - [53, 8009.0] + - - [4608, 24000, 1, 1536, 4608, 4608, 1536, 1536] + - [49, 11866.0] + - - [1856, 1408, 1, 3328, 1856, 1856, 3328, 3328] + - [38, 10221.0] + - - [1024, 4288, 1, 3328, 1024, 1024, 3328, 3328] + - [29, 10329.0] + - - [5056, 448, 1, 256, 5056, 5056, 256, 256] + - [58, 8392.0] + - - [2944, 2368, 1, 3328, 2944, 2944, 3328, 3328] + - [27, 9576.0] + - - [704, 4288, 1, 3328, 704, 704, 3328, 3328] + - [27, 9536.0] + - - [1024, 1856, 1, 1280, 1024, 1024, 1280, 1280] + - [16, 10408.0] + - - [2048, 6400, 1, 2048, 2048, 2048, 2048, 2048] + - [62, 7303.0] + - - [512, 48000, 1, 2816, 512, 512, 2816, 2816] + - [58, 10107.0] + - - [5124, 9124, 1, 2560, 5124, 5124, 2560, 2560] + - [60, 10791.0] + - - [1024, 5888, 1, 256, 1024, 1024, 256, 256] + - [13, 9480.0] + - - [1408, 2368, 1, 256, 1408, 1408, 256, 256] + - [15, 8832.0] + - - [1408, 1408, 1, 256, 1408, 1408, 256, 256] + - [35, 9037.0] + - - [2368, 2368, 1, 128, 2368, 2368, 128, 128] + - [54, 9580.0] + - - [6784, 1408, 1, 128, 6784, 6784, 128, 128] + - [49, 10701.0] + - - [4288, 5888, 1, 256, 4288, 4288, 256, 256] + - [58, 11512.0] + - - [1408, 5056, 1, 256, 1408, 1408, 256, 256] + - [38, 10980.0] + - - [4288, 3584, 1, 128, 4288, 4288, 128, 128] + - [54, 11097.0] + - - [3584, 5056, 1, 1280, 3584, 3584, 1280, 1280] + - [40, 9185.0] + - - [1856, 1024, 1, 128, 1856, 1856, 128, 128] + - [53, 6016.0] + - - [1024, 24000, 1, 1536, 1024, 1024, 1536, 1536] + - [16, 9128.0] + - - [704, 4288, 1, 256, 704, 704, 256, 256] + - [55, 8387.0] + - - [5888, 2368, 1, 1280, 5888, 5888, 1280, 1280] + - [27, 9012.0] + - - [2368, 5888, 1, 1280, 2368, 2368, 1280, 1280] + - [36, 10761.0] + - - [5888, 256, 1, 1280, 5888, 5888, 1280, 1280] + - [16, 10146.0] + - - [2368, 1856, 1, 3328, 2368, 2368, 3328, 3328] + - [27, 10603.0] + - - [2944, 704, 1, 256, 2944, 2944, 256, 256] + - [55, 8718.0] + - - [2368, 1024, 1, 3328, 2368, 2368, 3328, 3328] + - [7, 11039.0] + - - [704, 3584, 1, 256, 704, 704, 256, 256] + - [34, 8786.0] + - - [704, 2944, 1, 3328, 704, 704, 3328, 3328] + - [5, 10204.0] + - - [6784, 1024, 1, 128, 6784, 6784, 128, 128] + - [58, 10256.0] + - - [2944, 1024, 1, 3328, 2944, 2944, 3328, 3328] + - [7, 11060.0] + - - [2944, 5056, 1, 128, 2944, 2944, 128, 128] + - [54, 10745.0] + - - [1408, 6784, 1, 256, 1408, 1408, 256, 256] + - [38, 11364.0] + - - [6784, 1408, 1, 3328, 6784, 6784, 3328, 3328] + - [56, 9496.0] + - - [4288, 6784, 1, 128, 4288, 4288, 128, 128] + - [45, 11438.0] + - - [6784, 2944, 1, 1280, 6784, 6784, 1280, 1280] + - [16, 9617.0] + - - [4288, 1856, 1, 128, 4288, 4288, 128, 128] + - [13, 8837.0] + - - [1856, 2944, 1, 128, 1856, 1856, 128, 128] + - [2, 9485.0] + - - [6784, 448, 1, 128, 6784, 6784, 128, 128] + - [48, 8370.0] + - - [448, 5056, 1, 1280, 448, 448, 1280, 1280] + - [7, 9916.0] + - - [2368, 1856, 1, 128, 2368, 2368, 128, 128] + - [34, 9050.0] + - - [4288, 704, 1, 256, 4288, 4288, 256, 256] + - [16, 8895.0] + - - [5888, 704, 1, 256, 5888, 5888, 256, 256] + - [16, 9541.0] + - - [3584, 1024, 1, 128, 3584, 3584, 128, 128] + - [35, 9100.0] + - - [256, 5888, 1, 3328, 256, 256, 3328, 3328] + - [20, 8043.0] + - - [1408, 4288, 1, 3328, 1408, 1408, 3328, 3328] + - [62, 9756.0] + - - [6784, 4288, 1, 256, 6784, 6784, 256, 256] + - [16, 11694.0] + - - [5888, 256, 1, 256, 5888, 5888, 256, 256] + - [55, 7626.0] + - - [6784, 1024, 1, 1280, 6784, 6784, 1280, 1280] + - [18, 11749.0] + - - [5888, 1024, 1, 128, 5888, 5888, 128, 128] + - [58, 10041.0] + - - [6784, 3584, 1, 1280, 6784, 6784, 1280, 1280] + - [60, 10452.0] + - - [1024, 6784, 1, 1280, 1024, 1024, 1280, 1280] + - [27, 11398.0] + - - [1408, 2944, 1, 1280, 1408, 1408, 1280, 1280] + - [58, 11307.0] + - - [2048, 800, 1, 2048, 2048, 2048, 2048, 2048] + - [16, 8891.0] + - - [1408, 2368, 1, 3328, 1408, 1408, 3328, 3328] + - [16, 10849.0] + - - [2944, 1856, 1, 128, 2944, 2944, 128, 128] + - [54, 9557.0] + - - [256, 6784, 1, 128, 256, 256, 128, 128] + - [59, 8221.0] + - - [5056, 6784, 1, 128, 5056, 5056, 128, 128] + - [23, 11451.0] + - - [4288, 5056, 1, 128, 4288, 4288, 128, 128] + - [54, 11122.0] + - - [1856, 5888, 1, 128, 1856, 1856, 128, 128] + - [23, 10925.0] + - - [2944, 5888, 1, 256, 2944, 2944, 256, 256] + - [27, 11804.0] + - - [3584, 1856, 1, 256, 3584, 3584, 256, 256] + - [58, 10672.0] + - - [4288, 3584, 1, 1280, 4288, 4288, 1280, 1280] + - [40, 9380.0] + - - [704, 5888, 1, 128, 704, 704, 128, 128] + - [53, 7979.0] + - - [6784, 3584, 1, 128, 6784, 6784, 128, 128] + - [23, 11482.0] + - - [4288, 5056, 1, 3328, 4288, 4288, 3328, 3328] + - [18, 10775.0] + - - [1408, 1408, 1, 128, 1408, 1408, 128, 128] + - [53, 6807.0] + - - [5056, 2368, 1, 256, 5056, 5056, 256, 256] + - [38, 10903.0] + - - [4288, 704, 1, 3328, 4288, 4288, 3328, 3328] + - [38, 9695.0] + - - [448, 3584, 1, 256, 448, 448, 256, 256] + - [58, 7657.0] + - - [2368, 1024, 1, 1280, 2368, 2368, 1280, 1280] + - [60, 10643.0] + - - [2944, 1408, 1, 3328, 2944, 2944, 3328, 3328] + - [58, 11228.0] + - - [1024, 1408, 1, 3328, 1024, 1024, 3328, 3328] + - [58, 9941.0] + - - [2944, 5888, 1, 1280, 2944, 2944, 1280, 1280] + - [51, 11934.0] + - - [5888, 3584, 1, 256, 5888, 5888, 256, 256] + - [38, 11619.0] + - - [2368, 5056, 1, 128, 2368, 2368, 128, 128] + - [54, 10328.0] + - - [1408, 1856, 1, 3328, 1408, 1408, 3328, 3328] + - [38, 10306.0] + - - [6784, 1408, 1, 1280, 6784, 6784, 1280, 1280] + - [38, 11523.0] + - - [4096, 7000, 1, 4096, 4096, 4096, 4096, 4096] + - [16, 10300.0] + - - [704, 2944, 1, 256, 704, 704, 256, 256] + - [35, 8020.0] + - - [6784, 5888, 1, 3328, 6784, 6784, 3328, 3328] + - [58, 11323.0] + - - [2368, 4288, 1, 128, 2368, 2368, 128, 128] + - [45, 10286.0] + - - [1024, 6784, 1, 128, 1024, 1024, 128, 128] + - [5, 10139.0] + - - [1408, 1408, 1, 1280, 1408, 1408, 1280, 1280] + - [38, 9950.0] + - - [16384, 400, 1, 4096, 16384, 16384, 4096, 4096] + - [60, 5797.0] + - - [448, 4288, 1, 3328, 448, 448, 3328, 3328] + - [58, 9451.0] + - - [2368, 1408, 1, 256, 2368, 2368, 256, 256] + - [49, 9482.0] + - - [5888, 5056, 1, 128, 5888, 5888, 128, 128] + - [58, 11156.0] + - - [704, 2368, 1, 256, 704, 704, 256, 256] + - [13, 8322.0] + - - [1024, 24000, 1, 2560, 1024, 1024, 2560, 2560] + - [7, 11367.0] + - - [5888, 2368, 1, 3328, 5888, 5888, 3328, 3328] + - [17, 10271.0] + - - [5124, 9124, 1, 1760, 5124, 5124, 1760, 1760] + - [23, 11450.0] + - - [4288, 448, 1, 1280, 4288, 4288, 1280, 1280] + - [49, 8925.0] + - - [5888, 704, 1, 3328, 5888, 5888, 3328, 3328] + - [18, 10967.0] + - - [5056, 256, 1, 128, 5056, 5056, 128, 128] + - [55, 5896.0] + - - [1408, 5888, 1, 128, 1408, 1408, 128, 128] + - [34, 10574.0] + - - [7680, 12000, 1, 2560, 7680, 7680, 2560, 2560] + - [38, 11665.0] + - - [1408, 1024, 1, 256, 1408, 1408, 256, 256] + - [28, 6708.0] + - - [8192, 400, 1, 2048, 8192, 8192, 2048, 2048] + - [27, 8136.0] + - - [1024, 1856, 1, 128, 1024, 1024, 128, 128] + - [35, 7733.0] + - - [5056, 6784, 1, 1280, 5056, 5056, 1280, 1280] + - [27, 10243.0] + - - [704, 5056, 1, 3328, 704, 704, 3328, 3328] + - [16, 8908.0] + - - [2368, 2944, 1, 3328, 2368, 2368, 3328, 3328] + - [27, 9763.0] + - - [2368, 3584, 1, 256, 2368, 2368, 256, 256] + - [49, 9936.0] + - - [5056, 3584, 1, 1280, 5056, 5056, 1280, 1280] + - [49, 11545.0] + - - [5124, 9124, 1, 4096, 5124, 5124, 4096, 4096] + - [16, 10633.0] + - - [7680, 48000, 1, 2560, 7680, 7680, 2560, 2560] + - [58, 11947.0] + - - [1856, 2944, 1, 1280, 1856, 1856, 1280, 1280] + - [16, 10726.0] + - - [1024, 48000, 1, 2816, 1024, 1024, 2816, 2816] + - [18, 11470.0] + - - [2944, 1408, 1, 256, 2944, 2944, 256, 256] + - [35, 9366.0] + - - [4288, 1408, 1, 3328, 4288, 4288, 3328, 3328] + - [16, 9498.0] + - - [5888, 2944, 1, 128, 5888, 5888, 128, 128] + - [54, 9860.0] + - - [2944, 1024, 1, 128, 2944, 2944, 128, 128] + - [45, 9024.0] + - - [4288, 5056, 1, 1280, 4288, 4288, 1280, 1280] + - [51, 10805.0] + - - [5888, 6784, 1, 1280, 5888, 5888, 1280, 1280] + - [18, 10676.0] + - - [6784, 5056, 1, 128, 6784, 6784, 128, 128] + - [58, 11448.0] + - - [1760, 1600, 1, 1760, 1760, 1760, 1760, 1760] + - [1, 10509.0] + - - [5888, 1408, 1, 3328, 5888, 5888, 3328, 3328] + - [40, 10720.0] + - - [2368, 1856, 1, 256, 2368, 2368, 256, 256] + - [13, 8848.0] + - - [256, 5056, 1, 256, 256, 256, 256, 256] + - [55, 8406.0] + - - [448, 3584, 1, 3328, 448, 448, 3328, 3328] + - [18, 9714.0] + - - [704, 2368, 1, 128, 704, 704, 128, 128] + - [46, 7389.0] + - - [5888, 256, 1, 128, 5888, 5888, 128, 128] + - [50, 7761.0] + - - [3584, 1856, 1, 128, 3584, 3584, 128, 128] + - [38, 9580.0] + - - [4288, 4288, 1, 128, 4288, 4288, 128, 128] + - [34, 9915.0] + - - [1856, 1024, 1, 3328, 1856, 1856, 3328, 3328] + - [18, 11081.0] + - - [1024, 5056, 1, 256, 1024, 1024, 256, 256] + - [38, 10363.0] + - - [5888, 5888, 1, 1280, 5888, 5888, 1280, 1280] + - [18, 11123.0] + - - [5056, 5888, 1, 128, 5056, 5056, 128, 128] + - [49, 11080.0] + - - [2368, 1408, 1, 3328, 2368, 2368, 3328, 3328] + - [38, 10753.0] + - - [1024, 48000, 1, 1536, 1024, 1024, 1536, 1536] + - [16, 10543.0] + - - [5888, 448, 1, 256, 5888, 5888, 256, 256] + - [55, 7350.0] + - - [2560, 3200, 1, 2560, 2560, 2560, 2560, 2560] + - [60, 10161.0] + - - [5888, 6784, 1, 128, 5888, 5888, 128, 128] + - [38, 11736.0] + - - [6144, 48000, 1, 2048, 6144, 6144, 2048, 2048] + - [18, 11807.0] + - - [6784, 5056, 1, 1280, 6784, 6784, 1280, 1280] + - [60, 10646.0] + - - [5056, 704, 1, 1280, 5056, 5056, 1280, 1280] + - [18, 10147.0] + - - [1024, 48000, 1, 2560, 1024, 1024, 2560, 2560] + - [16, 11189.0] + - - [1024, 2368, 1, 128, 1024, 1024, 128, 128] + - [53, 7099.0] + - - [16384, 800, 1, 4096, 16384, 16384, 4096, 4096] + - [59, 8976.0] + - - [5888, 5056, 1, 1280, 5888, 5888, 1280, 1280] + - [18, 10439.0] + - - [3072, 48000, 1, 1024, 3072, 3072, 1024, 1024] + - [16, 11354.0] + - - [6784, 1408, 1, 256, 6784, 6784, 256, 256] + - [38, 10337.0] + - - [3584, 5888, 1, 128, 3584, 3584, 128, 128] + - [38, 6229.0] + - - [5056, 5888, 1, 256, 5056, 5056, 256, 256] + - [38, 11625.0] + - - [2368, 1024, 1, 256, 2368, 2368, 256, 256] + - [12, 9360.0] + - - [2944, 1856, 1, 256, 2944, 2944, 256, 256] + - [16, 10517.0] + - - [1856, 6784, 1, 1280, 1856, 1856, 1280, 1280] + - [16, 8443.0] + - - [4288, 3584, 1, 256, 4288, 4288, 256, 256] + - [27, 11345.0] + - - [6784, 448, 1, 3328, 6784, 6784, 3328, 3328] + - [54, 7800.0] + - - [5056, 1856, 1, 1280, 5056, 5056, 1280, 1280] + - [16, 11143.0] + - - [1408, 1024, 1, 3328, 1408, 1408, 3328, 3328] + - [49, 9907.0] + - - [5888, 3584, 1, 1280, 5888, 5888, 1280, 1280] + - [60, 10114.0] + - - [1856, 3584, 1, 3328, 1856, 1856, 3328, 3328] + - [60, 9253.0] + - - [1024, 2944, 1, 256, 1024, 1024, 256, 256] + - [13, 8409.0] + - - [448, 6784, 1, 1280, 448, 448, 1280, 1280] + - [38, 9804.0] + - - [704, 5056, 1, 256, 704, 704, 256, 256] + - [34, 9346.0] + - - [3584, 1024, 1, 3328, 3584, 3584, 3328, 3328] + - [27, 11006.0] + - - [2944, 1856, 1, 1280, 2944, 2944, 1280, 1280] + - [38, 10704.0] + - - [5056, 256, 1, 256, 5056, 5056, 256, 256] + - [35, 8761.0] + - - [2944, 4288, 1, 3328, 2944, 2944, 3328, 3328] + - [60, 10150.0] + - - [2368, 3584, 1, 3328, 2368, 2368, 3328, 3328] + - [50, 10133.0] + - - [2944, 704, 1, 1280, 2944, 2944, 1280, 1280] + - [58, 9511.0] + - - [2944, 3584, 1, 1280, 2944, 2944, 1280, 1280] + - [40, 11345.0] + - - [1856, 5888, 1, 1280, 1856, 1856, 1280, 1280] + - [60, 10921.0] + - - [2048, 3200, 1, 2048, 2048, 2048, 2048, 2048] + - [16, 11011.0] + - - [4288, 1408, 1, 256, 4288, 4288, 256, 256] + - [13, 9644.0] + - - [5888, 1408, 1, 128, 5888, 5888, 128, 128] + - [49, 10453.0] + - - [4288, 2368, 1, 1280, 4288, 4288, 1280, 1280] + - [58, 11138.0] + - - [6784, 2368, 1, 256, 6784, 6784, 256, 256] + - [16, 11201.0] + - - [1024, 24000, 1, 2816, 1024, 1024, 2816, 2816] + - [60, 12052.0] + - - [7680, 5984, 1, 2560, 7680, 7680, 2560, 2560] + - [38, 11130.0] + - - [4288, 1856, 1, 256, 4288, 4288, 256, 256] + - [13, 9594.0] + - - [1856, 2944, 1, 256, 1856, 1856, 256, 256] + - [49, 10132.0] + - - [5056, 1024, 1, 128, 5056, 5056, 128, 128] + - [35, 9841.0] + - - [1760, 800, 1, 1760, 1760, 1760, 1760, 1760] + - [1, 9949.0] + - - [6784, 256, 1, 128, 6784, 6784, 128, 128] + - [28, 7861.0] + - - [5888, 704, 1, 128, 5888, 5888, 128, 128] + - [54, 9132.0] + - - [1408, 2368, 1, 128, 1408, 1408, 128, 128] + - [23, 9123.0] + - - [1024, 4288, 1, 1280, 1024, 1024, 1280, 1280] + - [18, 11129.0] + - - [2368, 5056, 1, 3328, 2368, 2368, 3328, 3328] + - [18, 9785.0] + - - [4288, 1024, 1, 3328, 4288, 4288, 3328, 3328] + - [40, 10238.0] + - - [6144, 48000, 1, 2560, 6144, 6144, 2560, 2560] + - [29, 12256.0] + - - [1024, 5056, 1, 3328, 1024, 1024, 3328, 3328] + - [16, 8432.0] + - - [1024, 1856, 1, 3328, 1024, 1024, 3328, 3328] + - [58, 10780.0] + - - [4288, 6784, 1, 256, 4288, 4288, 256, 256] + - [16, 11651.0] + - - [3584, 2944, 1, 3328, 3584, 3584, 3328, 3328] + - [51, 11080.0] + - - [5888, 2944, 1, 256, 5888, 5888, 256, 256] + - [38, 11609.0] + - - [448, 4288, 1, 1280, 448, 448, 1280, 1280] + - [5, 9464.0] + - - [1024, 4288, 1, 128, 1024, 1024, 128, 128] + - [17, 9024.0] + - - [5056, 4288, 1, 256, 5056, 5056, 256, 256] + - [16, 11418.0] + - - [1024, 3584, 1, 256, 1024, 1024, 256, 256] + - [16, 10179.0] + - - [6784, 6784, 1, 3328, 6784, 6784, 3328, 3328] + - [58, 11493.0] + - - [448, 5888, 1, 1280, 448, 448, 1280, 1280] + - [7, 9065.0] + - - [5056, 448, 1, 128, 5056, 5056, 128, 128] + - [53, 8036.0] + - - [4288, 704, 1, 1280, 4288, 4288, 1280, 1280] + - [58, 9790.0] + - - [3584, 2944, 1, 128, 3584, 3584, 128, 128] + - [16, 10387.0] + - - [6784, 256, 1, 1280, 6784, 6784, 1280, 1280] + - [38, 9927.0] + - - [2368, 5888, 1, 3328, 2368, 2368, 3328, 3328] + - [18, 10251.0] + - - [2368, 1856, 1, 1280, 2368, 2368, 1280, 1280] + - [58, 10529.0] + - - [448, 5056, 1, 3328, 448, 448, 3328, 3328] + - [16, 9047.0] + - - [3584, 4288, 1, 128, 3584, 3584, 128, 128] + - [38, 10399.0] + - - [5888, 4288, 1, 3328, 5888, 5888, 3328, 3328] + - [18, 10913.0] + - - [2368, 704, 1, 256, 2368, 2368, 256, 256] + - [13, 6935.0] + - - [3584, 1408, 1, 128, 3584, 3584, 128, 128] + - [5, 9772.0] + - - [1856, 5056, 1, 1280, 1856, 1856, 1280, 1280] + - [38, 11379.0] + - - [2944, 1024, 1, 1280, 2944, 2944, 1280, 1280] + - [51, 10815.0] + - - [3584, 5888, 1, 3328, 3584, 3584, 3328, 3328] + - [58, 11035.0] + - - [2368, 4288, 1, 256, 2368, 2368, 256, 256] + - [49, 10756.0] + - - [1024, 2368, 1, 3328, 1024, 1024, 3328, 3328] + - [60, 10908.0] + - - [1024, 3584, 1, 1280, 1024, 1024, 1280, 1280] + - [18, 10575.0] + - - [4288, 5888, 1, 3328, 4288, 4288, 3328, 3328] + - [58, 11058.0] + - - [1024, 2944, 1, 3328, 1024, 1024, 3328, 3328] + - [51, 10860.0] + - - [6784, 1856, 1, 256, 6784, 6784, 256, 256] + - [49, 10560.0] + - - [256, 6784, 1, 1280, 256, 256, 1280, 1280] + - [5, 10021.0] + - - [1856, 3584, 1, 256, 1856, 1856, 256, 256] + - [49, 10153.0] + - - [6784, 1856, 1, 128, 6784, 6784, 128, 128] + - [49, 10588.0] + - - [512, 24000, 1, 2816, 512, 512, 2816, 2816] + - [16, 8690.0] + - - [256, 5888, 1, 1280, 256, 256, 1280, 1280] + - [38, 9365.0] + - - [16384, 1600, 1, 4096, 16384, 16384, 4096, 4096] + - [40, 10327.0] + - - [2368, 1408, 1, 128, 2368, 2368, 128, 128] + - [33, 7348.0] + - - [1408, 1024, 1, 128, 1408, 1408, 128, 128] + - [46, 7658.0] + - - [6784, 3584, 1, 3328, 6784, 6784, 3328, 3328] + - [38, 10742.0] + - - [1760, 7000, 1, 1760, 1760, 1760, 1760, 1760] + - [25, 11472.0] + - - [2368, 5056, 1, 1280, 2368, 2368, 1280, 1280] + - [29, 10522.0] + - - [1408, 2368, 1, 1280, 1408, 1408, 1280, 1280] + - [16, 10725.0] + - - [704, 4288, 1, 128, 704, 704, 128, 128] + - [24, 8173.0] + - - [2944, 2944, 1, 256, 2944, 2944, 256, 256] + - [27, 11405.0] + - - [6784, 256, 1, 256, 6784, 6784, 256, 256] + - [58, 8731.0] + - - [256, 5056, 1, 3328, 256, 256, 3328, 3328] + - [5, 10110.0] + - - [5056, 1856, 1, 128, 5056, 5056, 128, 128] + - [23, 9745.0] + - - [5056, 1024, 1, 3328, 5056, 5056, 3328, 3328] + - [42, 9670.0] + - - [4288, 3584, 1, 3328, 4288, 4288, 3328, 3328] + - [16, 10493.0] + - - [1024, 2368, 1, 1280, 1024, 1024, 1280, 1280] + - [38, 9800.0] + - - [5888, 6784, 1, 3328, 5888, 5888, 3328, 3328] + - [38, 11778.0] + - - [704, 4288, 1, 1280, 704, 704, 1280, 1280] + - [38, 9337.0] + - - [1024, 48000, 1, 2048, 1024, 1024, 2048, 2048] + - [38, 10643.0] + - - [4288, 1024, 1, 128, 4288, 4288, 128, 128] + - [35, 9494.0] + - - [4096, 512, 1, 32, 4096, 4096, 32, 32] + - [46, 3875.0] + - - [2048, 1024, 1, 1664, 2048, 2048, 1664, 1664] + - [5, 10524.0] + - - [4096, 512, 1, 1408, 4096, 4096, 1408, 1408] + - [49, 10466.0] + - - [4096, 1024, 1, 1280, 4096, 4096, 1280, 1280] + - [38, 11526.0] + - - [2048, 1024, 1, 640, 2048, 2048, 640, 640] + - [38, 10093.0] + - - [4096, 1024, 1, 13312, 4096, 4096, 13312, 13312] + - [42, 8808.0] + - - [2048, 1024, 1, 13312, 2048, 2048, 13312, 13312] + - [41, 7076.0] + - - [2048, 1024, 1, 3584, 2048, 2048, 3584, 3584] + - [16, 10412.0] + - - [4096, 1024, 1, 1920, 4096, 4096, 1920, 1920] + - [38, 11332.0] + - - [4096, 1024, 1, 12288, 4096, 4096, 12288, 12288] + - [17, 7157.0] + - - [4096, 1024, 1, 8320, 4096, 4096, 8320, 8320] + - [58, 9514.0] + - - [4096, 1024, 1, 15360, 4096, 4096, 15360, 15360] + - [18, 7399.0] + - - [4096, 512, 1, 3072, 4096, 4096, 3072, 3072] + - [16, 10344.0] + - - [4096, 512, 1, 13312, 4096, 4096, 13312, 13312] + - [17, 5527.0] + - - [4096, 1024, 1, 3840, 4096, 4096, 3840, 3840] + - [38, 8012.0] + - - [2048, 1024, 1, 3200, 2048, 2048, 3200, 3200] + - [27, 10649.0] + - - [4096, 512, 1, 3840, 4096, 4096, 3840, 3840] + - [27, 10474.0] + - - [4096, 512, 1, 5632, 4096, 4096, 5632, 5632] + - [16, 6436.0] + - - [4096, 512, 1, 64, 4096, 4096, 64, 64] + - [55, 5483.0] + - - [2048, 1024, 1, 512, 2048, 2048, 512, 512] + - [38, 9538.0] + - - [4096, 512, 1, 8192, 4096, 4096, 8192, 8192] + - [17, 4620.0] + - - [4096, 512, 1, 2304, 4096, 4096, 2304, 2304] + - [58, 10191.0] + - - [4096, 512, 1, 2816, 4096, 4096, 2816, 2816] + - [16, 10329.0] + - - [2048, 1024, 1, 7680, 2048, 2048, 7680, 7680] + - [58, 8043.0] + - - [4096, 512, 1, 1920, 4096, 4096, 1920, 1920] + - [34, 10206.0] + - - [4096, 1024, 1, 32, 4096, 4096, 32, 32] + - [11, 5815.0] + - - [4096, 512, 1, 16640, 4096, 4096, 16640, 16640] + - [58, 8039.0] + - - [2048, 1024, 1, 1024, 2048, 2048, 1024, 1024] + - [16, 9511.0] + - - [4096, 512, 1, 1792, 4096, 4096, 1792, 1792] + - [58, 10410.0] + - - [4096, 1024, 1, 8192, 4096, 4096, 8192, 8192] + - [16, 5794.0] + - - [2048, 1024, 1, 4160, 2048, 2048, 4160, 4160] + - [23, 10822.0] + - - [4096, 512, 1, 10240, 4096, 4096, 10240, 10240] + - [61, 5869.0] + - - [4096, 512, 1, 512, 4096, 4096, 512, 512] + - [13, 8772.0] + - - [2048, 1024, 1, 6656, 2048, 2048, 6656, 6656] + - [31, 8551.0] + - - [2048, 1024, 1, 14336, 2048, 2048, 14336, 14336] + - [20, 5778.0] + - - [4096, 512, 1, 11264, 4096, 4096, 11264, 11264] + - [7, 6557.0] + - - [4096, 512, 1, 128, 4096, 4096, 128, 128] + - [33, 6862.0] + - - [4096, 512, 1, 768, 4096, 4096, 768, 768] + - [38, 10141.0] + - - [4096, 1024, 1, 11264, 4096, 4096, 11264, 11264] + - [17, 6884.0] + - - [4096, 1024, 1, 16640, 4096, 4096, 16640, 16640] + - [49, 11269.0] + - - [2048, 1024, 1, 5632, 2048, 2048, 5632, 5632] + - [16, 10341.0] + - - [4096, 512, 1, 12288, 4096, 4096, 12288, 12288] + - [17, 5137.0] + - - [4096, 1024, 1, 5632, 4096, 4096, 5632, 5632] + - [16, 7699.0] + - - [2048, 1024, 1, 10240, 2048, 2048, 10240, 10240] + - [58, 9344.0] + - - [4096, 1024, 1, 640, 4096, 4096, 640, 640] + - [34, 10662.0] + - - [2048, 1024, 1, 12288, 2048, 2048, 12288, 12288] + - [16, 5737.0] + - - [4096, 1024, 1, 10240, 4096, 4096, 10240, 10240] + - [17, 6746.0] + - - [2048, 1024, 1, 4608, 2048, 2048, 4608, 4608] + - [16, 10465.0] + - - [4096, 512, 1, 3584, 4096, 4096, 3584, 3584] + - [16, 10437.0] + - - [4096, 1024, 1, 4608, 4096, 4096, 4608, 4608] + - [16, 7756.0] + - - [4096, 1024, 1, 3328, 4096, 4096, 3328, 3328] + - [38, 11029.0] + - - [2048, 1024, 1, 9216, 2048, 2048, 9216, 9216] + - [15, 5912.0] + - - [2048, 1024, 1, 2304, 2048, 2048, 2304, 2304] + - [38, 10172.0] + - - [4096, 512, 1, 6144, 4096, 4096, 6144, 6144] + - [16, 5388.0] + - - [4096, 512, 1, 15360, 4096, 4096, 15360, 15360] + - [17, 5872.0] + - - [4096, 1024, 1, 7168, 4096, 4096, 7168, 7168] + - [17, 6773.0] + - - [4096, 1024, 1, 9216, 4096, 4096, 9216, 9216] + - [42, 8168.0] + - - [4096, 1024, 1, 7680, 4096, 4096, 7680, 7680] + - [18, 6708.0] + - - [2048, 1024, 1, 8192, 2048, 2048, 8192, 8192] + - [41, 5269.0] + - - [4096, 1024, 1, 64, 4096, 4096, 64, 64] + - [33, 6704.0] + - - [2048, 1024, 1, 1280, 2048, 2048, 1280, 1280] + - [16, 10280.0] + - - [2048, 1024, 1, 3328, 2048, 2048, 3328, 3328] + - [16, 10616.0] + - - [4096, 512, 1, 14336, 4096, 4096, 14336, 14336] + - [17, 5660.0] + - - [4096, 512, 1, 8320, 4096, 4096, 8320, 8320] + - [27, 9775.0] + - - [4096, 1024, 1, 6656, 4096, 4096, 6656, 6656] + - [50, 7075.0] + - - [2048, 1024, 1, 256, 2048, 2048, 256, 256] + - [35, 8117.0] + - - [4096, 512, 1, 1024, 4096, 4096, 1024, 1024] + - [58, 9933.0] + - - [4096, 1024, 1, 1536, 4096, 4096, 1536, 1536] + - [16, 11493.0] + - - [2048, 1024, 1, 32, 2048, 2048, 32, 32] + - [59, 3521.0] + - - [4096, 512, 1, 640, 4096, 4096, 640, 640] + - [54, 10049.0] + - - [4096, 512, 1, 16384, 4096, 4096, 16384, 16384] + - [18, 5675.0] + - - [4096, 1024, 1, 512, 4096, 4096, 512, 512] + - [16, 9914.0] + - - [2048, 1024, 1, 1152, 2048, 2048, 1152, 1152] + - [27, 10293.0] + - - [4096, 1024, 1, 2080, 4096, 4096, 2080, 2080] + - [23, 11927.0] + - - [4096, 1024, 1, 768, 4096, 4096, 768, 768] + - [58, 10953.0] + - - [4096, 1024, 1, 2560, 4096, 4096, 2560, 2560] + - [16, 11529.0] + - - [2048, 1024, 1, 64, 2048, 2048, 64, 64] + - [33, 6848.0] + - - [4096, 1024, 1, 16384, 4096, 4096, 16384, 16384] + - [18, 7192.0] + - - [4096, 512, 1, 6656, 4096, 4096, 6656, 6656] + - [16, 9562.0] + - - [2048, 1024, 1, 128, 2048, 2048, 128, 128] + - [53, 6799.0] + - - [2048, 1024, 1, 2080, 2048, 2048, 2080, 2080] + - [12, 10877.0] + - - [2048, 1024, 1, 16640, 2048, 2048, 16640, 16640] + - [28, 8876.0] + - - [2048, 1024, 1, 3072, 2048, 2048, 3072, 3072] + - [16, 10376.0] + - - [4096, 1024, 1, 1408, 4096, 4096, 1408, 1408] + - [5, 11450.0] + - - [4096, 1024, 1, 2048, 4096, 4096, 2048, 2048] + - [16, 11303.0] + - - [2048, 1024, 1, 2560, 2048, 2048, 2560, 2560] + - [16, 10176.0] + - - [4096, 1024, 1, 128, 4096, 4096, 128, 128] + - [17, 9180.0] + - - [4096, 1024, 1, 14336, 4096, 4096, 14336, 14336] + - [18, 7358.0] + - - [4096, 512, 1, 9216, 4096, 4096, 9216, 9216] + - [17, 5072.0] + - - [2048, 1024, 1, 2048, 2048, 2048, 2048, 2048] + - [16, 10010.0] + - - [4096, 512, 1, 1536, 4096, 4096, 1536, 1536] + - [58, 9867.0] + - - [2048, 1024, 1, 16384, 2048, 2048, 16384, 16384] + - [20, 5872.0] + - - [4096, 1024, 1, 1024, 4096, 4096, 1024, 1024] + - [16, 10765.0] + - - [4096, 1024, 1, 1664, 4096, 4096, 1664, 1664] + - [38, 11505.0] + - - [4096, 512, 1, 384, 4096, 4096, 384, 384] + - [38, 9677.0] + - - [4096, 512, 1, 3328, 4096, 4096, 3328, 3328] + - [16, 10409.0] + - - [4096, 1024, 1, 256, 4096, 4096, 256, 256] + - [27, 10201.0] + - - [2048, 1024, 1, 7168, 2048, 2048, 7168, 7168] + - [38, 7012.0] + - - [2048, 1024, 1, 1536, 2048, 2048, 1536, 1536] + - [16, 9874.0] + - - [4096, 512, 1, 7168, 4096, 4096, 7168, 7168] + - [39, 8332.0] + - - [4096, 1024, 1, 896, 4096, 4096, 896, 896] + - [34, 11487.0] + - - [4096, 1024, 1, 4096, 4096, 4096, 4096, 4096] + - [39, 8085.0] + - - [2048, 1024, 1, 6144, 2048, 2048, 6144, 6144] + - [42, 8648.0] + - - [4096, 512, 1, 4160, 4096, 4096, 4160, 4160] + - [38, 10116.0] + - - [4096, 512, 1, 2080, 4096, 4096, 2080, 2080] + - [23, 10855.0] + - - [4096, 1024, 1, 5120, 4096, 4096, 5120, 5120] + - [15, 7271.0] + - - [2048, 1024, 1, 1920, 2048, 2048, 1920, 1920] + - [27, 10098.0] + - - [2048, 1024, 1, 15360, 2048, 2048, 15360, 15360] + - [38, 8845.0] + - - [4096, 1024, 1, 2816, 4096, 4096, 2816, 2816] + - [16, 11404.0] + - - [4096, 512, 1, 256, 4096, 4096, 256, 256] + - [16, 9097.0] + - - [2048, 1024, 1, 5120, 2048, 2048, 5120, 5120] + - [16, 10538.0] + - - [2048, 1024, 1, 4096, 2048, 2048, 4096, 4096] + - [16, 9399.0] + - - [4096, 512, 1, 4608, 4096, 4096, 4608, 4608] + - [42, 7871.0] + - - [4096, 512, 1, 1664, 4096, 4096, 1664, 1664] + - [54, 10373.0] + - - [2048, 1024, 1, 896, 2048, 2048, 896, 896] + - [38, 10185.0] + - - [4096, 1024, 1, 4160, 4096, 4096, 4160, 4160] + - [38, 11642.0] + - - [2048, 1024, 1, 11264, 2048, 2048, 11264, 11264] + - [8, 6333.0] + - - [2048, 1024, 1, 384, 2048, 2048, 384, 384] + - [53, 8631.0] + - - [2048, 1024, 1, 3840, 2048, 2048, 3840, 3840] + - [16, 10607.0] + - - [4096, 512, 1, 1280, 4096, 4096, 1280, 1280] + - [58, 10341.0] + - - [4096, 1024, 1, 1152, 4096, 4096, 1152, 1152] + - [27, 11497.0] + - - [2048, 1024, 1, 1408, 2048, 2048, 1408, 1408] + - [38, 10356.0] + - - [4096, 512, 1, 896, 4096, 4096, 896, 896] + - [54, 10197.0] + - - [4096, 1024, 1, 3072, 4096, 4096, 3072, 3072] + - [16, 11070.0] + - - [2048, 1024, 1, 2816, 2048, 2048, 2816, 2816] + - [16, 10262.0] + - - [4096, 1024, 1, 1792, 4096, 4096, 1792, 1792] + - [58, 11396.0] + - - [4096, 512, 1, 1152, 4096, 4096, 1152, 1152] + - [34, 10091.0] + - - [4096, 512, 1, 7680, 4096, 4096, 7680, 7680] + - [52, 6396.0] + - - [4096, 1024, 1, 384, 4096, 4096, 384, 384] + - [34, 10004.0] + - - [2048, 1024, 1, 1792, 2048, 2048, 1792, 1792] + - [38, 10409.0] + - - [4096, 1024, 1, 3584, 4096, 4096, 3584, 3584] + - [16, 9500.0] + - - [2048, 1024, 1, 768, 2048, 2048, 768, 768] + - [58, 9861.0] + - - [2048, 1024, 1, 8320, 2048, 2048, 8320, 8320] + - [16, 10681.0] + - - [4096, 512, 1, 2048, 4096, 4096, 2048, 2048] + - [16, 9947.0] + - - [4096, 512, 1, 2560, 4096, 4096, 2560, 2560] + - [58, 10300.0] + - - [4096, 1024, 1, 2304, 4096, 4096, 2304, 2304] + - [16, 11553.0] + - - [4096, 512, 1, 5120, 4096, 4096, 5120, 5120] + - [17, 6024.0] + - - [4096, 1024, 1, 6144, 4096, 4096, 6144, 6144] + - [16, 6752.0] + - - [1024, 3392, 1, 4096, 1024, 1024, 4096, 4096] + - [59, 9315.0] + - - [1024, 3301, 1, 4096, 1024, 1024, 4096, 4096] + - [62, 8439.0] + - - [1024, 3443, 1, 4096, 1024, 1024, 4096, 4096] + - [62, 8651.0] + - - [132, 134, 480, 64, 132, 132, 64, 64] + - [44, 3813.0] + - - [162, 162, 400, 64, 162, 162, 64, 64] + - [0, 5403.0] + - - [4096, 3548, 1, 1024, 4096, 4096, 1024, 1024] + - [16, 10935.0] + - - [4096, 2977, 1, 1024, 4096, 4096, 1024, 1024] + - [16, 10941.0] + - - [132, 135, 480, 64, 132, 132, 64, 64] + - [44, 3654.0] + - - [1024, 2985, 1, 4096, 1024, 1024, 4096, 4096] + - [60, 9222.0] + - - [33708, 3681, 1, 1024, 33708, 33708, 1024, 1024] + - [58, 10533.0] + - - [4096, 3443, 1, 1024, 4096, 4096, 1024, 1024] + - [18, 10447.0] + - - [1024, 3400, 1, 4096, 1024, 1024, 4096, 4096] + - [42, 8566.0] + - - [4096, 3995, 1, 1024, 4096, 4096, 1024, 1024] + - [51, 9497.0] + - - [4096, 3190, 1, 1024, 4096, 4096, 1024, 1024] + - [16, 11199.0] + - - [4096, 3594, 1, 1024, 4096, 4096, 1024, 1024] + - [16, 10259.0] + - - [159, 162, 400, 64, 159, 159, 64, 64] + - [53, 5062.0] + - - [1024, 3565, 1, 4096, 1024, 1024, 4096, 4096] + - [42, 8155.0] + - - [4096, 3422, 1, 1024, 4096, 4096, 1024, 1024] + - [16, 10886.0] + - - [1024, 3214, 1, 4096, 1024, 1024, 4096, 4096] + - [38, 8320.0] + - - [33708, 3584, 1, 1024, 33708, 33708, 1024, 1024] + - [60, 10825.0] + - - [33708, 3640, 1, 1024, 33708, 33708, 1024, 1024] + - [58, 10544.0] + - - [4096, 3263, 1, 1024, 4096, 4096, 1024, 1024] + - [16, 10960.0] + - - [4096, 3296, 1, 1024, 4096, 4096, 1024, 1024] + - [18, 11100.0] + - - [1024, 3557, 1, 4096, 1024, 1024, 4096, 4096] + - [42, 8307.0] + - - [4096, 3463, 1, 1024, 4096, 4096, 1024, 1024] + - [16, 10563.0] + - - [4096, 3528, 1, 1024, 4096, 4096, 1024, 1024] + - [16, 10707.0] + - - [4096, 3226, 1, 1024, 4096, 4096, 1024, 1024] + - [16, 10949.0] + - - [4096, 3439, 1, 1024, 4096, 4096, 1024, 1024] + - [16, 10878.0] + - - [1024, 3523, 1, 4096, 1024, 1024, 4096, 4096] + - [62, 8460.0] + - - [1024, 3098, 1, 4096, 1024, 1024, 4096, 4096] + - [42, 8741.0] + - - [4096, 3121, 1, 1024, 4096, 4096, 1024, 1024] + - [16, 10978.0] + - - [33708, 3894, 1, 1024, 33708, 33708, 1024, 1024] + - [58, 10660.0] + - - [1024, 3548, 1, 4096, 1024, 1024, 4096, 4096] + - [62, 8450.0] + - - [1024, 3451, 1, 4096, 1024, 1024, 4096, 4096] + - [59, 8527.0] + - - [4096, 3353, 1, 1024, 4096, 4096, 1024, 1024] + - [16, 10715.0] + - - [4096, 3402, 1, 1024, 4096, 4096, 1024, 1024] + - [16, 10748.0] + - - [4096, 3939, 1, 1024, 4096, 4096, 1024, 1024] + - [21, 8927.0] + - - [133, 133, 480, 64, 133, 133, 64, 64] + - [53, 3351.0] + - - [1024, 3559, 1, 4096, 1024, 1024, 4096, 4096] + - [58, 8238.0] + - - [1024, 2977, 1, 4096, 1024, 1024, 4096, 4096] + - [40, 9787.0] + - - [1024, 3478, 1, 4096, 1024, 1024, 4096, 4096] + - [42, 8210.0] + - - [134, 134, 480, 64, 134, 134, 64, 64] + - [44, 3503.0] + - - [1024, 3368, 1, 4096, 1024, 1024, 4096, 4096] + - [62, 8432.0] + - - [4096, 4012, 1, 1024, 4096, 4096, 1024, 1024] + - [16, 8620.0] + - - [4096, 3486, 1, 1024, 4096, 4096, 1024, 1024] + - [18, 10667.0] + - - [1024, 3479, 1, 4096, 1024, 1024, 4096, 4096] + - [62, 8409.0] + - - [1024, 3505, 1, 4096, 1024, 1024, 4096, 4096] + - [62, 8336.0] + - - [4096, 3381, 1, 1024, 4096, 4096, 1024, 1024] + - [16, 10802.0] + - - [4096, 3430, 1, 1024, 4096, 4096, 1024, 1024] + - [16, 10938.0] + - - [1024, 3554, 1, 4096, 1024, 1024, 4096, 4096] + - [62, 8429.0] + - - [4096, 3271, 1, 1024, 4096, 4096, 1024, 1024] + - [16, 11020.0] + - - [1024, 3063, 1, 4096, 1024, 1024, 4096, 4096] + - [62, 8994.0] + - - [1024, 3209, 1, 4096, 1024, 1024, 4096, 4096] + - [31, 7923.0] + - - [4096, 3503, 1, 1024, 4096, 4096, 1024, 1024] + - [18, 10827.0] + - - [4096, 3344, 1, 1024, 4096, 4096, 1024, 1024] + - [16, 10693.0] + - - [1024, 3147, 1, 4096, 1024, 1024, 4096, 4096] + - [42, 8620.0] + - - [1024, 3322, 1, 4096, 1024, 1024, 4096, 4096] + - [62, 8316.0] + - - [1024, 3341, 1, 4096, 1024, 1024, 4096, 4096] + - [60, 9159.0] + - - [1024, 3516, 1, 4096, 1024, 1024, 4096, 4096] + - [62, 8323.0] + - - [1024, 3454, 1, 4096, 1024, 1024, 4096, 4096] + - [62, 8765.0] + - - [4096, 3969, 1, 1024, 4096, 4096, 1024, 1024] + - [16, 8958.0] + - - [4096, 3466, 1, 1024, 4096, 4096, 1024, 1024] + - [16, 10439.0] + - - [1024, 3999, 1, 1024, 1024, 1024, 1024, 1024] + - [38, 10418.0] + - - [1024, 4032, 1, 1024, 1024, 1024, 1024, 1024] + - [38, 10418.0] + - - [1024, 3403, 1, 4096, 1024, 1024, 4096, 4096] + - [62, 8566.0] + - - [4096, 3361, 1, 1024, 4096, 4096, 1024, 1024] + - [16, 10207.0] + - - [1024, 3527, 1, 4096, 1024, 1024, 4096, 4096] + - [62, 8361.0] + - - [1024, 3822, 1, 4096, 1024, 1024, 4096, 4096] + - [62, 8177.0] + - - [4096, 3315, 1, 1024, 4096, 4096, 1024, 1024] + - [18, 11086.0] + - - [232, 232, 272, 64, 232, 232, 64, 64] + - [48, 6102.0] + - - [1024, 3336, 1, 4096, 1024, 1024, 4096, 4096] + - [62, 8488.0] + - - [228, 232, 272, 64, 228, 228, 64, 64] + - [57, 5893.0] + - - [4096, 3547, 1, 1024, 4096, 4096, 1024, 1024] + - [16, 10621.0] + - - [4096, 3340, 1, 1024, 4096, 4096, 1024, 1024] + - [18, 10396.0] + - - [1024, 3906, 1, 1024, 1024, 1024, 1024, 1024] + - [38, 10182.0] + - - [1024, 3295, 1, 4096, 1024, 1024, 4096, 4096] + - [62, 8371.0] + - - [4096, 3294, 1, 1024, 4096, 4096, 1024, 1024] + - [16, 11131.0] + - - [33708, 3968, 1, 1024, 33708, 33708, 1024, 1024] + - [60, 10797.0] + - - [1024, 3473, 1, 4096, 1024, 1024, 4096, 4096] + - [62, 8170.0] + - - [1024, 3072, 1, 4096, 1024, 1024, 4096, 4096] + - [38, 9108.0] + - - [4096, 3189, 1, 1024, 4096, 4096, 1024, 1024] + - [18, 11313.0] + - - [4096, 3494, 1, 1024, 4096, 4096, 1024, 1024] + - [16, 10717.0] + - - [1024, 3522, 1, 4096, 1024, 1024, 4096, 4096] + - [62, 8262.0] + - - [33708, 3944, 1, 1024, 33708, 33708, 1024, 1024] + - [60, 10684.0] + - - [135, 135, 480, 64, 135, 135, 64, 64] + - [22, 3915.0] + - - [4096, 3421, 1, 1024, 4096, 4096, 1024, 1024] + - [16, 10937.0] + - - [4096, 3311, 1, 1024, 4096, 4096, 1024, 1024] + - [18, 11024.0] + - - [1024, 3990, 1, 1024, 1024, 1024, 1024, 1024] + - [38, 10354.0] + - - [1024, 3290, 1, 4096, 1024, 1024, 4096, 4096] + - [62, 8553.0] + - - [4096, 3565, 1, 1024, 4096, 4096, 1024, 1024] + - [18, 10646.0] + - - [1024, 3484, 1, 4096, 1024, 1024, 4096, 4096] + - [62, 8297.0] + - - [4096, 3384, 1, 1024, 4096, 4096, 1024, 1024] + - [16, 10791.0] + - - [1024, 3422, 1, 4096, 1024, 1024, 4096, 4096] + - [62, 8534.0] + - - [4096, 3681, 1, 1024, 4096, 4096, 1024, 1024] + - [16, 10156.0] + - - [1024, 3584, 1, 1024, 1024, 1024, 1024, 1024] + - [18, 10360.0] + - - [4096, 4050, 1, 1024, 4096, 4096, 1024, 1024] + - [58, 10167.0] + - - [1024, 3996, 1, 4096, 1024, 1024, 4096, 4096] + - [42, 7713.0] + - - [4096, 3169, 1, 1024, 4096, 4096, 1024, 1024] + - [16, 11167.0] + - - [4096, 3538, 1, 1024, 4096, 4096, 1024, 1024] + - [16, 10458.0] + - - [1024, 3495, 1, 4096, 1024, 1024, 4096, 4096] + - [58, 8292.0] + - - [4096, 3401, 1, 1024, 4096, 4096, 1024, 1024] + - [16, 10849.0] + - - [1024, 3560, 1, 4096, 1024, 1024, 4096, 4096] + - [62, 8334.0] + - - [133, 135, 480, 64, 133, 133, 64, 64] + - [44, 3414.0] + - - [1024, 3263, 1, 4096, 1024, 1024, 4096, 4096] + - [62, 8363.0] + - - [1024, 3870, 1, 4096, 1024, 1024, 4096, 4096] + - [63, 8031.0] + - - [4096, 3555, 1, 1024, 4096, 4096, 1024, 1024] + - [16, 10649.0] + - - [4096, 3412, 1, 1024, 4096, 4096, 1024, 1024] + - [16, 10879.0] + - - [1024, 3296, 1, 4096, 1024, 1024, 4096, 4096] + - [62, 8256.0] + - - [1024, 3379, 1, 4096, 1024, 1024, 4096, 4096] + - [20, 9404.0] + - - [4096, 3302, 1, 1024, 4096, 4096, 1024, 1024] + - [18, 11083.0] + - - [1024, 3490, 1, 4096, 1024, 1024, 4096, 4096] + - [58, 8183.0] + - - [1024, 3428, 1, 4096, 1024, 1024, 4096, 4096] + - [62, 8622.0] + - - [1024, 3976, 1, 4096, 1024, 1024, 4096, 4096] + - [42, 7611.0] + - - [4096, 3485, 1, 1024, 4096, 4096, 1024, 1024] + - [18, 10720.0] + - - [4096, 3534, 1, 1024, 4096, 4096, 1024, 1024] + - [16, 10614.0] + - - [1024, 3064, 1, 4096, 1024, 1024, 4096, 4096] + - [62, 8918.0] + - - [4096, 3216, 1, 1024, 4096, 4096, 1024, 1024] + - [18, 10854.0] + - - [1024, 3450, 1, 4096, 1024, 1024, 4096, 4096] + - [62, 8805.0] + - - [1024, 3533, 1, 4096, 1024, 1024, 4096, 4096] + - [17, 8781.0] + - - [1024, 4030, 1, 1024, 1024, 1024, 1024, 1024] + - [38, 10471.0] + - - [1024, 3311, 1, 4096, 1024, 1024, 4096, 4096] + - [62, 8475.0] + - - [1024, 3468, 1, 4096, 1024, 1024, 4096, 4096] + - [62, 7990.0] + - - [4096, 3359, 1, 1024, 4096, 4096, 1024, 1024] + - [18, 10414.0] + - - [4096, 3392, 1, 1024, 4096, 4096, 1024, 1024] + - [16, 10667.0] + - - [1024, 3925, 1, 1024, 1024, 1024, 1024, 1024] + - [38, 10268.0] + - - [4096, 3233, 1, 1024, 4096, 4096, 1024, 1024] + - [16, 10866.0] + - - [4096, 3956, 1, 1024, 4096, 4096, 1024, 1024] + - [16, 8954.0] + - - [1024, 3463, 1, 4096, 1024, 1024, 4096, 4096] + - [62, 8181.0] + - - [1024, 3126, 1, 4096, 1024, 1024, 4096, 4096] + - [43, 8867.0] + - - [1024, 3363, 1, 4096, 1024, 1024, 4096, 4096] + - [62, 8322.0] + - - [4096, 3465, 1, 1024, 4096, 4096, 1024, 1024] + - [16, 10675.0] + - - [33708, 3996, 1, 1024, 33708, 33708, 1024, 1024] + - [60, 10785.0] + - - [1024, 3231, 1, 4096, 1024, 1024, 4096, 4096] + - [42, 8135.0] + - - [33708, 3978, 1, 1024, 33708, 33708, 1024, 1024] + - [60, 10670.0] + - - [4096, 3476, 1, 1024, 4096, 4096, 1024, 1024] + - [17, 10471.0] + - - [4096, 3339, 1, 1024, 4096, 4096, 1024, 1024] + - [16, 10723.0] + - - [4096, 3452, 1, 1024, 4096, 4096, 1024, 1024] + - [16, 10867.0] + - - [1024, 3396, 1, 4096, 1024, 1024, 4096, 4096] + - [42, 8431.0] + - - [4096, 3293, 1, 1024, 4096, 4096, 1024, 1024] + - [16, 11110.0] + - - [1024, 3432, 1, 4096, 1024, 1024, 4096, 4096] + - [62, 8382.0] + - - [4096, 3493, 1, 1024, 4096, 4096, 1024, 1024] + - [16, 10774.0] + - - [4096, 3350, 1, 1024, 4096, 4096, 1024, 1024] + - [16, 10661.0] + - - [1024, 3079, 1, 4096, 1024, 1024, 4096, 4096] + - [62, 8715.0] + - - [1024, 3101, 1, 4096, 1024, 1024, 4096, 4096] + - [62, 8607.0] + - - [33708, 3939, 1, 1024, 33708, 33708, 1024, 1024] + - [60, 10700.0] + - - [4096, 3256, 1, 1024, 4096, 4096, 1024, 1024] + - [16, 10955.0] + - - [1024, 3439, 1, 4096, 1024, 1024, 4096, 4096] + - [62, 8526.0] + - - [1024, 3510, 1, 4096, 1024, 1024, 4096, 4096] + - [62, 8364.0] + - - [4096, 3900, 1, 1024, 4096, 4096, 1024, 1024] + - [12, 9887.0] + - - [1024, 3470, 1, 4096, 1024, 1024, 4096, 4096] + - [63, 8194.0] + - - [4096, 3456, 1, 1024, 4096, 4096, 1024, 1024] + - [16, 11010.0] + - - [4096, 3014, 1, 1024, 4096, 4096, 1024, 1024] + - [16, 11419.0] + - - [4096, 3367, 1, 1024, 4096, 4096, 1024, 1024] + - [16, 10753.0] + - - [4096, 3432, 1, 1024, 4096, 4096, 1024, 1024] + - [29, 10063.0] + - - [33708, 4026, 1, 1024, 33708, 33708, 1024, 1024] + - [60, 10893.0] + - - [4096, 3273, 1, 1024, 4096, 4096, 1024, 1024] + - [18, 11045.0] + - - [4096, 3130, 1, 1024, 4096, 4096, 1024, 1024] + - [16, 10909.0] + - - [1024, 3496, 1, 4096, 1024, 1024, 4096, 4096] + - [62, 8296.0] + - - [1024, 3995, 1, 4096, 1024, 1024, 4096, 4096] + - [60, 8729.0] + - - [1024, 3939, 1, 4096, 1024, 1024, 4096, 4096] + - [62, 8025.0] + - - [1024, 3121, 1, 4096, 1024, 1024, 4096, 4096] + - [42, 8655.0] + - - [1024, 3232, 1, 4096, 1024, 1024, 4096, 4096] + - [18, 9153.0] + - - [4096, 3147, 1, 1024, 4096, 4096, 1024, 1024] + - [16, 11019.0] + - - [4096, 3516, 1, 1024, 4096, 4096, 1024, 1024] + - [16, 10766.0] + - - [1024, 3969, 1, 1024, 1024, 1024, 1024, 1024] + - [58, 10458.0] + - - [1024, 3364, 1, 4096, 1024, 1024, 4096, 4096] + - [62, 8586.0] + - - [4096, 3411, 1, 1024, 4096, 4096, 1024, 1024] + - [16, 10820.0] + - - [147, 147, 432, 64, 147, 147, 64, 64] + - [53, 4577.0] + - - [4096, 3301, 1, 1024, 4096, 4096, 1024, 1024] + - [16, 11005.0] + - - [1024, 3513, 1, 4096, 1024, 1024, 4096, 4096] + - [62, 8417.0] + - - [1024, 3469, 1, 4096, 1024, 1024, 4096, 4096] + - [62, 8379.0] + - - [1024, 3095, 1, 4096, 1024, 1024, 4096, 4096] + - [62, 8684.0] + - - [4096, 3533, 1, 1024, 4096, 4096, 1024, 1024] + - [18, 10578.0] + - - [4096, 3390, 1, 1024, 4096, 4096, 1024, 1024] + - [16, 10882.0] + - - [4096, 3582, 1, 1024, 4096, 4096, 1024, 1024] + - [18, 10451.0] + - - [1024, 3956, 1, 1024, 1024, 1024, 1024, 1024] + - [58, 10392.0] + - - [4096, 3585, 1, 1024, 4096, 4096, 1024, 1024] + - [17, 10300.0] + - - [4096, 3231, 1, 1024, 4096, 4096, 1024, 1024] + - [16, 10961.0] + - - [1024, 3205, 1, 4096, 1024, 1024, 4096, 4096] + - [62, 8331.0] + - - [4096, 3496, 1, 1024, 4096, 4096, 1024, 1024] + - [29, 10781.0] + - - [1024, 3143, 1, 4096, 1024, 1024, 4096, 4096] + - [62, 8630.0] + - - [1024, 3318, 1, 4096, 1024, 1024, 4096, 4096] + - [62, 8543.0] + - - [1024, 3353, 1, 4096, 1024, 1024, 4096, 4096] + - [62, 8500.0] + - - [1024, 3464, 1, 4096, 1024, 1024, 4096, 4096] + - [62, 8434.0] + - - [4096, 2736, 1, 1024, 4096, 4096, 1024, 1024] + - [18, 11049.0] + - - [1024, 3402, 1, 4096, 1024, 1024, 4096, 4096] + - [62, 8514.0] + - - [4096, 3138, 1, 1024, 4096, 4096, 1024, 1024] + - [16, 11104.0] + - - [1024, 3860, 1, 4096, 1024, 1024, 4096, 4096] + - [49, 8166.0] + - - [148, 148, 432, 64, 148, 148, 64, 64] + - [22, 3958.0] + - - [1024, 3539, 1, 4096, 1024, 1024, 4096, 4096] + - [62, 8431.0] + - - [4096, 3211, 1, 1024, 4096, 4096, 1024, 1024] + - [18, 10885.0] + - - [1024, 3332, 1, 4096, 1024, 1024, 4096, 4096] + - [62, 8338.0] + - - [1024, 3466, 1, 4096, 1024, 1024, 4096, 4096] + - [17, 8448.0] + - - [4096, 3475, 1, 1024, 4096, 4096, 1024, 1024] + - [16, 10418.0] + - - [4096, 3524, 1, 1024, 4096, 4096, 1024, 1024] + - [18, 10772.0] + - - [4096, 2985, 1, 1024, 4096, 4096, 1024, 1024] + - [16, 11015.0] + - - [4096, 3222, 1, 1024, 4096, 4096, 1024, 1024] + - [18, 10939.0] + - - [4096, 3451, 1, 1024, 4096, 4096, 1024, 1024] + - [16, 10991.0] + - - [1024, 3181, 1, 4096, 1024, 1024, 4096, 4096] + - [62, 8471.0] + - - [1024, 3640, 1, 4096, 1024, 1024, 4096, 4096] + - [62, 8361.0] + - - [1024, 3375, 1, 4096, 1024, 1024, 4096, 4096] + - [43, 8667.0] + - - [1024, 3550, 1, 4096, 1024, 1024, 4096, 4096] + - [62, 8541.0] + - - [1024, 4020, 1, 1024, 1024, 1024, 1024, 1024] + - [38, 10433.0] + - - [4096, 3349, 1, 1024, 4096, 4096, 1024, 1024] + - [16, 10645.0] + - - [4096, 3398, 1, 1024, 4096, 4096, 1024, 1024] + - [16, 10790.0] + - - [33708, 3976, 1, 1024, 33708, 33708, 1024, 1024] + - [40, 10604.0] + - - [1024, 2917, 1, 4096, 1024, 1024, 4096, 4096] + - [40, 9719.0] + - - [33708, 3910, 1, 1024, 33708, 33708, 1024, 1024] + - [60, 10613.0] + - - [4096, 3860, 1, 1024, 4096, 4096, 1024, 1024] + - [50, 9687.0] + - - [4096, 3304, 1, 1024, 4096, 4096, 1024, 1024] + - [18, 11029.0] + - - [1024, 3286, 1, 4096, 1024, 1024, 4096, 4096] + - [62, 8473.0] + - - [1024, 3460, 1, 4096, 1024, 1024, 4096, 4096] + - [62, 8260.0] + - - [1024, 4026, 1, 4096, 1024, 1024, 4096, 4096] + - [21, 7936.0] + - - [4096, 3471, 1, 1024, 4096, 4096, 1024, 1024] + - [16, 10517.0] + - - [193, 193, 320, 64, 193, 193, 64, 64] + - [50, 3769.0] + - - [1024, 3894, 1, 1024, 1024, 1024, 1024, 1024] + - [16, 10405.0] + - - [1024, 3506, 1, 4096, 1024, 1024, 4096, 4096] + - [42, 8313.0] + - - [1024, 4000, 1, 1024, 1024, 1024, 1024, 1024] + - [38, 10349.0] + - - [1024, 3900, 1, 4096, 1024, 1024, 4096, 4096] + - [42, 7904.0] + - - [1024, 3445, 1, 4096, 1024, 1024, 4096, 4096] + - [62, 8775.0] + - - [4096, 3442, 1, 1024, 4096, 4096, 1024, 1024] + - [49, 9909.0] + - - [1024, 3358, 1, 4096, 1024, 1024, 4096, 4096] + - [21, 8667.0] + - - [1024, 3211, 1, 4096, 1024, 1024, 4096, 4096] + - [62, 8181.0] + - - [4096, 3515, 1, 1024, 4096, 4096, 1024, 1024] + - [18, 10737.0] + - - [1024, 3564, 1, 4096, 1024, 1024, 4096, 4096] + - [58, 8780.0] + - - [4096, 3057, 1, 1024, 4096, 4096, 1024, 1024] + - [16, 11283.0] + - - [1024, 3343, 1, 4096, 1024, 1024, 4096, 4096] + - [62, 8561.0] + - - [4096, 3262, 1, 1024, 4096, 4096, 1024, 1024] + - [18, 11078.0] + - - [1024, 3518, 1, 4096, 1024, 1024, 4096, 4096] + - [62, 8346.0] + - - [33708, 3876, 1, 1024, 33708, 33708, 1024, 1024] + - [51, 11175.0] + - - [4096, 3462, 1, 1024, 4096, 4096, 1024, 1024] + - [16, 10589.0] + - - [1024, 3265, 1, 4096, 1024, 1024, 4096, 4096] + - [62, 8361.0] + - - [4096, 3389, 1, 1024, 4096, 4096, 1024, 1024] + - [16, 10856.0] + - - [4096, 3438, 1, 1024, 4096, 4096, 1024, 1024] + - [16, 10801.0] + - - [1024, 3955, 1, 1024, 1024, 1024, 1024, 1024] + - [38, 10405.0] + - - [1024, 3545, 1, 4096, 1024, 1024, 4096, 4096] + - [62, 8288.0] + - - [1024, 3144, 1, 4096, 1024, 1024, 4096, 4096] + - [62, 8696.0] + - - [1024, 3417, 1, 4096, 1024, 1024, 4096, 4096] + - [62, 8593.0] + - - [4096, 3543, 1, 1024, 4096, 4096, 1024, 1024] + - [16, 10621.0] + - - [4096, 3352, 1, 1024, 4096, 4096, 1024, 1024] + - [16, 11383.0] + - - [33708, 3975, 1, 1024, 33708, 33708, 1024, 1024] + - [60, 10880.0] + - - [148, 147, 432, 64, 148, 148, 64, 64] + - [22, 4586.0] + - - [4096, 3137, 1, 1024, 4096, 4096, 1024, 1024] + - [16, 11191.0] + - - [4096, 3506, 1, 1024, 4096, 4096, 1024, 1024] + - [18, 10562.0] + - - [1024, 3975, 1, 1024, 1024, 1024, 1024, 1024] + - [40, 9933.0] + - - [1024, 3859, 1, 4096, 1024, 1024, 4096, 4096] + - [62, 7989.0] + - - [4096, 3369, 1, 1024, 4096, 4096, 1024, 1024] + - [18, 11021.0] + - - [1024, 3434, 1, 4096, 1024, 1024, 4096, 4096] + - [62, 8513.0] + - - [1024, 3292, 1, 4096, 1024, 1024, 4096, 4096] + - [62, 8422.0] + - - [4096, 3523, 1, 1024, 4096, 4096, 1024, 1024] + - [16, 10688.0] + - - [4096, 3380, 1, 1024, 4096, 4096, 1024, 1024] + - [16, 10865.0] + - - [1024, 3408, 1, 4096, 1024, 1024, 4096, 4096] + - [62, 8736.0] + - - [4096, 3221, 1, 1024, 4096, 4096, 1024, 1024] + - [16, 10876.0] + - - [4096, 3270, 1, 1024, 4096, 4096, 1024, 1024] + - [16, 11000.0] + - - [143, 143, 432, 64, 143, 143, 64, 64] + - [33, 4158.0] + - - [1024, 3303, 1, 4096, 1024, 1024, 4096, 4096] + - [62, 8464.0] + - - [4096, 3502, 1, 1024, 4096, 4096, 1024, 1024] + - [18, 10710.0] + - - [1024, 3222, 1, 4096, 1024, 1024, 4096, 4096] + - [40, 8830.0] + - - [4096, 2505, 1, 1024, 4096, 4096, 1024, 1024] + - [18, 11063.0] + - - [4096, 3397, 1, 1024, 4096, 4096, 1024, 1024] + - [16, 10820.0] + - - [4096, 3562, 1, 1024, 4096, 4096, 1024, 1024] + - [16, 11696.0] + - - [4096, 3095, 1, 1024, 4096, 4096, 1024, 1024] + - [16, 10909.0] + - - [1024, 3226, 1, 4096, 1024, 1024, 4096, 4096] + - [39, 8935.0] + - - [177, 177, 352, 64, 177, 177, 64, 64] + - [33, 4157.0] + - - [4096, 3360, 1, 1024, 4096, 4096, 1024, 1024] + - [16, 10718.0] + - - [1024, 3942, 1, 1024, 1024, 1024, 1024, 1024] + - [38, 10225.0] + - - [1024, 3298, 1, 4096, 1024, 1024, 4096, 4096] + - [62, 8518.0] + - - [1024, 3381, 1, 4096, 1024, 1024, 4096, 4096] + - [62, 8516.0] + - - [4096, 3314, 1, 1024, 4096, 4096, 1024, 1024] + - [16, 11024.0] + - - [1024, 3492, 1, 4096, 1024, 1024, 4096, 4096] + - [62, 8177.0] + - - [1024, 3430, 1, 4096, 1024, 1024, 4096, 4096] + - [62, 8645.0] + - - [4096, 3977, 1, 1024, 4096, 4096, 1024, 1024] + - [18, 8369.0] + - - [4096, 3546, 1, 1024, 4096, 4096, 1024, 1024] + - [16, 10693.0] + - - [4096, 3640, 1, 1024, 4096, 4096, 1024, 1024] + - [16, 10254.0] + - - [4096, 3441, 1, 1024, 4096, 4096, 1024, 1024] + - [16, 10889.0] + - - [33708, 4059, 1, 1024, 33708, 33708, 1024, 1024] + - [60, 10954.0] + - - [1024, 3978, 1, 1024, 1024, 1024, 1024, 1024] + - [38, 10321.0] + - - [1024, 3376, 1, 4096, 1024, 1024, 4096, 4096] + - [62, 8456.0] + - - [1024, 3482, 1, 4096, 1024, 1024, 4096, 4096] + - [62, 8198.0] + - - [1024, 3563, 1, 4096, 1024, 1024, 4096, 4096] + - [17, 8695.0] + - - [4096, 4020, 1, 1024, 4096, 4096, 1024, 1024] + - [16, 8752.0] + - - [1024, 3271, 1, 4096, 1024, 1024, 4096, 4096] + - [62, 8418.0] + - - [1024, 3291, 1, 4096, 1024, 1024, 4096, 4096] + - [62, 8399.0] + - - [1024, 3431, 1, 4096, 1024, 1024, 4096, 4096] + - [38, 10100.0] + - - [1024, 3481, 1, 4096, 1024, 1024, 4096, 4096] + - [62, 8325.0] + - - [4096, 3461, 1, 1024, 4096, 4096, 1024, 1024] + - [16, 10719.0] + - - [1024, 3574, 1, 4096, 1024, 1024, 4096, 4096] + - [62, 8592.0] + - - [1024, 4059, 1, 1024, 1024, 1024, 1024, 1024] + - [38, 10395.0] + - - [1024, 3421, 1, 4096, 1024, 1024, 4096, 4096] + - [62, 8725.0] + - - [4096, 3224, 1, 1024, 4096, 4096, 1024, 1024] + - [18, 10991.0] + - - [4096, 3437, 1, 1024, 4096, 4096, 1024, 1024] + - [16, 10764.0] + - - [4096, 3168, 1, 1024, 4096, 4096, 1024, 1024] + - [16, 11660.0] + - - [33708, 3990, 1, 1024, 33708, 33708, 1024, 1024] + - [58, 11429.0] + - - [1024, 3349, 1, 4096, 1024, 1024, 4096, 4096] + - [62, 8323.0] + - - [4096, 3335, 1, 1024, 4096, 4096, 1024, 1024] + - [16, 10686.0] + - - [4096, 3400, 1, 1024, 4096, 4096, 1024, 1024] + - [16, 10863.0] + - - [160, 159, 400, 64, 160, 160, 64, 64] + - [53, 4126.0] + - - [1024, 3398, 1, 4096, 1024, 1024, 4096, 4096] + - [62, 8499.0] + - - [1024, 3780, 1, 4096, 1024, 1024, 4096, 4096] + - [62, 8476.0] + - - [4096, 3098, 1, 1024, 4096, 4096, 1024, 1024] + - [16, 11023.0] + - - [1024, 4012, 1, 4096, 1024, 1024, 4096, 4096] + - [58, 9063.0] + - - [4096, 3505, 1, 1024, 4096, 4096, 1024, 1024] + - [16, 10710.0] + - - [4096, 3554, 1, 1024, 4096, 4096, 1024, 1024] + - [18, 10531.0] + - - [4096, 3063, 1, 1024, 4096, 4096, 1024, 1024] + - [16, 11231.0] + - - [1024, 3503, 1, 4096, 1024, 1024, 4096, 4096] + - [62, 8258.0] + - - [1024, 3166, 1, 4096, 1024, 1024, 4096, 4096] + - [62, 9206.0] + - - [1024, 3425, 1, 4096, 1024, 1024, 4096, 4096] + - [63, 8794.0] + - - [1024, 3344, 1, 4096, 1024, 1024, 4096, 4096] + - [62, 8379.0] + - - [4096, 3484, 1, 1024, 4096, 4096, 1024, 1024] + - [16, 10814.0] + - - [1024, 3681, 1, 1024, 1024, 1024, 1024, 1024] + - [40, 10891.0] + - - [1024, 4050, 1, 1024, 1024, 1024, 1024, 1024] + - [38, 10498.0] + - - [4096, 3379, 1, 1024, 4096, 4096, 1024, 1024] + - [16, 10781.0] + - - [4096, 3428, 1, 1024, 4096, 4096, 1024, 1024] + - [16, 10927.0] + - - [1024, 3304, 1, 4096, 1024, 1024, 4096, 4096] + - [62, 8491.0] + - - [1024, 3387, 1, 4096, 1024, 1024, 4096, 4096] + - [62, 8531.0] + - - [4096, 3126, 1, 1024, 4096, 4096, 1024, 1024] + - [16, 11075.0] + - - [1024, 3498, 1, 4096, 1024, 1024, 4096, 4096] + - [62, 8333.0] + - - [1024, 3436, 1, 4096, 1024, 1024, 4096, 4096] + - [62, 8740.0] + - - [4096, 3501, 1, 1024, 4096, 4096, 1024, 1024] + - [16, 10779.0] + - - [4096, 3358, 1, 1024, 4096, 4096, 1024, 1024] + - [16, 10833.0] + - - [4096, 3232, 1, 1024, 4096, 4096, 1024, 1024] + - [16, 10871.0] + - - [1024, 3585, 1, 4096, 1024, 1024, 4096, 4096] + - [16, 8563.0] + - - [4096, 3143, 1, 1024, 4096, 4096, 1024, 1024] + - [16, 11113.0] + - - [4096, 3464, 1, 1024, 4096, 4096, 1024, 1024] + - [16, 10639.0] + - - [1024, 3366, 1, 4096, 1024, 1024, 4096, 4096] + - [62, 8485.0] + - - [4096, 3375, 1, 1024, 4096, 4096, 1024, 1024] + - [16, 10827.0] + - - [4096, 2917, 1, 1024, 4096, 4096, 1024, 1024] + - [16, 11207.0] + - - [4096, 4026, 1, 1024, 4096, 4096, 1024, 1024] + - [16, 8613.0] + - - [1024, 3277, 1, 4096, 1024, 1024, 4096, 4096] + - [62, 8426.0] + - - [1024, 3103, 1, 4096, 1024, 1024, 4096, 4096] + - [42, 8756.0] + - - [33708, 3995, 1, 1024, 33708, 33708, 1024, 1024] + - [60, 10914.0] + - - [1024, 3297, 1, 4096, 1024, 1024, 4096, 4096] + - [62, 8412.0] + - - [4096, 3545, 1, 1024, 4096, 4096, 1024, 1024] + - [18, 10391.0] + - - [1024, 3399, 1, 4096, 1024, 1024, 4096, 4096] + - [18, 9641.0] + - - [33708, 3796, 1, 1024, 33708, 33708, 1024, 1024] + - [60, 10910.0] + - - [4096, 3292, 1, 1024, 4096, 4096, 1024, 1024] + - [16, 11088.0] + - - [33708, 3859, 1, 1024, 33708, 33708, 1024, 1024] + - [58, 10460.0] + - - [4096, 3566, 1, 1024, 4096, 4096, 1024, 1024] + - [16, 10699.0] + - - [4096, 3894, 1, 1024, 4096, 4096, 1024, 1024] + - [16, 8756.0] + - - [4096, 3492, 1, 1024, 4096, 4096, 1024, 1024] + - [16, 10672.0] + - - [1024, 3977, 1, 1024, 1024, 1024, 1024, 1024] + - [38, 10461.0] + - - [1024, 3272, 1, 4096, 1024, 1024, 4096, 4096] + - [62, 8361.0] + - - [135, 134, 480, 64, 135, 135, 64, 64] + - [53, 3542.0] + - - [1024, 3355, 1, 4096, 1024, 1024, 4096, 4096] + - [62, 8520.0] + - - [4096, 3419, 1, 1024, 4096, 4096, 1024, 1024] + - [16, 10949.0] + - - [1024, 3404, 1, 4096, 1024, 1024, 4096, 4096] + - [62, 8679.0] + - - [4096, 3999, 1, 1024, 4096, 4096, 1024, 1024] + - [16, 11492.0] + - - [4096, 3166, 1, 1024, 4096, 4096, 1024, 1024] + - [16, 11243.0] + - - [33708, 3840, 1, 1024, 33708, 33708, 1024, 1024] + - [60, 11933.0] + - - [4096, 4032, 1, 1024, 4096, 4096, 1024, 1024] + - [29, 9606.0] + - - [1024, 3573, 1, 4096, 1024, 1024, 4096, 4096] + - [62, 8418.0] + - - [4096, 3366, 1, 1024, 4096, 4096, 1024, 1024] + - [16, 10778.0] + - - [1024, 3541, 1, 4096, 1024, 1024, 4096, 4096] + - [62, 8298.0] + - - [4096, 3207, 1, 1024, 4096, 4096, 1024, 1024] + - [18, 10939.0] + - - [4096, 3272, 1, 1024, 4096, 4096, 1024, 1024] + - [16, 11024.0] + - - [1024, 3334, 1, 4096, 1024, 1024, 4096, 4096] + - [62, 8452.0] + - - [228, 228, 272, 64, 228, 228, 64, 64] + - [57, 5968.0] + - - [4096, 3183, 1, 1024, 4096, 4096, 1024, 1024] + - [16, 11234.0] + - - [4096, 3536, 1, 1024, 4096, 4096, 1024, 1024] + - [16, 10716.0] + - - [1024, 4005, 1, 1024, 1024, 1024, 1024, 1024] + - [38, 10335.0] + - - [1024, 3245, 1, 4096, 1024, 1024, 4096, 4096] + - [62, 8391.0] + - - [4096, 3447, 1, 1024, 4096, 4096, 1024, 1024] + - [16, 10761.0] + - - [1024, 3183, 1, 4096, 1024, 1024, 4096, 4096] + - [62, 8667.0] + - - [1024, 3361, 1, 4096, 1024, 1024, 4096, 4096] + - [62, 8525.0] + - - [33708, 3870, 1, 1024, 33708, 33708, 1024, 1024] + - [60, 10613.0] + - - [1024, 3321, 1, 4096, 1024, 1024, 4096, 4096] + - [62, 8447.0] + - - [1024, 3486, 1, 4096, 1024, 1024, 4096, 4096] + - [62, 8285.0] + - - [4096, 4005, 1, 1024, 4096, 4096, 1024, 1024] + - [16, 8519.0] + - - [4096, 3410, 1, 1024, 4096, 4096, 1024, 1024] + - [16, 10852.0] + - - [1024, 3944, 1, 1024, 1024, 1024, 1024, 1024] + - [38, 10229.0] + - - [4096, 3300, 1, 1024, 4096, 4096, 1024, 1024] + - [18, 11105.0] + - - [4096, 3579, 1, 1024, 4096, 4096, 1024, 1024] + - [18, 9801.0] + - - [4096, 3483, 1, 1024, 4096, 4096, 1024, 1024] + - [16, 10654.0] + - - [4096, 3532, 1, 1024, 4096, 4096, 1024, 1024] + - [18, 10704.0] + - - [1024, 3140, 1, 4096, 1024, 1024, 4096, 4096] + - [62, 8564.0] + - - [1024, 3372, 1, 4096, 1024, 1024, 4096, 4096] + - [18, 9362.0] + - - [1024, 3224, 1, 4096, 1024, 1024, 4096, 4096] + - [20, 8536.0] + - - [4096, 3230, 1, 1024, 4096, 4096, 1024, 1024] + - [16, 10850.0] + - - [4096, 3427, 1, 1024, 4096, 4096, 1024, 1024] + - [16, 10826.0] + - - [1024, 3796, 1, 1024, 1024, 1024, 1024, 1024] + - [40, 10819.0] + - - [143, 148, 432, 64, 143, 143, 64, 64] + - [53, 4430.0] + - - [1024, 3616, 1, 4096, 1024, 1024, 4096, 4096] + - [42, 8568.0] + - - [1024, 3315, 1, 4096, 1024, 1024, 4096, 4096] + - [62, 8358.0] + - - [1024, 3476, 1, 4096, 1024, 1024, 4096, 4096] + - [62, 8250.0] + - - [1024, 3509, 1, 4096, 1024, 1024, 4096, 4096] + - [62, 8198.0] + - - [4096, 3357, 1, 1024, 4096, 4096, 1024, 1024] + - [16, 10788.0] + - - [4096, 3406, 1, 1024, 4096, 4096, 1024, 1024] + - [16, 10843.0] + - - [1024, 3558, 1, 4096, 1024, 1024, 4096, 4096] + - [62, 8306.0] + - - [4096, 3593, 1, 1024, 4096, 4096, 1024, 1024] + - [18, 9835.0] + - - [4096, 3247, 1, 1024, 4096, 4096, 1024, 1024] + - [18, 10972.0] + - - [4096, 3088, 1, 1024, 4096, 4096, 1024, 1024] + - [16, 10924.0] + - - [1024, 3213, 1, 4096, 1024, 1024, 4096, 4096] + - [62, 8369.0] + - - [4096, 3511, 1, 1024, 4096, 4096, 1024, 1024] + - [18, 11295.0] + - - [1024, 3365, 1, 4096, 1024, 1024, 4096, 4096] + - [62, 8568.0] + - - [1024, 3504, 1, 4096, 1024, 1024, 4096, 4096] + - [62, 8390.0] + - - [1024, 3442, 1, 4096, 1024, 1024, 4096, 4096] + - [21, 8648.0] + - - [4096, 3474, 1, 1024, 4096, 4096, 1024, 1024] + - [16, 10715.0] + - - [4096, 2984, 1, 1024, 4096, 4096, 1024, 1024] + - [16, 10996.0] + - - [1024, 3876, 1, 4096, 1024, 1024, 4096, 4096] + - [42, 8045.0] + - - [4096, 3337, 1, 1024, 4096, 4096, 1024, 1024] + - [16, 10666.0] + - - [4096, 3450, 1, 1024, 4096, 4096, 1024, 1024] + - [18, 10187.0] + - - [1024, 3547, 1, 4096, 1024, 1024, 4096, 4096] + - [62, 8487.0] + - - [4096, 3291, 1, 1024, 4096, 4096, 1024, 1024] + - [18, 11137.0] + - - [1024, 3340, 1, 4096, 1024, 1024, 4096, 4096] + - [62, 8384.0] + - - [4096, 3491, 1, 1024, 4096, 4096, 1024, 1024] + - [16, 10762.0] + - - [4096, 3348, 1, 1024, 4096, 4096, 1024, 1024] + - [16, 10676.0] + - - [4096, 3906, 1, 1024, 4096, 4096, 1024, 1024] + - [16, 9518.0] + - - [1024, 3477, 1, 4096, 1024, 1024, 4096, 4096] + - [58, 8358.0] + - - [1024, 3397, 1, 4096, 1024, 1024, 4096, 4096] + - [62, 8523.0] + - - [4096, 3165, 1, 1024, 4096, 4096, 1024, 1024] + - [16, 11088.0] + - - [4096, 3470, 1, 1024, 4096, 4096, 1024, 1024] + - [16, 10629.0] + - - [1024, 3526, 1, 4096, 1024, 1024, 4096, 4096] + - [62, 8459.0] + - - [4096, 3365, 1, 1024, 4096, 4096, 1024, 1024] + - [16, 10847.0] + - - [4096, 3319, 1, 1024, 4096, 4096, 1024, 1024] + - [18, 11611.0] + - - [1024, 3401, 1, 4096, 1024, 1024, 4096, 4096] + - [59, 8846.0] + - - [1024, 3294, 1, 4096, 1024, 1024, 4096, 4096] + - [62, 8351.0] + - - [159, 159, 400, 64, 159, 159, 64, 64] + - [33, 3782.0] + - - [1024, 3472, 1, 4096, 1024, 1024, 4096, 4096] + - [62, 8293.0] + - - [4096, 3328, 1, 1024, 4096, 4096, 1024, 1024] + - [16, 11077.0] + - - [1024, 3861, 1, 1024, 1024, 1024, 1024, 1024] + - [38, 10108.0] + - - [1024, 3910, 1, 1024, 1024, 1024, 1024, 1024] + - [38, 10717.0] + - - [1024, 3410, 1, 4096, 1024, 1024, 4096, 4096] + - [31, 8426.0] + - - [1024, 3395, 1, 4096, 1024, 1024, 4096, 4096] + - [62, 8571.0] + - - [4096, 3282, 1, 1024, 4096, 4096, 1024, 1024] + - [18, 11112.0] + - - [1024, 3751, 1, 1024, 1024, 1024, 1024, 1024] + - [18, 11025.0] + - - [4096, 3145, 1, 1024, 4096, 4096, 1024, 1024] + - [16, 11249.0] + - - [4096, 3514, 1, 1024, 4096, 4096, 1024, 1024] + - [16, 10749.0] + - - [4096, 3944, 1, 1024, 4096, 4096, 1024, 1024] + - [16, 9019.0] + - - [1024, 3515, 1, 4096, 1024, 1024, 4096, 4096] + - [62, 8281.0] + - - [4096, 3409, 1, 1024, 4096, 4096, 1024, 1024] + - [16, 10878.0] + - - [4096, 3564, 1, 1024, 4096, 4096, 1024, 1024] + - [18, 10607.0] + - - [4096, 3299, 1, 1024, 4096, 4096, 1024, 1024] + - [16, 10986.0] + - - [1024, 3057, 1, 4096, 1024, 1024, 4096, 4096] + - [40, 9083.0] + - - [4096, 3531, 1, 1024, 4096, 4096, 1024, 1024] + - [16, 10717.0] + - - [4096, 3388, 1, 1024, 4096, 4096, 1024, 1024] + - [16, 10817.0] + - - [1024, 3189, 1, 4096, 1024, 1024, 4096, 4096] + - [62, 8603.0] + - - [1024, 3300, 1, 4096, 1024, 1024, 4096, 4096] + - [62, 8370.0] + - - [1024, 3720, 1, 4096, 1024, 1024, 4096, 4096] + - [62, 8523.0] + - - [1024, 3383, 1, 4096, 1024, 1024, 4096, 4096] + - [62, 8462.0] + - - [1024, 3494, 1, 4096, 1024, 1024, 4096, 4096] + - [62, 8295.0] + - - [1024, 3448, 1, 4096, 1024, 1024, 4096, 4096] + - [62, 8512.0] + - - [4096, 3542, 1, 1024, 4096, 4096, 1024, 1024] + - [16, 10697.0] + - - [1024, 3488, 1, 4096, 1024, 1024, 4096, 4096] + - [62, 8330.0] + - - [4096, 3405, 1, 1024, 4096, 4096, 1024, 1024] + - [16, 10854.0] + - - [1024, 3262, 1, 4096, 1024, 1024, 4096, 4096] + - [42, 8216.0] + - - [33708, 4005, 1, 1024, 33708, 33708, 1024, 1024] + - [40, 11931.0] + - - [1024, 3594, 1, 4096, 1024, 1024, 4096, 4096] + - [62, 8481.0] + - - [4096, 3103, 1, 1024, 4096, 4096, 1024, 1024] + - [16, 11031.0] + - - [4096, 3136, 1, 1024, 4096, 4096, 1024, 1024] + - [16, 11079.0] + - - [1024, 3378, 1, 4096, 1024, 1024, 4096, 4096] + - [58, 9427.0] + - - [4096, 3559, 1, 1024, 4096, 4096, 1024, 1024] + - [18, 10711.0] + - - [4096, 3368, 1, 1024, 4096, 4096, 1024, 1024] + - [16, 10753.0] + - - [4096, 3209, 1, 1024, 4096, 4096, 1024, 1024] + - [18, 10868.0] + - - [4096, 3322, 1, 1024, 4096, 4096, 1024, 1024] + - [18, 10983.0] + - - [1024, 3483, 1, 4096, 1024, 1024, 4096, 4096] + - [58, 8190.0] + - - [4096, 3473, 1, 1024, 4096, 4096, 1024, 1024] + - [16, 10668.0] + - - [4096, 3522, 1, 1024, 4096, 4096, 1024, 1024] + - [18, 10643.0] + - - [1024, 3532, 1, 4096, 1024, 1024, 4096, 4096] + - [42, 8414.0] + - - [4096, 3449, 1, 1024, 4096, 4096, 1024, 1024] + - [16, 10870.0] + - - [1024, 3351, 1, 4096, 1024, 1024, 4096, 4096] + - [62, 8286.0] + - - [1024, 3462, 1, 4096, 1024, 1024, 4096, 4096] + - [62, 8341.0] + - - [4096, 3396, 1, 1024, 4096, 4096, 1024, 1024] + - [16, 10843.0] + - - [132, 132, 480, 64, 132, 132, 64, 64] + - [44, 3756.0] + - - [1024, 3416, 1, 4096, 1024, 1024, 4096, 4096] + - [62, 8531.0] + - - [4096, 3469, 1, 1024, 4096, 4096, 1024, 1024] + - [16, 10733.0] + - - [1024, 3582, 1, 4096, 1024, 1024, 4096, 4096] + - [60, 10323.0] + - - [1024, 3230, 1, 4096, 1024, 1024, 4096, 4096] + - [62, 8426.0] + - - [1024, 3489, 1, 4096, 1024, 1024, 4096, 4096] + - [62, 8343.0] + - - [1024, 3427, 1, 4096, 1024, 1024, 4096, 4096] + - [62, 8629.0] + - - [1024, 3346, 1, 4096, 1024, 1024, 4096, 4096] + - [62, 8410.0] + - - [33708, 3977, 1, 1024, 33708, 33708, 1024, 1024] + - [58, 10564.0] + - - [4096, 3796, 1, 1024, 4096, 4096, 1024, 1024] + - [16, 9231.0] + - - [4096, 3176, 1, 1024, 4096, 4096, 1024, 1024] + - [16, 11226.0] + - - [4096, 3990, 1, 1024, 4096, 4096, 1024, 1024] + - [42, 8552.0] + - - [1024, 3257, 1, 4096, 1024, 1024, 4096, 4096] + - [62, 8396.0] + - - [4096, 3343, 1, 1024, 4096, 4096, 1024, 1024] + - [16, 10706.0] + - - [4096, 3440, 1, 1024, 4096, 4096, 1024, 1024] + - [16, 10700.0] + - - [33708, 4030, 1, 1024, 33708, 33708, 1024, 1024] + - [60, 10846.0] + - - [1024, 3190, 1, 4096, 1024, 1024, 4096, 4096] + - [62, 8404.0] + - - [1024, 3389, 1, 4096, 1024, 1024, 4096, 4096] + - [17, 8511.0] + - - [1024, 3500, 1, 4096, 1024, 1024, 4096, 4096] + - [62, 8294.0] + - - [1024, 3471, 1, 4096, 1024, 1024, 4096, 4096] + - [62, 8136.0] + - - [1024, 3438, 1, 4096, 1024, 1024, 4096, 4096] + - [62, 8467.0] + - - [4096, 3513, 1, 1024, 4096, 4096, 1024, 1024] + - [16, 10758.0] + - - [1024, 3562, 1, 4096, 1024, 1024, 4096, 4096] + - [62, 8448.0] + - - [4096, 3616, 1, 1024, 4096, 4096, 1024, 1024] + - [16, 10326.0] + - - [4096, 3955, 1, 1024, 4096, 4096, 1024, 1024] + - [16, 9179.0] + - - [1024, 3441, 1, 4096, 1024, 1024, 4096, 4096] + - [58, 9584.0] + - - [1024, 3236, 1, 4096, 1024, 1024, 4096, 4096] + - [62, 8320.0] + - - [1024, 3524, 1, 4096, 1024, 1024, 4096, 4096] + - [42, 8329.0] + - - [4096, 3460, 1, 1024, 4096, 4096, 1024, 1024] + - [18, 10695.0] + - - [1024, 3384, 1, 4096, 1024, 1024, 4096, 4096] + - [31, 8339.0] + - - [4096, 3387, 1, 1024, 4096, 4096, 1024, 1024] + - [16, 10854.0] + - - [4096, 3436, 1, 1024, 4096, 4096, 1024, 1024] + - [16, 10764.0] + - - [4096, 3277, 1, 1024, 4096, 4096, 1024, 1024] + - [16, 11041.0] + - - [1024, 3457, 1, 4096, 1024, 1024, 4096, 4096] + - [62, 8097.0] + - - [1024, 3999, 1, 4096, 1024, 1024, 4096, 4096] + - [49, 7660.0] + - - [1024, 4032, 1, 4096, 1024, 1024, 4096, 4096] + - [31, 8163.0] + - - [4096, 3541, 1, 1024, 4096, 4096, 1024, 1024] + - [16, 10628.0] + - - [4096, 3334, 1, 1024, 4096, 4096, 1024, 1024] + - [16, 10724.0] + - - [1024, 3393, 1, 4096, 1024, 1024, 4096, 4096] + - [31, 8509.0] + - - [1024, 3411, 1, 4096, 1024, 1024, 4096, 4096] + - [62, 8613.0] + - - [1024, 3822, 1, 1024, 1024, 1024, 1024, 1024] + - [40, 10813.0] + - - [1024, 3593, 1, 4096, 1024, 1024, 4096, 4096] + - [62, 8292.0] + - - [33708, 3822, 1, 1024, 33708, 33708, 1024, 1024] + - [60, 10968.0] + - - [4096, 3504, 1, 1024, 4096, 4096, 1024, 1024] + - [16, 10705.0] + - - [1024, 3163, 1, 4096, 1024, 1024, 4096, 4096] + - [62, 8686.0] + - - [1024, 3357, 1, 4096, 1024, 1024, 4096, 4096] + - [63, 8569.0] + - - [1024, 3906, 1, 4096, 1024, 1024, 4096, 4096] + - [62, 7855.0] + - - [4096, 3415, 1, 1024, 4096, 4096, 1024, 1024] + - [18, 11162.0] + - - [1024, 3406, 1, 4096, 1024, 1024, 4096, 4096] + - [62, 8497.0] + - - [4096, 3321, 1, 1024, 4096, 4096, 1024, 1024] + - [16, 10934.0] + - - [4096, 3584, 1, 1024, 4096, 4096, 1024, 1024] + - [16, 10587.0] + - - [1024, 2736, 1, 4096, 1024, 1024, 4096, 4096] + - [40, 9847.0] + - - [1024, 3110, 1, 4096, 1024, 1024, 4096, 4096] + - [62, 8683.0] + - - [33708, 3999, 1, 1024, 33708, 33708, 1024, 1024] + - [60, 10721.0] + - - [1024, 3093, 1, 4096, 1024, 1024, 4096, 4096] + - [62, 8723.0] + - - [4096, 3378, 1, 1024, 4096, 4096, 1024, 1024] + - [16, 10845.0] + - - [1024, 3543, 1, 4096, 1024, 1024, 4096, 4096] + - [62, 8396.0] + - - [33708, 3925, 1, 1024, 33708, 33708, 1024, 1024] + - [60, 10655.0] + - - [1024, 3352, 1, 4096, 1024, 1024, 4096, 4096] + - [62, 8453.0] + - - [4096, 3780, 1, 1024, 4096, 4096, 1024, 1024] + - [17, 10361.0] + - - [1024, 3990, 1, 4096, 1024, 1024, 4096, 4096] + - [32, 8586.0] + - - [4096, 3500, 1, 1024, 4096, 4096, 1024, 1024] + - [16, 10751.0] + - - [4096, 3996, 1, 1024, 4096, 4096, 1024, 1024] + - [16, 8797.0] + - - [1024, 3247, 1, 4096, 1024, 1024, 4096, 4096] + - [42, 8286.0] + - - [4096, 3395, 1, 1024, 4096, 4096, 1024, 1024] + - [18, 10442.0] + - - [1024, 3169, 1, 4096, 1024, 1024, 4096, 4096] + - [62, 8405.0] + - - [1024, 3088, 1, 4096, 1024, 1024, 4096, 4096] + - [62, 8737.0] + - - [1024, 3584, 1, 4096, 1024, 1024, 4096, 4096] + - [42, 8446.0] + - - [4096, 3093, 1, 1024, 4096, 4096, 1024, 1024] + - [16, 10910.0] + - - [1024, 3538, 1, 4096, 1024, 1024, 4096, 4096] + - [62, 8359.0] + - - [1024, 3996, 1, 1024, 1024, 1024, 1024, 1024] + - [40, 10016.0] + - - [1024, 3581, 1, 4096, 1024, 1024, 4096, 4096] + - [58, 8629.0] + - - [4096, 3374, 1, 1024, 4096, 4096, 1024, 1024] + - [16, 10720.0] + - - [33708, 3751, 1, 1024, 33708, 33708, 1024, 1024] + - [60, 10595.0] + - - [4096, 3215, 1, 1024, 4096, 4096, 1024, 1024] + - [18, 10797.0] + - - [4096, 3312, 1, 1024, 4096, 4096, 1024, 1024] + - [16, 11000.0] + - - [4096, 3581, 1, 1024, 4096, 4096, 1024, 1024] + - [49, 10730.0] + - - [4096, 3479, 1, 1024, 4096, 4096, 1024, 1024] + - [18, 10609.0] + - - [4096, 3544, 1, 1024, 4096, 4096, 1024, 1024] + - [16, 10625.0] + - - [1024, 3870, 1, 1024, 1024, 1024, 1024, 1024] + - [38, 10026.0] + - - [1024, 3374, 1, 4096, 1024, 1024, 4096, 4096] + - [62, 8540.0] + - - [1024, 2967, 1, 4096, 1024, 1024, 4096, 4096] + - [58, 9256.0] + - - [4096, 3455, 1, 1024, 4096, 4096, 1024, 1024] + - [16, 11009.0] + - - [4096, 3942, 1, 1024, 4096, 4096, 1024, 1024] + - [16, 9008.0] + - - [1024, 3528, 1, 4096, 1024, 1024, 4096, 4096] + - [62, 8403.0] + - - [4096, 3186, 1, 1024, 4096, 4096, 1024, 1024] + - [16, 11192.0] + - - [1024, 3976, 1, 1024, 1024, 1024, 1024, 1024] + - [38, 10284.0] + - - [1024, 3511, 1, 4096, 1024, 1024, 4096, 4096] + - [58, 8027.0] + - - [4096, 3573, 1, 1024, 4096, 4096, 1024, 1024] + - [51, 10816.0] + - - [4096, 3561, 1, 1024, 4096, 4096, 1024, 1024] + - [16, 10699.0] + - - [4096, 3418, 1, 1024, 4096, 4096, 1024, 1024] + - [16, 10888.0] + - - [33708, 3906, 1, 1024, 33708, 33708, 1024, 1024] + - [58, 10578.0] + - - [4096, 3259, 1, 1024, 4096, 4096, 1024, 1024] + - [16, 10904.0] + - - [4096, 3308, 1, 1024, 4096, 4096, 1024, 1024] + - [16, 11008.0] + - - [1024, 3419, 1, 4096, 1024, 1024, 4096, 4096] + - [62, 8656.0] + - - [1024, 3215, 1, 4096, 1024, 1024, 4096, 4096] + - [62, 8283.0] + - - [1024, 4030, 1, 4096, 1024, 1024, 4096, 4096] + - [42, 7690.0] + - - [4096, 3459, 1, 1024, 4096, 4096, 1024, 1024] + - [18, 10617.0] + - - [1024, 3572, 1, 4096, 1024, 1024, 4096, 4096] + - [59, 8571.0] + - - [1024, 3137, 1, 4096, 1024, 1024, 4096, 4096] + - [62, 8697.0] + - - [1024, 3312, 1, 4096, 1024, 1024, 4096, 4096] + - [42, 8539.0] + - - [1024, 3925, 1, 4096, 1024, 1024, 4096, 4096] + - [62, 7994.0] + - - [1024, 3453, 1, 4096, 1024, 1024, 4096, 4096] + - [20, 9138.0] + - - [4096, 3435, 1, 1024, 4096, 4096, 1024, 1024] + - [16, 10921.0] + - - [1024, 3176, 1, 4096, 1024, 1024, 4096, 4096] + - [62, 8330.0] + - - [1024, 3444, 1, 4096, 1024, 1024, 4096, 4096] + - [62, 8685.0] + - - [4096, 3975, 1, 1024, 4096, 4096, 1024, 1024] + - [16, 8872.0] + - - [4096, 3182, 1, 1024, 4096, 4096, 1024, 1024] + - [16, 11717.0] + - - [1024, 3475, 1, 4096, 1024, 1024, 4096, 4096] + - [62, 8110.0] + - - [33708, 3955, 1, 1024, 33708, 33708, 1024, 1024] + - [40, 10614.0] + - - [4096, 3446, 1, 1024, 4096, 4096, 1024, 1024] + - [16, 10921.0] + - - [1024, 3138, 1, 4096, 1024, 1024, 4096, 4096] + - [62, 8644.0] + - - [1024, 3549, 1, 4096, 1024, 1024, 4096, 4096] + - [42, 8272.0] + - - [4096, 3287, 1, 1024, 4096, 4096, 1024, 1024] + - [16, 11065.0] + - - [1024, 3342, 1, 4096, 1024, 1024, 4096, 4096] + - [62, 8502.0] + - - [4096, 3519, 1, 1024, 4096, 4096, 1024, 1024] + - [18, 10794.0] + - - [4096, 3552, 1, 1024, 4096, 4096, 1024, 1024] + - [16, 10764.0] + - - [4096, 3859, 1, 1024, 4096, 4096, 1024, 1024] + - [16, 8771.0] + - - [33708, 3969, 1, 1024, 33708, 33708, 1024, 1024] + - [60, 10838.0] + - - [1024, 3369, 1, 4096, 1024, 1024, 4096, 4096] + - [62, 8656.0] + - - [4096, 3482, 1, 1024, 4096, 4096, 1024, 1024] + - [18, 11263.0] + - - [1024, 3306, 1, 4096, 1024, 1024, 4096, 4096] + - [62, 8370.0] + - - [1024, 3474, 1, 4096, 1024, 1024, 4096, 4096] + - [62, 8316.0] + - - [4096, 3377, 1, 1024, 4096, 4096, 1024, 1024] + - [16, 10841.0] + - - [4096, 3426, 1, 1024, 4096, 4096, 1024, 1024] + - [16, 10931.0] + - - [4096, 2935, 1, 1024, 4096, 4096, 1024, 1024] + - [16, 11386.0] + - - [4096, 3267, 1, 1024, 4096, 4096, 1024, 1024] + - [16, 11051.0] + - - [1024, 3299, 1, 4096, 1024, 1024, 4096, 4096] + - [62, 8442.0] + - - [1024, 3456, 1, 4096, 1024, 1024, 4096, 4096] + - [62, 8863.0] + - - [1024, 3280, 1, 4096, 1024, 1024, 4096, 4096] + - [62, 8370.0] + - - [1024, 3555, 1, 4096, 1024, 1024, 4096, 4096] + - [42, 8380.0] + - - [4096, 3499, 1, 1024, 4096, 4096, 1024, 1024] + - [18, 10748.0] + - - [4096, 3356, 1, 1024, 4096, 4096, 1024, 1024] + - [16, 10847.0] + - - [1024, 3412, 1, 4096, 1024, 1024, 4096, 4096] + - [39, 9334.0] + - - [1024, 2984, 1, 4096, 1024, 1024, 4096, 4096] + - [40, 9629.0] + - - [4096, 3141, 1, 1024, 4096, 4096, 1024, 1024] + - [16, 11053.0] + - - [4096, 3510, 1, 1024, 4096, 4096, 1024, 1024] + - [16, 10761.0] + - - [1024, 3995, 1, 1024, 1024, 1024, 1024, 1024] + - [38, 10314.0] + - - [1024, 3517, 1, 4096, 1024, 1024, 4096, 4096] + - [62, 8417.0] + - - [1024, 3455, 1, 4096, 1024, 1024, 4096, 4096] + - [62, 8651.0] + - - [1024, 3939, 1, 1024, 1024, 1024, 1024, 1024] + - [38, 10168.0] + - - [1024, 3447, 1, 4096, 1024, 1024, 4096, 4096] + - [38, 9784.0] + - - [1024, 3969, 1, 4096, 1024, 1024, 4096, 4096] + - [42, 7681.0] + - - [4096, 3527, 1, 1024, 4096, 4096, 1024, 1024] + - [18, 10614.0] + - - [4096, 3336, 1, 1024, 4096, 4096, 1024, 1024] + - [16, 10697.0] + - - [1024, 3191, 1, 4096, 1024, 1024, 4096, 4096] + - [31, 8247.0] + - - [1024, 3302, 1, 4096, 1024, 1024, 4096, 4096] + - [62, 8474.0] + - - [1024, 3337, 1, 4096, 1024, 1024, 4096, 4096] + - [38, 9653.0] + - - [4096, 3290, 1, 1024, 4096, 4096, 1024, 1024] + - [18, 11079.0] + - - [1024, 3512, 1, 4096, 1024, 1024, 4096, 4096] + - [16, 8668.0] + - - [1024, 3433, 1, 4096, 1024, 1024, 4096, 4096] + - [62, 8522.0] + - - [4096, 3876, 1, 1024, 4096, 4096, 1024, 1024] + - [16, 9467.0] + - - [4096, 3490, 1, 1024, 4096, 4096, 1024, 1024] + - [16, 10718.0] + - - [4096, 3064, 1, 1024, 4096, 4096, 1024, 1024] + - [16, 11260.0] + - - [1024, 3508, 1, 4096, 1024, 1024, 4096, 4096] + - [62, 8415.0] + - - [1024, 3956, 1, 4096, 1024, 1024, 4096, 4096] + - [62, 7822.0] + - - [4096, 3417, 1, 1024, 4096, 4096, 1024, 1024] + - [16, 10849.0] + - - [1024, 3248, 1, 4096, 1024, 1024, 4096, 4096] + - [62, 8335.0] + - - [1024, 2499, 1, 4096, 1024, 1024, 4096, 4096] + - [40, 11201.0] + - - [1024, 3186, 1, 4096, 1024, 1024, 4096, 4096] + - [62, 8440.0] + - - [1024, 3180, 1, 4096, 1024, 1024, 4096, 4096] + - [62, 8412.0] + - - [4096, 3364, 1, 1024, 4096, 4096, 1024, 1024] + - [16, 10743.0] + - - [4096, 3976, 1, 1024, 4096, 4096, 1024, 1024] + - [21, 8744.0] + - - [4096, 3205, 1, 1024, 4096, 4096, 1024, 1024] + - [18, 10655.0] + - - [4096, 3318, 1, 1024, 4096, 4096, 1024, 1024] + - [18, 11002.0] + - - [1024, 3377, 1, 4096, 1024, 1024, 4096, 4096] + - [38, 9469.0] + - - [1024, 3485, 1, 4096, 1024, 1024, 4096, 4096] + - [39, 9090.0] + - - [4096, 3181, 1, 1024, 4096, 4096, 1024, 1024] + - [16, 11213.0] + - - [4096, 3550, 1, 1024, 4096, 4096, 1024, 1024] + - [16, 10731.0] + - - [1024, 3534, 1, 4096, 1024, 1024, 4096, 4096] + - [59, 8460.0] + - - [1024, 3860, 1, 1024, 1024, 1024, 1024, 1024] + - [38, 10061.0] + - - [160, 160, 400, 64, 160, 160, 64, 64] + - [22, 4231.0] + - - [4096, 3445, 1, 1024, 4096, 4096, 1024, 1024] + - [16, 10899.0] + - - [1024, 3391, 1, 4096, 1024, 1024, 4096, 4096] + - [62, 8653.0] + - - [1024, 3221, 1, 4096, 1024, 1024, 4096, 4096] + - [62, 8364.0] + - - [4096, 3079, 1, 1024, 4096, 4096, 1024, 1024] + - [16, 11002.0] + - - [4096, 3144, 1, 1024, 4096, 4096, 1024, 1024] + - [16, 11148.0] + - - [1024, 3270, 1, 4096, 1024, 1024, 4096, 4096] + - [62, 8365.0] + - - [1024, 3561, 1, 4096, 1024, 1024, 4096, 4096] + - [62, 8394.0] + - - [1024, 3480, 1, 4096, 1024, 1024, 4096, 4096] + - [62, 8152.0] + - - [4096, 3408, 1, 1024, 4096, 4096, 1024, 1024] + - [16, 10532.0] + - - [1024, 3418, 1, 4096, 1024, 1024, 4096, 4096] + - [62, 8662.0] + - - [4096, 3298, 1, 1024, 4096, 4096, 1024, 1024] + - [16, 11067.0] + - - [1024, 3640, 1, 1024, 1024, 1024, 1024, 1024] + - [18, 10709.0] + - - [1024, 3449, 1, 4096, 1024, 1024, 4096, 4096] + - [62, 8683.0] + - - [1024, 4020, 1, 4096, 1024, 1024, 4096, 4096] + - [62, 7748.0] + - - [4096, 3481, 1, 1024, 4096, 4096, 1024, 1024] + - [18, 10623.0] + - - [4096, 3530, 1, 1024, 4096, 4096, 1024, 1024] + - [16, 10772.0] + - - [1024, 3216, 1, 4096, 1024, 1024, 4096, 4096] + - [62, 8189.0] + - - [1024, 3491, 1, 4096, 1024, 1024, 4096, 4096] + - [62, 8319.0] + - - [1024, 3154, 1, 4096, 1024, 1024, 4096, 4096] + - [62, 8615.0] + - - [4096, 3425, 1, 1024, 4096, 4096, 1024, 1024] + - [16, 10872.0] + - - [1024, 3348, 1, 4096, 1024, 1024, 4096, 4096] + - [62, 8405.0] + - - [1024, 3415, 1, 4096, 1024, 1024, 4096, 4096] + - [62, 8409.0] + - - [1024, 4026, 1, 1024, 1024, 1024, 1024, 1024] + - [38, 10302.0] + - - [1024, 3367, 1, 4096, 1024, 1024, 4096, 4096] + - [62, 8525.0] + - - [1024, 3259, 1, 4096, 1024, 1024, 4096, 4096] + - [62, 8493.0] + - - [1024, 3894, 1, 4096, 1024, 1024, 4096, 4096] + - [62, 7907.0] + - - [4096, 3355, 1, 1024, 4096, 4096, 1024, 1024] + - [16, 10774.0] + - - [4096, 3404, 1, 1024, 4096, 4096, 1024, 1024] + - [16, 10984.0] + - - [1024, 3308, 1, 4096, 1024, 1024, 4096, 4096] + - [42, 8547.0] + - - [4096, 3245, 1, 1024, 4096, 4096, 1024, 1024] + - [18, 10965.0] + - - [1024, 3502, 1, 4096, 1024, 1024, 4096, 4096] + - [62, 8277.0] + - - [33708, 4032, 1, 1024, 33708, 33708, 1024, 1024] + - [40, 10713.0] + - - [1024, 3424, 1, 4096, 1024, 1024, 4096, 4096] + - [62, 8634.0] + - - [4096, 3509, 1, 1024, 4096, 4096, 1024, 1024] + - [18, 10587.0] + - - [4096, 3558, 1, 1024, 4096, 4096, 1024, 1024] + - [16, 10512.0] + - - [1024, 3900, 1, 1024, 1024, 1024, 1024, 1024] + - [38, 10237.0] + - - [1024, 2505, 1, 4096, 1024, 1024, 4096, 4096] + - [18, 11042.0] + - - [4096, 3472, 1, 1024, 4096, 4096, 1024, 1024] + - [16, 10677.0] + - - [1024, 3386, 1, 4096, 1024, 1024, 4096, 4096] + - [62, 8425.0] + - - [4096, 3383, 1, 1024, 4096, 4096, 1024, 1024] + - [16, 10833.0] + - - [4096, 3448, 1, 1024, 4096, 4096, 1024, 1024] + - [16, 10902.0] + - - [4096, 4030, 1, 1024, 4096, 4096, 1024, 1024] + - [16, 8939.0] + - - [4096, 3289, 1, 1024, 4096, 4096, 1024, 1024] + - [18, 11106.0] + - - [1024, 3459, 1, 4096, 1024, 1024, 4096, 4096] + - [62, 8317.0] + - - [1024, 2918, 1, 4096, 1024, 1024, 4096, 4096] + - [60, 9503.0] + - - [4096, 3489, 1, 1024, 4096, 4096, 1024, 1024] + - [16, 10759.0] + - - [4096, 3346, 1, 1024, 4096, 4096, 1024, 1024] + - [16, 10575.0] + - - [4096, 3572, 1, 1024, 4096, 4096, 1024, 1024] + - [18, 10566.0] + - - [1024, 3955, 1, 4096, 1024, 1024, 4096, 4096] + - [62, 7876.0] + - - [4096, 3236, 1, 1024, 4096, 4096, 1024, 1024] + - [18, 10859.0] + - - [4096, 3163, 1, 1024, 4096, 4096, 1024, 1024] + - [16, 11236.0] + - - [4096, 3468, 1, 1024, 4096, 4096, 1024, 1024] + - [49, 10691.0] + - - [1024, 3165, 1, 4096, 1024, 1024, 4096, 4096] + - [58, 8283.0] + - - [1024, 3276, 1, 4096, 1024, 1024, 4096, 4096] + - [62, 8423.0] + - - [1024, 3359, 1, 4096, 1024, 1024, 4096, 4096] + - [62, 8690.0] + - - [4096, 3363, 1, 1024, 4096, 4096, 1024, 1024] + - [16, 10765.0] + - - [1024, 3385, 1, 4096, 1024, 1024, 4096, 4096] + - [62, 8398.0] + - - [1024, 3207, 1, 4096, 1024, 1024, 4096, 4096] + - [62, 8282.0] + - - [1024, 3458, 1, 4096, 1024, 1024, 4096, 4096] + - [62, 8197.0] + - - [4096, 3110, 1, 1024, 4096, 4096, 1024, 1024] + - [16, 11074.0] + - - [4096, 3925, 1, 1024, 4096, 4096, 1024, 1024] + - [16, 8952.0] + - - [1024, 3975, 1, 4096, 1024, 1024, 4096, 4096] + - [21, 8247.0] + - - [4096, 3549, 1, 1024, 4096, 4096, 1024, 1024] + - [16, 10676.0] + - - [4096, 3342, 1, 1024, 4096, 4096, 1024, 1024] + - [49, 9789.0] + - - [1024, 3859, 1, 1024, 1024, 1024, 1024, 1024] + - [38, 10060.0] + - - [1024, 3497, 1, 4096, 1024, 1024, 4096, 4096] + - [62, 8206.0] + - - [4096, 3280, 1, 1024, 4096, 4096, 1024, 1024] + - [18, 11000.0] + - - [1024, 3435, 1, 4096, 1024, 1024, 4096, 4096] + - [62, 8584.0] + - - [1024, 3354, 1, 4096, 1024, 1024, 4096, 4096] + - [62, 8412.0] + - - [4096, 3191, 1, 1024, 4096, 4096, 1024, 1024] + - [16, 11059.0] + - - [4096, 3512, 1, 1024, 4096, 4096, 1024, 1024] + - [18, 10726.0] + - - [1024, 3055, 1, 4096, 1024, 1024, 4096, 4096] + - [58, 9099.0] + - - [4096, 2499, 1, 1024, 4096, 4096, 1024, 1024] + - [18, 11272.0] + - - [1024, 3233, 1, 4096, 1024, 1024, 4096, 4096] + - [62, 8475.0] + - - [4096, 3423, 1, 1024, 4096, 4096, 1024, 1024] + - [18, 11143.0] + - - [1024, 3319, 1, 4096, 1024, 1024, 4096, 4096] + - [58, 9316.0] + - - [4096, 3297, 1, 1024, 4096, 4096, 1024, 1024] + - [16, 11083.0] + - - [4096, 3154, 1, 1024, 4096, 4096, 1024, 1024] + - [16, 10881.0] + - - [1024, 3540, 1, 4096, 1024, 1024, 4096, 4096] + - [62, 8251.0] + - - [1024, 3289, 1, 4096, 1024, 1024, 4096, 4096] + - [62, 8538.0] + - - [4096, 3529, 1, 1024, 4096, 4096, 1024, 1024] + - [27, 9865.0] + - - [4096, 3386, 1, 1024, 4096, 4096, 1024, 1024] + - [16, 10889.0] + - - [4096, 3276, 1, 1024, 4096, 4096, 1024, 1024] + - [18, 11113.0] + - - [1024, 3244, 1, 4096, 1024, 1024, 4096, 4096] + - [42, 8413.0] + - - [1024, 3182, 1, 4096, 1024, 1024, 4096, 4096] + - [60, 9354.0] + - - [4096, 3540, 1, 1024, 4096, 4096, 1024, 1024] + - [18, 10728.0] + - - [1024, 3360, 1, 4096, 1024, 1024, 4096, 4096] + - [62, 8375.0] + - - [1024, 3942, 1, 4096, 1024, 1024, 4096, 4096] + - [62, 8163.0] + - - [4096, 3403, 1, 1024, 4096, 4096, 1024, 1024] + - [16, 10082.0] + - - [4096, 3101, 1, 1024, 4096, 4096, 1024, 1024] + - [16, 10989.0] + - - [4096, 2918, 1, 1024, 4096, 4096, 1024, 1024] + - [16, 11521.0] + - - [1024, 3465, 1, 4096, 1024, 1024, 4096, 4096] + - [60, 10220.0] + - - [33708, 3780, 1, 1024, 33708, 33708, 1024, 1024] + - [40, 10667.0] + - - [4096, 3557, 1, 1024, 4096, 4096, 1024, 1024] + - [18, 10672.0] + - - [4096, 3414, 1, 1024, 4096, 4096, 1024, 1024] + - [16, 10880.0] + - - [1024, 3948, 1, 1024, 1024, 1024, 1024, 1024] + - [38, 10846.0] + - - [4096, 3320, 1, 1024, 4096, 4096, 1024, 1024] + - [16, 10926.0] + - - [4096, 2765, 1, 1024, 4096, 4096, 1024, 1024] + - [16, 11092.0] + - - [1024, 3978, 1, 4096, 1024, 1024, 4096, 4096] + - [17, 7973.0] + - - [4096, 3487, 1, 1024, 4096, 4096, 1024, 1024] + - [18, 10806.0] + - - [4096, 3520, 1, 1024, 4096, 4096, 1024, 1024] + - [18, 10364.0] + - - [1024, 3139, 1, 4096, 1024, 1024, 4096, 4096] + - [43, 8459.0] + - - [1024, 3314, 1, 4096, 1024, 1024, 4096, 4096] + - [62, 8419.0] + - - [4096, 3431, 1, 1024, 4096, 4096, 1024, 1024] + - [16, 10919.0] + - - [1024, 3446, 1, 4096, 1024, 1024, 4096, 4096] + - [42, 8534.0] + - - [1024, 4059, 1, 4096, 1024, 1024, 4096, 4096] + - [42, 7985.0] + - - [4096, 3345, 1, 1024, 4096, 4096, 1024, 1024] + - [16, 10723.0] + - - [4096, 3394, 1, 1024, 4096, 4096, 1024, 1024] + - [16, 10779.0] + - - [1024, 3927, 1, 1024, 1024, 1024, 1024, 1024] + - [38, 10312.0] + - - [4096, 3235, 1, 1024, 4096, 4096, 1024, 1024] + - [16, 11371.0] + - - [1024, 3328, 1, 4096, 1024, 1024, 4096, 4096] + - [62, 8457.0] + - - [33708, 3956, 1, 1024, 33708, 33708, 1024, 1024] + - [58, 10641.0] + - - [4096, 3467, 1, 1024, 4096, 4096, 1024, 1024] + - [16, 10628.0] + - - [1024, 3287, 1, 4096, 1024, 1024, 4096, 4096] + - [62, 8451.0] + - - [4096, 3214, 1, 1024, 4096, 4096, 1024, 1024] + - [16, 10894.0] + - - [4096, 3910, 1, 1024, 4096, 4096, 1024, 1024] + - [16, 9101.0] + - - [1024, 3780, 1, 1024, 1024, 1024, 1024, 1024] + - [40, 11202.0] + - - [1024, 3371, 1, 4096, 1024, 1024, 4096, 4096] + - [62, 8497.0] + - - [4096, 3478, 1, 1024, 4096, 4096, 1024, 1024] + - [16, 10660.0] + - - [1024, 3546, 1, 4096, 1024, 1024, 4096, 4096] + - [42, 8180.0] + - - [1024, 4012, 1, 1024, 1024, 1024, 1024, 1024] + - [38, 10389.0] + - - [4096, 3341, 1, 1024, 4096, 4096, 1024, 1024] + - [27, 10131.0] + - - [4096, 3454, 1, 1024, 4096, 4096, 1024, 1024] + - [16, 10893.0] + - - [4096, 3295, 1, 1024, 4096, 4096, 1024, 1024] + - [16, 11068.0] + - - [4096, 3072, 1, 1024, 4096, 4096, 1024, 1024] + - [16, 11282.0] + - - [1024, 3282, 1, 4096, 1024, 1024, 4096, 4096] + - [62, 8347.0] + - - [33708, 3720, 1, 1024, 33708, 33708, 1024, 1024] + - [60, 10694.0] + - - [1024, 3681, 1, 4096, 1024, 1024, 4096, 4096] + - [62, 8688.0] + - - [1024, 4050, 1, 4096, 1024, 1024, 4096, 4096] + - [42, 9490.0] + - - [4096, 3495, 1, 1024, 4096, 4096, 1024, 1024] + - [18, 10544.0] + - - [4096, 3560, 1, 1024, 4096, 4096, 1024, 1024] + - [16, 10615.0] + - - [4096, 3751, 1, 1024, 4096, 4096, 1024, 1024] + - [51, 10787.0] + - - [1024, 3414, 1, 4096, 1024, 1024, 4096, 4096] + - [62, 8798.0] + - - [33708, 3860, 1, 1024, 33708, 33708, 1024, 1024] + - [58, 10455.0] + - - [1024, 3325, 1, 4096, 1024, 1024, 4096, 4096] + - [62, 8543.0] + - - [4096, 3458, 1, 1024, 4096, 4096, 1024, 1024] + - [16, 10562.0] + - - [4096, 2967, 1, 1024, 4096, 4096, 1024, 1024] + - [16, 10932.0] + - - [1024, 3519, 1, 4096, 1024, 1024, 4096, 4096] + - [38, 9370.0] + - - [4096, 3385, 1, 1024, 4096, 4096, 1024, 1024] + - [16, 10725.0] + - - [4096, 3434, 1, 1024, 4096, 4096, 1024, 1024] + - [16, 10889.0] + - - [1024, 3552, 1, 4096, 1024, 1024, 4096, 4096] + - [62, 8443.0] + - - [4096, 3822, 1, 1024, 4096, 4096, 1024, 1024] + - [16, 9754.0] + - - [1024, 3544, 1, 4096, 1024, 1024, 4096, 4096] + - [62, 8358.0] + - - [4096, 3539, 1, 1024, 4096, 4096, 1024, 1024] + - [18, 10626.0] + - - [4096, 3332, 1, 1024, 4096, 4096, 1024, 1024] + - [16, 10576.0] + - - [1024, 3145, 1, 4096, 1024, 1024, 4096, 4096] + - [42, 8686.0] + - - [1024, 3535, 1, 4096, 1024, 1024, 4096, 4096] + - [62, 8253.0] + - - [1024, 3320, 1, 4096, 1024, 1024, 4096, 4096] + - [62, 8744.0] + - - [33708, 4012, 1, 1024, 33708, 33708, 1024, 1024] + - [58, 11744.0] + - - [4096, 3286, 1, 1024, 4096, 4096, 1024, 1024] + - [16, 11149.0] + - - [1024, 3514, 1, 4096, 1024, 1024, 4096, 4096] + - [62, 8280.0] + - - [1024, 2765, 1, 4096, 1024, 1024, 4096, 4096] + - [40, 9700.0] + - - [1024, 3452, 1, 4096, 1024, 1024, 4096, 4096] + - [20, 9043.0] + - - [4096, 3518, 1, 1024, 4096, 4096, 1024, 1024] + - [16, 10834.0] + - - [1024, 3529, 1, 4096, 1024, 1024, 4096, 4096] + - [62, 8393.0] + - - [4096, 3413, 1, 1024, 4096, 4096, 1024, 1024] + - [16, 10910.0] + - - [33708, 4050, 1, 1024, 33708, 33708, 1024, 1024] + - [60, 10923.0] + - - [1024, 3525, 1, 4096, 1024, 1024, 4096, 4096] + - [38, 9239.0] + - - [4096, 3303, 1, 1024, 4096, 4096, 1024, 1024] + - [18, 11083.0] + - - [1024, 3382, 1, 4096, 1024, 1024, 4096, 4096] + - [62, 8488.0] + - - [1024, 3390, 1, 4096, 1024, 1024, 4096, 4096] + - [17, 8674.0] + - - [1024, 3977, 1, 4096, 1024, 1024, 4096, 4096] + - [37, 7898.0] + - - [1024, 3184, 1, 4096, 1024, 1024, 4096, 4096] + - [31, 9372.0] + - - [4096, 3535, 1, 1024, 4096, 4096, 1024, 1024] + - [16, 10868.0] + - - [4096, 3376, 1, 1024, 4096, 4096, 1024, 1024] + - [16, 10698.0] + - - [4096, 3978, 1, 1024, 4096, 4096, 1024, 1024] + - [16, 8750.0] + - - [1024, 3136, 1, 4096, 1024, 1024, 4096, 4096] + - [62, 8516.0] + - - [1024, 3293, 1, 4096, 1024, 1024, 4096, 4096] + - [62, 8414.0] + - - [4096, 3266, 1, 1024, 4096, 4096, 1024, 1024] + - [16, 11481.0] + - - [1024, 3487, 1, 4096, 1024, 1024, 4096, 4096] + - [62, 8312.0] + - - [1024, 3409, 1, 4096, 1024, 1024, 4096, 4096] + - [62, 8573.0] + - - [4096, 3498, 1, 1024, 4096, 4096, 1024, 1024] + - [16, 10703.0] + - - [1024, 3520, 1, 4096, 1024, 1024, 4096, 4096] + - [9, 8801.0] + - - [1024, 3530, 1, 4096, 1024, 1024, 4096, 4096] + - [62, 8400.0] + - - [4096, 3393, 1, 1024, 4096, 4096, 1024, 1024] + - [18, 10578.0] + - - [4096, 3140, 1, 1024, 4096, 4096, 1024, 1024] + - [16, 11078.0] + - - [1024, 3536, 1, 4096, 1024, 1024, 4096, 4096] + - [62, 8396.0] + - - [1024, 3288, 1, 4096, 1024, 1024, 4096, 4096] + - [62, 8444.0] + - - [1024, 4005, 1, 4096, 1024, 1024, 4096, 4096] + - [7, 8421.0] + - - [1024, 3579, 1, 4096, 1024, 1024, 4096, 4096] + - [18, 8467.0] + - - [4096, 3372, 1, 1024, 4096, 4096, 1024, 1024] + - [16, 10727.0] + - - [1024, 3440, 1, 4096, 1024, 1024, 4096, 4096] + - [62, 8515.0] + - - [4096, 3213, 1, 1024, 4096, 4096, 1024, 1024] + - [18, 10900.0] + - - [4096, 3477, 1, 1024, 4096, 4096, 1024, 1024] + - [16, 10672.0] + - - [4096, 3526, 1, 1024, 4096, 4096, 1024, 1024] + - [18, 10788.0] + - - [1024, 3493, 1, 4096, 1024, 1024, 4096, 4096] + - [60, 10299.0] + - - [1024, 3944, 1, 4096, 1024, 1024, 4096, 4096] + - [62, 7909.0] + - - [4096, 3453, 1, 1024, 4096, 4096, 1024, 1024] + - [16, 10950.0] + - - [1024, 3350, 1, 4096, 1024, 1024, 4096, 4096] + - [10, 8531.0] + - - [4096, 3184, 1, 1024, 4096, 4096, 1024, 1024] + - [16, 10828.0] + - - [1024, 3423, 1, 4096, 1024, 1024, 4096, 4096] + - [62, 8678.0] + - - [4096, 3351, 1, 1024, 4096, 4096, 1024, 1024] + - [16, 10548.0] + - - [4096, 3416, 1, 1024, 4096, 4096, 1024, 1024] + - [16, 10795.0] + - - [1024, 3796, 1, 4096, 1024, 1024, 4096, 4096] + - [40, 10563.0] + - - [4096, 3257, 1, 1024, 4096, 4096, 1024, 1024] + - [16, 11011.0] + - - [4096, 3306, 1, 1024, 4096, 4096, 1024, 1024] + - [16, 10989.0] + - - [33708, 4020, 1, 1024, 33708, 33708, 1024, 1024] + - [60, 10792.0] + - - [1024, 3426, 1, 4096, 1024, 1024, 4096, 4096] + - [62, 8497.0] + - - [4096, 3457, 1, 1024, 4096, 4096, 1024, 1024] + - [18, 11167.0] + - - [1024, 2935, 1, 4096, 1024, 1024, 4096, 4096] + - [60, 9697.0] + - - [1024, 3046, 1, 4096, 1024, 1024, 4096, 4096] + - [58, 9226.0] + - - [4096, 3433, 1, 1024, 4096, 4096, 1024, 1024] + - [16, 10897.0] + - - [1024, 3256, 1, 4096, 1024, 1024, 4096, 4096] + - [63, 8406.0] + - - [1024, 3531, 1, 4096, 1024, 1024, 4096, 4096] + - [62, 8924.0] + - - [4096, 3180, 1, 1024, 4096, 4096, 1024, 1024] + - [16, 11263.0] + - - [1024, 3388, 1, 4096, 1024, 1024, 4096, 4096] + - [62, 8580.0] + - - [4096, 3444, 1, 1024, 4096, 4096, 1024, 1024] + - [16, 10670.0] + - - [1024, 3501, 1, 4096, 1024, 1024, 4096, 4096] + - [62, 8293.0] + - - [1024, 3266, 1, 4096, 1024, 1024, 4096, 4096] + - [62, 8337.0] + - - [1024, 3267, 1, 4096, 1024, 1024, 4096, 4096] + - [62, 8341.0] + - - [1024, 3461, 1, 4096, 1024, 1024, 4096, 4096] + - [62, 8155.0] + - - [4096, 3870, 1, 1024, 4096, 4096, 1024, 1024] + - [16, 9046.0] + - - [4096, 3517, 1, 1024, 4096, 4096, 1024, 1024] + - [16, 10663.0] + - - [1024, 3566, 1, 4096, 1024, 1024, 4096, 4096] + - [62, 8438.0] + - - [4096, 3574, 1, 1024, 4096, 4096, 1024, 1024] + - [18, 10358.0] + - - [1024, 3876, 1, 1024, 1024, 1024, 1024, 1024] + - [38, 10172.0] + - - [4096, 3720, 1, 1024, 4096, 4096, 1024, 1024] + - [16, 9169.0] + - - [4096, 3248, 1, 1024, 4096, 4096, 1024, 1024] + - [16, 10935.0] + - - [4096, 4059, 1, 1024, 4096, 4096, 1024, 1024] + - [16, 8705.0] + - - [1024, 3380, 1, 4096, 1024, 1024, 4096, 4096] + - [42, 8318.0] + - - [4096, 3480, 1, 1024, 4096, 4096, 1024, 1024] + - [16, 10665.0] + - - [1024, 3335, 1, 4096, 1024, 1024, 4096, 4096] + - [62, 8321.0] + - - [1024, 3345, 1, 4096, 1024, 1024, 4096, 4096] + - [62, 8367.0] + - - [4096, 3391, 1, 1024, 4096, 4096, 1024, 1024] + - [16, 10803.0] + - - [4096, 3424, 1, 1024, 4096, 4096, 1024, 1024] + - [16, 10811.0] + - - [1024, 3394, 1, 4096, 1024, 1024, 4096, 4096] + - [62, 8446.0] + - - [4096, 3265, 1, 1024, 4096, 4096, 1024, 1024] + - [18, 11060.0] + - - [1024, 3014, 1, 4096, 1024, 1024, 4096, 4096] + - [40, 9348.0] + - - [4096, 3497, 1, 1024, 4096, 4096, 1024, 1024] + - [17, 10185.0] + - - [4096, 3354, 1, 1024, 4096, 4096, 1024, 1024] + - [16, 10693.0] + - - [4096, 3055, 1, 1024, 4096, 4096, 1024, 1024] + - [18, 11248.0] + - - [1024, 3499, 1, 4096, 1024, 1024, 4096, 4096] + - [62, 8505.0] + - - [1024, 3162, 1, 4096, 1024, 1024, 4096, 4096] + - [31, 8459.0] + - - [4096, 3244, 1, 1024, 4096, 4096, 1024, 1024] + - [18, 11002.0] + - - [1024, 3437, 1, 4096, 1024, 1024, 4096, 4096] + - [62, 8746.0] + - - [1024, 3356, 1, 4096, 1024, 1024, 4096, 4096] + - [62, 8299.0] + - - [4096, 3139, 1, 1024, 4096, 4096, 1024, 1024] + - [16, 11169.0] + - - [4096, 3508, 1, 1024, 4096, 4096, 1024, 1024] + - [16, 10623.0] + - - [1024, 3235, 1, 4096, 1024, 1024, 4096, 4096] + - [62, 8296.0] + - - [1024, 3910, 1, 4096, 1024, 1024, 4096, 4096] + - [62, 7939.0] + - - [4096, 3371, 1, 1024, 4096, 4096, 1024, 1024] + - [16, 10695.0] + - - [1024, 3751, 1, 4096, 1024, 1024, 4096, 4096] + - [42, 8554.0] + - - [4096, 3325, 1, 1024, 4096, 4096, 1024, 1024] + - [16, 11028.0] + - - [1024, 3413, 1, 4096, 1024, 1024, 4096, 4096] + - [62, 8782.0] + - - [1024, 3542, 1, 4096, 1024, 1024, 4096, 4096] + - [62, 8289.0] + - - [33708, 3900, 1, 1024, 33708, 33708, 1024, 1024] + - [58, 10674.0] + - - [4096, 3525, 1, 1024, 4096, 4096, 1024, 1024] + - [16, 10774.0] + - - [4096, 3382, 1, 1024, 4096, 4096, 1024, 1024] + - [18, 11033.0] + - - [1024, 3339, 1, 4096, 1024, 1024, 4096, 4096] + - [62, 8370.0] + - - [4096, 3288, 1, 1024, 4096, 4096, 1024, 1024] + - [18, 11061.0] + - - [1024, 3141, 1, 4096, 1024, 1024, 4096, 4096] + - [39, 9268.0] + - - [1024, 3168, 1, 4096, 1024, 1024, 4096, 4096] + - [31, 8493.0] + - - [4096, 3488, 1, 1024, 4096, 4096, 1024, 1024] + - [17, 10270.0] + - - [4096, 3046, 1, 1024, 4096, 4096, 1024, 1024] + - [18, 11227.0] + - - [1024, 3362, 1, 4096, 1024, 1024, 4096, 4096] + - [62, 8563.0] + - - [33708, 3942, 1, 1024, 33708, 33708, 1024, 1024] + - [60, 11661.0] + - - [4096, 3399, 1, 1024, 4096, 4096, 1024, 1024] + - [16, 10782.0] + - - [1024, 3720, 1, 1024, 1024, 1024, 1024, 1024] + - [40, 10604.0] + - - [4096, 3563, 1, 1024, 4096, 4096, 1024, 1024] + - [16, 10707.0] + - - [1024, 3273, 1, 4096, 1024, 1024, 4096, 4096] + - [39, 9091.0] + - - [4096, 3162, 1, 1024, 4096, 4096, 1024, 1024] + - [16, 11056.0] + - - [1024, 3467, 1, 4096, 1024, 1024, 4096, 4096] + - [62, 8351.0] + - - [1024, 3130, 1, 4096, 1024, 1024, 4096, 4096] + - [62, 8778.0] + - - [1024, 3405, 1, 4096, 1024, 1024, 4096, 4096] + - [62, 8658.0] + - - [4096, 3362, 1, 1024, 4096, 4096, 1024, 1024] + - [16, 10739.0] + - - [1024, 3960, 1, 1024, 1024, 1024, 1024, 1024] + - [38, 10280.0] + - - [1024, 3712, 1, 36548, 1024, 1024, 36548, 36548] + - [3, 11677.0] + - - [1024, 3712, 1, 1024, 1024, 1024, 1024, 1024] + - [40, 10552.0] + - - [4032, 384, 1, 64, 4032, 4032, 64, 64] + - [55, 5586.0] + - - [1024, 2048, 1, 49, 1024, 1024, 49, 49] + - [0, 5677.0] + - - [4608, 512, 1, 49, 4608, 4608, 49, 49] + - [24, 5928.0] + - - [9216, 512, 1, 4096, 9216, 9216, 4096, 4096] + - [62, 5739.0] + - - [3456, 384, 1, 289, 3456, 3456, 289, 289] + - [22, 7378.0] + - - [3456, 384, 1, 169, 3456, 3456, 169, 169] + - [44, 8108.0] + - - [4096, 512, 1, 1001, 4096, 4096, 1001, 1001] + - [34, 10597.0] + - - [384, 448, 49, 512, 384, 384, 512, 512] + - [38, 9787.0] + - - [384, 448, 64, 256, 384, 384, 256, 256] + - [58, 9874.0] + - - [384, 448, 36, 256, 384, 384, 256, 256] + - [58, 9670.0] + - - [384, 448, 49, 256, 384, 384, 256, 256] + - [49, 9869.0] + - - [384, 448, 64, 512, 384, 384, 512, 512] + - [38, 10128.0] + - - [384, 448, 36, 512, 384, 384, 512, 512] + - [16, 9852.0] + - - [1024, 6400, 1, 65, 1024, 1024, 65, 65] + - [11, 7585.0] + - - [4096, 6400, 1, 256, 4096, 4096, 256, 256] + - [16, 11680.0] + - - [512, 3194, 1, 2048, 512, 512, 2048, 2048] + - [58, 9813.0] + - - [512, 3222, 1, 2048, 512, 512, 2048, 2048] + - [18, 9777.0] + - - [512, 3234, 1, 2048, 512, 512, 2048, 2048] + - [40, 9821.0] + - - [512, 3242, 1, 2048, 512, 512, 2048, 2048] + - [40, 9822.0] + - - [512, 3257, 1, 2048, 512, 512, 2048, 2048] + - [60, 9763.0] + - - [512, 3332, 1, 2048, 512, 512, 2048, 2048] + - [18, 10145.0] + - - [512, 3336, 1, 2048, 512, 512, 2048, 2048] + - [60, 9666.0] + - - [512, 3378, 1, 2048, 512, 512, 2048, 2048] + - [18, 10238.0] + - - [512, 3396, 1, 2048, 512, 512, 2048, 2048] + - [16, 9825.0] + - - [512, 3399, 1, 2048, 512, 512, 2048, 2048] + - [18, 10276.0] + - - [512, 3451, 1, 2048, 512, 512, 2048, 2048] + - [18, 10461.0] + - - [512, 3456, 1, 2048, 512, 512, 2048, 2048] + - [40, 10463.0] + - - [512, 3458, 1, 2048, 512, 512, 2048, 2048] + - [40, 10438.0] + - - [512, 3467, 1, 2048, 512, 512, 2048, 2048] + - [60, 10390.0] + - - [512, 3468, 1, 2048, 512, 512, 2048, 2048] + - [40, 10441.0] + - - [512, 3470, 1, 2048, 512, 512, 2048, 2048] + - [60, 10121.0] + - - [512, 3477, 1, 2048, 512, 512, 2048, 2048] + - [60, 10469.0] + - - [512, 3478, 1, 2048, 512, 512, 2048, 2048] + - [60, 10012.0] + - - [512, 3495, 1, 2048, 512, 512, 2048, 2048] + - [18, 10533.0] + - - [512, 3507, 1, 2048, 512, 512, 2048, 2048] + - [18, 10633.0] + - - [512, 3515, 1, 2048, 512, 512, 2048, 2048] + - [40, 10488.0] + - - [512, 3517, 1, 2048, 512, 512, 2048, 2048] + - [40, 10586.0] + - - [2048, 2864, 1, 512, 2048, 2048, 512, 512] + - [49, 10270.0] + - - [2048, 3287, 1, 512, 2048, 2048, 512, 512] + - [16, 10832.0] + - - [2048, 3412, 1, 512, 2048, 2048, 512, 512] + - [38, 9778.0] + - - [2048, 3456, 1, 512, 2048, 2048, 512, 512] + - [16, 11006.0] + - - [2048, 3466, 1, 512, 2048, 2048, 512, 512] + - [16, 10572.0] + - - [2048, 3476, 1, 512, 2048, 2048, 512, 512] + - [27, 10671.0] + - - [2048, 3999, 1, 512, 2048, 2048, 512, 512] + - [18, 10942.0] + - - [33708, 189, 1, 512, 33708, 33708, 512, 512] + - [57, 9292.0] + - - [33708, 2496, 1, 512, 33708, 33708, 512, 512] + - [16, 8803.0] + - - [33708, 3864, 1, 512, 33708, 33708, 512, 512] + - [38, 9603.0] + - - [33708, 3969, 1, 512, 33708, 33708, 512, 512] + - [18, 11083.0] + - - [33708, 3995, 1, 512, 33708, 33708, 512, 512] + - [18, 9808.0] + - - [134, 134, 240, 64, 134, 134, 64, 64] + - [44, 3022.0] + - - [135, 134, 240, 64, 135, 135, 64, 64] + - [44, 3709.0] + - - [135, 135, 240, 64, 135, 135, 64, 64] + - [44, 3681.0] + - - [512, 2790, 1, 2048, 512, 512, 2048, 2048] + - [38, 9603.0] + - - [512, 2864, 1, 2048, 512, 512, 2048, 2048] + - [58, 9850.0] + - - [512, 3092, 1, 2048, 512, 512, 2048, 2048] + - [38, 10565.0] + - - [512, 3113, 1, 2048, 512, 512, 2048, 2048] + - [38, 9532.0] + - - [512, 3137, 1, 2048, 512, 512, 2048, 2048] + - [38, 10755.0] + - - [512, 3165, 1, 2048, 512, 512, 2048, 2048] + - [16, 10696.0] + - - [512, 3166, 1, 2048, 512, 512, 2048, 2048] + - [58, 10762.0] + - - [512, 3219, 1, 2048, 512, 512, 2048, 2048] + - [60, 9843.0] + - - [512, 3237, 1, 2048, 512, 512, 2048, 2048] + - [60, 9352.0] + - - [512, 3246, 1, 2048, 512, 512, 2048, 2048] + - [18, 9837.0] + - - [512, 3249, 1, 2048, 512, 512, 2048, 2048] + - [18, 9871.0] + - - [512, 3251, 1, 2048, 512, 512, 2048, 2048] + - [40, 9780.0] + - - [512, 3262, 1, 2048, 512, 512, 2048, 2048] + - [40, 9857.0] + - - [512, 3268, 1, 2048, 512, 512, 2048, 2048] + - [60, 9808.0] + - - [512, 3282, 1, 2048, 512, 512, 2048, 2048] + - [18, 9924.0] + - - [512, 3286, 1, 2048, 512, 512, 2048, 2048] + - [60, 9414.0] + - - [512, 3287, 1, 2048, 512, 512, 2048, 2048] + - [60, 9901.0] + - - [512, 3293, 1, 2048, 512, 512, 2048, 2048] + - [18, 9974.0] + - - [512, 3297, 1, 2048, 512, 512, 2048, 2048] + - [60, 9949.0] + - - [512, 3307, 1, 2048, 512, 512, 2048, 2048] + - [18, 9767.0] + - - [512, 3314, 1, 2048, 512, 512, 2048, 2048] + - [60, 9925.0] + - - [512, 3315, 1, 2048, 512, 512, 2048, 2048] + - [60, 10025.0] + - - [512, 3319, 1, 2048, 512, 512, 2048, 2048] + - [60, 9930.0] + - - [512, 3322, 1, 2048, 512, 512, 2048, 2048] + - [60, 10000.0] + - - [512, 3323, 1, 2048, 512, 512, 2048, 2048] + - [60, 9530.0] + - - [512, 3324, 1, 2048, 512, 512, 2048, 2048] + - [18, 10075.0] + - - [512, 3325, 1, 2048, 512, 512, 2048, 2048] + - [18, 10033.0] + - - [512, 3327, 1, 2048, 512, 512, 2048, 2048] + - [60, 10005.0] + - - [512, 3329, 1, 2048, 512, 512, 2048, 2048] + - [40, 10064.0] + - - [512, 3339, 1, 2048, 512, 512, 2048, 2048] + - [60, 10033.0] + - - [512, 3342, 1, 2048, 512, 512, 2048, 2048] + - [18, 10149.0] + - - [512, 3344, 1, 2048, 512, 512, 2048, 2048] + - [60, 9730.0] + - - [512, 3358, 1, 2048, 512, 512, 2048, 2048] + - [40, 10147.0] + - - [512, 3360, 1, 2048, 512, 512, 2048, 2048] + - [16, 9774.0] + - - [512, 3364, 1, 2048, 512, 512, 2048, 2048] + - [40, 10156.0] + - - [512, 3365, 1, 2048, 512, 512, 2048, 2048] + - [18, 10214.0] + - - [512, 3369, 1, 2048, 512, 512, 2048, 2048] + - [40, 10159.0] + - - [512, 3371, 1, 2048, 512, 512, 2048, 2048] + - [40, 10198.0] + - - [512, 3374, 1, 2048, 512, 512, 2048, 2048] + - [60, 10137.0] + - - [512, 3376, 1, 2048, 512, 512, 2048, 2048] + - [18, 10215.0] + - - [512, 3377, 1, 2048, 512, 512, 2048, 2048] + - [60, 9747.0] + - - [512, 3381, 1, 2048, 512, 512, 2048, 2048] + - [18, 10232.0] + - - [512, 3382, 1, 2048, 512, 512, 2048, 2048] + - [18, 10234.0] + - - [512, 3383, 1, 2048, 512, 512, 2048, 2048] + - [40, 10199.0] + - - [512, 3384, 1, 2048, 512, 512, 2048, 2048] + - [18, 10216.0] + - - [512, 3385, 1, 2048, 512, 512, 2048, 2048] + - [40, 10211.0] + - - [512, 3386, 1, 2048, 512, 512, 2048, 2048] + - [18, 10218.0] + - - [512, 3388, 1, 2048, 512, 512, 2048, 2048] + - [60, 10210.0] + - - [512, 3390, 1, 2048, 512, 512, 2048, 2048] + - [60, 10253.0] + - - [512, 3391, 1, 2048, 512, 512, 2048, 2048] + - [60, 9785.0] + - - [512, 3402, 1, 2048, 512, 512, 2048, 2048] + - [60, 10270.0] + - - [512, 3410, 1, 2048, 512, 512, 2048, 2048] + - [18, 10350.0] + - - [512, 3412, 1, 2048, 512, 512, 2048, 2048] + - [40, 10289.0] + - - [512, 3414, 1, 2048, 512, 512, 2048, 2048] + - [18, 10324.0] + - - [512, 3415, 1, 2048, 512, 512, 2048, 2048] + - [40, 10301.0] + - - [512, 3418, 1, 2048, 512, 512, 2048, 2048] + - [18, 10332.0] + - - [512, 3420, 1, 2048, 512, 512, 2048, 2048] + - [40, 10240.0] + - - [512, 3422, 1, 2048, 512, 512, 2048, 2048] + - [40, 10353.0] + - - [512, 3425, 1, 2048, 512, 512, 2048, 2048] + - [60, 10342.0] + - - [512, 3426, 1, 2048, 512, 512, 2048, 2048] + - [18, 10400.0] + - - [512, 3427, 1, 2048, 512, 512, 2048, 2048] + - [60, 9853.0] + - - [512, 3428, 1, 2048, 512, 512, 2048, 2048] + - [18, 10371.0] + - - [512, 3430, 1, 2048, 512, 512, 2048, 2048] + - [18, 10415.0] + - - [512, 3431, 1, 2048, 512, 512, 2048, 2048] + - [40, 10329.0] + - - [512, 3432, 1, 2048, 512, 512, 2048, 2048] + - [40, 10357.0] + - - [512, 3438, 1, 2048, 512, 512, 2048, 2048] + - [60, 10353.0] + - - [512, 3439, 1, 2048, 512, 512, 2048, 2048] + - [18, 10398.0] + - - [512, 3440, 1, 2048, 512, 512, 2048, 2048] + - [60, 10338.0] + - - [512, 3443, 1, 2048, 512, 512, 2048, 2048] + - [18, 10404.0] + - - [512, 3445, 1, 2048, 512, 512, 2048, 2048] + - [60, 9942.0] + - - [512, 3447, 1, 2048, 512, 512, 2048, 2048] + - [40, 10455.0] + - - [512, 3448, 1, 2048, 512, 512, 2048, 2048] + - [18, 10373.0] + - - [512, 3450, 1, 2048, 512, 512, 2048, 2048] + - [40, 10430.0] + - - [512, 3452, 1, 2048, 512, 512, 2048, 2048] + - [40, 10404.0] + - - [512, 3453, 1, 2048, 512, 512, 2048, 2048] + - [60, 10320.0] + - - [512, 3455, 1, 2048, 512, 512, 2048, 2048] + - [18, 10437.0] + - - [512, 3457, 1, 2048, 512, 512, 2048, 2048] + - [60, 10392.0] + - - [512, 3459, 1, 2048, 512, 512, 2048, 2048] + - [18, 10482.0] + - - [512, 3460, 1, 2048, 512, 512, 2048, 2048] + - [60, 9943.0] + - - [512, 3461, 1, 2048, 512, 512, 2048, 2048] + - [18, 10471.0] + - - [512, 3462, 1, 2048, 512, 512, 2048, 2048] + - [18, 10513.0] + - - [512, 3466, 1, 2048, 512, 512, 2048, 2048] + - [40, 10501.0] + - - [512, 3471, 1, 2048, 512, 512, 2048, 2048] + - [18, 10315.0] + - - [512, 3472, 1, 2048, 512, 512, 2048, 2048] + - [60, 10411.0] + - - [512, 3475, 1, 2048, 512, 512, 2048, 2048] + - [40, 10488.0] + - - [512, 3476, 1, 2048, 512, 512, 2048, 2048] + - [60, 10434.0] + - - [512, 3479, 1, 2048, 512, 512, 2048, 2048] + - [40, 10468.0] + - - [512, 3480, 1, 2048, 512, 512, 2048, 2048] + - [60, 9996.0] + - - [512, 3481, 1, 2048, 512, 512, 2048, 2048] + - [18, 10525.0] + - - [512, 3483, 1, 2048, 512, 512, 2048, 2048] + - [18, 10576.0] + - - [512, 3484, 1, 2048, 512, 512, 2048, 2048] + - [40, 10499.0] + - - [512, 3487, 1, 2048, 512, 512, 2048, 2048] + - [18, 10514.0] + - - [512, 3489, 1, 2048, 512, 512, 2048, 2048] + - [40, 10544.0] + - - [512, 3490, 1, 2048, 512, 512, 2048, 2048] + - [40, 10541.0] + - - [512, 3491, 1, 2048, 512, 512, 2048, 2048] + - [60, 10391.0] + - - [512, 3493, 1, 2048, 512, 512, 2048, 2048] + - [18, 10551.0] + - - [512, 3494, 1, 2048, 512, 512, 2048, 2048] + - [60, 10008.0] + - - [512, 3497, 1, 2048, 512, 512, 2048, 2048] + - [60, 10255.0] + - - [512, 3498, 1, 2048, 512, 512, 2048, 2048] + - [18, 10403.0] + - - [512, 3499, 1, 2048, 512, 512, 2048, 2048] + - [60, 10550.0] + - - [512, 3501, 1, 2048, 512, 512, 2048, 2048] + - [18, 10535.0] + - - [512, 3503, 1, 2048, 512, 512, 2048, 2048] + - [60, 10421.0] + - - [512, 3508, 1, 2048, 512, 512, 2048, 2048] + - [18, 10561.0] + - - [512, 3509, 1, 2048, 512, 512, 2048, 2048] + - [60, 10460.0] + - - [512, 3511, 1, 2048, 512, 512, 2048, 2048] + - [40, 10565.0] + - - [512, 3514, 1, 2048, 512, 512, 2048, 2048] + - [60, 10102.0] + - - [512, 3518, 1, 2048, 512, 512, 2048, 2048] + - [40, 10635.0] + - - [512, 3519, 1, 2048, 512, 512, 2048, 2048] + - [16, 10172.0] + - - [512, 3520, 1, 2048, 512, 512, 2048, 2048] + - [60, 10546.0] + - - [512, 3523, 1, 2048, 512, 512, 2048, 2048] + - [18, 10606.0] + - - [512, 3528, 1, 2048, 512, 512, 2048, 2048] + - [40, 10589.0] + - - [512, 3529, 1, 2048, 512, 512, 2048, 2048] + - [18, 10649.0] + - - [512, 3530, 1, 2048, 512, 512, 2048, 2048] + - [40, 10578.0] + - - [512, 3532, 1, 2048, 512, 512, 2048, 2048] + - [18, 10606.0] + - - [512, 3533, 1, 2048, 512, 512, 2048, 2048] + - [60, 10542.0] + - - [512, 3534, 1, 2048, 512, 512, 2048, 2048] + - [18, 10647.0] + - - [512, 3538, 1, 2048, 512, 512, 2048, 2048] + - [60, 10586.0] + - - [512, 3539, 1, 2048, 512, 512, 2048, 2048] + - [60, 10601.0] + - - [512, 3541, 1, 2048, 512, 512, 2048, 2048] + - [60, 10148.0] + - - [512, 3547, 1, 2048, 512, 512, 2048, 2048] + - [18, 10648.0] + - - [512, 3548, 1, 2048, 512, 512, 2048, 2048] + - [18, 10637.0] + - - [512, 3552, 1, 2048, 512, 512, 2048, 2048] + - [60, 10584.0] + - - [512, 3564, 1, 2048, 512, 512, 2048, 2048] + - [18, 10495.0] + - - [512, 3575, 1, 2048, 512, 512, 2048, 2048] + - [40, 10690.0] + - - [512, 3598, 1, 2048, 512, 512, 2048, 2048] + - [40, 10853.0] + - - [512, 3599, 1, 2048, 512, 512, 2048, 2048] + - [60, 10794.0] + - - [512, 3608, 1, 2048, 512, 512, 2048, 2048] + - [18, 10796.0] + - - [512, 3780, 1, 512, 512, 512, 512, 512] + - [58, 9815.0] + - - [512, 3780, 1, 2048, 512, 512, 2048, 2048] + - [18, 11227.0] + - - [512, 3796, 1, 512, 512, 512, 512, 512] + - [18, 9991.0] + - - [512, 3796, 1, 2048, 512, 512, 2048, 2048] + - [18, 11360.0] + - - [512, 3822, 1, 512, 512, 512, 512, 512] + - [34, 9898.0] + - - [512, 3822, 1, 2048, 512, 512, 2048, 2048] + - [40, 11426.0] + - - [512, 3840, 1, 512, 512, 512, 512, 512] + - [40, 10451.0] + - - [512, 3840, 1, 2048, 512, 512, 2048, 2048] + - [60, 11455.0] + - - [512, 3859, 1, 512, 512, 512, 512, 512] + - [59, 8533.0] + - - [512, 3859, 1, 2048, 512, 512, 2048, 2048] + - [16, 9772.0] + - - [512, 3870, 1, 512, 512, 512, 512, 512] + - [38, 9203.0] + - - [512, 3870, 1, 2048, 512, 512, 2048, 2048] + - [38, 9820.0] + - - [512, 3876, 1, 512, 512, 512, 512, 512] + - [34, 9219.0] + - - [512, 3876, 1, 2048, 512, 512, 2048, 2048] + - [58, 9723.0] + - - [512, 3906, 1, 512, 512, 512, 512, 512] + - [38, 9307.0] + - - [512, 3906, 1, 2048, 512, 512, 2048, 2048] + - [16, 9836.0] + - - [512, 3910, 1, 512, 512, 512, 512, 512] + - [34, 9313.0] + - - [512, 3910, 1, 2048, 512, 512, 2048, 2048] + - [38, 9896.0] + - - [512, 3925, 1, 512, 512, 512, 512, 512] + - [34, 9354.0] + - - [512, 3925, 1, 2048, 512, 512, 2048, 2048] + - [58, 9811.0] + - - [512, 3927, 1, 512, 512, 512, 512, 512] + - [38, 9398.0] + - - [512, 3942, 1, 512, 512, 512, 512, 512] + - [54, 9359.0] + - - [512, 3942, 1, 2048, 512, 512, 2048, 2048] + - [38, 9711.0] + - - [512, 3944, 1, 512, 512, 512, 512, 512] + - [38, 9338.0] + - - [512, 3944, 1, 2048, 512, 512, 2048, 2048] + - [38, 9995.0] + - - [512, 3955, 1, 512, 512, 512, 512, 512] + - [58, 9344.0] + - - [512, 3955, 1, 2048, 512, 512, 2048, 2048] + - [58, 9923.0] + - - [512, 3968, 1, 512, 512, 512, 512, 512] + - [34, 9606.0] + - - [512, 3968, 1, 2048, 512, 512, 2048, 2048] + - [16, 10139.0] + - - [512, 3969, 1, 512, 512, 512, 512, 512] + - [39, 8828.0] + - - [512, 3969, 1, 2048, 512, 512, 2048, 2048] + - [38, 10065.0] + - - [512, 3976, 1, 512, 512, 512, 512, 512] + - [58, 9119.0] + - - [512, 3976, 1, 2048, 512, 512, 2048, 2048] + - [38, 10084.0] + - - [512, 3977, 1, 512, 512, 512, 512, 512] + - [34, 9660.0] + - - [512, 3977, 1, 2048, 512, 512, 2048, 2048] + - [38, 9737.0] + - - [512, 3978, 1, 512, 512, 512, 512, 512] + - [38, 9378.0] + - - [512, 3978, 1, 2048, 512, 512, 2048, 2048] + - [38, 10101.0] + - - [512, 3990, 1, 512, 512, 512, 512, 512] + - [58, 9167.0] + - - [512, 3990, 1, 2048, 512, 512, 2048, 2048] + - [16, 10056.0] + - - [512, 3995, 1, 512, 512, 512, 512, 512] + - [34, 9437.0] + - - [512, 3995, 1, 2048, 512, 512, 2048, 2048] + - [16, 10038.0] + - - [512, 3996, 1, 512, 512, 512, 512, 512] + - [58, 9386.0] + - - [512, 3996, 1, 2048, 512, 512, 2048, 2048] + - [58, 10123.0] + - - [512, 3999, 1, 512, 512, 512, 512, 512] + - [58, 9297.0] + - - [512, 3999, 1, 2048, 512, 512, 2048, 2048] + - [38, 10124.0] + - - [512, 4005, 1, 512, 512, 512, 512, 512] + - [38, 9474.0] + - - [512, 4005, 1, 2048, 512, 512, 2048, 2048] + - [16, 10008.0] + - - [512, 4012, 1, 512, 512, 512, 512, 512] + - [34, 9379.0] + - - [512, 4012, 1, 2048, 512, 512, 2048, 2048] + - [16, 10105.0] + - - [512, 4020, 1, 512, 512, 512, 512, 512] + - [55, 8479.0] + - - [512, 4020, 1, 2048, 512, 512, 2048, 2048] + - [58, 10207.0] + - - [512, 4026, 1, 512, 512, 512, 512, 512] + - [38, 9443.0] + - - [512, 4026, 1, 2048, 512, 512, 2048, 2048] + - [16, 10104.0] + - - [512, 4030, 1, 512, 512, 512, 512, 512] + - [58, 9526.0] + - - [512, 4030, 1, 2048, 512, 512, 2048, 2048] + - [58, 10216.0] + - - [512, 4032, 1, 512, 512, 512, 512, 512] + - [5, 9205.0] + - - [512, 4032, 1, 2048, 512, 512, 2048, 2048] + - [58, 10198.0] + - - [512, 4050, 1, 512, 512, 512, 512, 512] + - [34, 9561.0] + - - [512, 4059, 1, 512, 512, 512, 512, 512] + - [34, 9694.0] + - - [2048, 2790, 1, 512, 2048, 2048, 512, 512] + - [27, 10745.0] + - - [2048, 3092, 1, 512, 2048, 2048, 512, 512] + - [27, 10551.0] + - - [2048, 3113, 1, 512, 2048, 2048, 512, 512] + - [38, 10737.0] + - - [2048, 3137, 1, 512, 2048, 2048, 512, 512] + - [58, 10404.0] + - - [2048, 3165, 1, 512, 2048, 2048, 512, 512] + - [16, 10855.0] + - - [2048, 3166, 1, 512, 2048, 2048, 512, 512] + - [18, 10220.0] + - - [2048, 3194, 1, 512, 2048, 2048, 512, 512] + - [27, 10830.0] + - - [2048, 3219, 1, 512, 2048, 2048, 512, 512] + - [16, 10580.0] + - - [2048, 3222, 1, 512, 2048, 2048, 512, 512] + - [58, 10065.0] + - - [2048, 3234, 1, 512, 2048, 2048, 512, 512] + - [27, 10840.0] + - - [2048, 3237, 1, 512, 2048, 2048, 512, 512] + - [16, 11010.0] + - - [2048, 3242, 1, 512, 2048, 2048, 512, 512] + - [16, 10576.0] + - - [2048, 3246, 1, 512, 2048, 2048, 512, 512] + - [16, 10742.0] + - - [2048, 3249, 1, 512, 2048, 2048, 512, 512] + - [16, 10841.0] + - - [2048, 3251, 1, 512, 2048, 2048, 512, 512] + - [16, 10737.0] + - - [2048, 3257, 1, 512, 2048, 2048, 512, 512] + - [16, 10705.0] + - - [2048, 3262, 1, 512, 2048, 2048, 512, 512] + - [27, 10363.0] + - - [2048, 3268, 1, 512, 2048, 2048, 512, 512] + - [27, 10737.0] + - - [2048, 3282, 1, 512, 2048, 2048, 512, 512] + - [58, 10350.0] + - - [2048, 3286, 1, 512, 2048, 2048, 512, 512] + - [38, 10764.0] + - - [2048, 3293, 1, 512, 2048, 2048, 512, 512] + - [27, 10921.0] + - - [2048, 3297, 1, 512, 2048, 2048, 512, 512] + - [16, 11063.0] + - - [2048, 3307, 1, 512, 2048, 2048, 512, 512] + - [16, 10873.0] + - - [2048, 3314, 1, 512, 2048, 2048, 512, 512] + - [16, 11220.0] + - - [2048, 3315, 1, 512, 2048, 2048, 512, 512] + - [16, 10907.0] + - - [2048, 3319, 1, 512, 2048, 2048, 512, 512] + - [38, 11007.0] + - - [2048, 3322, 1, 512, 2048, 2048, 512, 512] + - [16, 11182.0] + - - [2048, 3323, 1, 512, 2048, 2048, 512, 512] + - [16, 10911.0] + - - [2048, 3324, 1, 512, 2048, 2048, 512, 512] + - [16, 11253.0] + - - [2048, 3325, 1, 512, 2048, 2048, 512, 512] + - [16, 10848.0] + - - [2048, 3327, 1, 512, 2048, 2048, 512, 512] + - [16, 10941.0] + - - [2048, 3329, 1, 512, 2048, 2048, 512, 512] + - [49, 10284.0] + - - [2048, 3332, 1, 512, 2048, 2048, 512, 512] + - [16, 10581.0] + - - [2048, 3336, 1, 512, 2048, 2048, 512, 512] + - [16, 10849.0] + - - [2048, 3339, 1, 512, 2048, 2048, 512, 512] + - [16, 10555.0] + - - [2048, 3342, 1, 512, 2048, 2048, 512, 512] + - [16, 10606.0] + - - [2048, 3344, 1, 512, 2048, 2048, 512, 512] + - [16, 10773.0] + - - [2048, 3358, 1, 512, 2048, 2048, 512, 512] + - [16, 10597.0] + - - [2048, 3360, 1, 512, 2048, 2048, 512, 512] + - [16, 10905.0] + - - [2048, 3364, 1, 512, 2048, 2048, 512, 512] + - [16, 10555.0] + - - [2048, 3365, 1, 512, 2048, 2048, 512, 512] + - [27, 10755.0] + - - [2048, 3369, 1, 512, 2048, 2048, 512, 512] + - [16, 10897.0] + - - [2048, 3371, 1, 512, 2048, 2048, 512, 512] + - [16, 10679.0] + - - [2048, 3374, 1, 512, 2048, 2048, 512, 512] + - [16, 10997.0] + - - [2048, 3376, 1, 512, 2048, 2048, 512, 512] + - [38, 10723.0] + - - [2048, 3377, 1, 512, 2048, 2048, 512, 512] + - [16, 10625.0] + - - [2048, 3378, 1, 512, 2048, 2048, 512, 512] + - [49, 10556.0] + - - [2048, 3381, 1, 512, 2048, 2048, 512, 512] + - [16, 10625.0] + - - [2048, 3382, 1, 512, 2048, 2048, 512, 512] + - [16, 11011.0] + - - [2048, 3383, 1, 512, 2048, 2048, 512, 512] + - [16, 10646.0] + - - [2048, 3384, 1, 512, 2048, 2048, 512, 512] + - [16, 10698.0] + - - [2048, 3385, 1, 512, 2048, 2048, 512, 512] + - [49, 10561.0] + - - [2048, 3386, 1, 512, 2048, 2048, 512, 512] + - [27, 10684.0] + - - [2048, 3388, 1, 512, 2048, 2048, 512, 512] + - [16, 11039.0] + - - [2048, 3390, 1, 512, 2048, 2048, 512, 512] + - [16, 10745.0] + - - [2048, 3391, 1, 512, 2048, 2048, 512, 512] + - [16, 10700.0] + - - [2048, 3396, 1, 512, 2048, 2048, 512, 512] + - [38, 10616.0] + - - [2048, 3399, 1, 512, 2048, 2048, 512, 512] + - [38, 10757.0] + - - [2048, 3402, 1, 512, 2048, 2048, 512, 512] + - [38, 9781.0] + - - [2048, 3410, 1, 512, 2048, 2048, 512, 512] + - [16, 10820.0] + - - [2048, 3414, 1, 512, 2048, 2048, 512, 512] + - [27, 10877.0] + - - [2048, 3415, 1, 512, 2048, 2048, 512, 512] + - [38, 10724.0] + - - [2048, 3418, 1, 512, 2048, 2048, 512, 512] + - [16, 10825.0] + - - [2048, 3420, 1, 512, 2048, 2048, 512, 512] + - [16, 11052.0] + - - [2048, 3422, 1, 512, 2048, 2048, 512, 512] + - [16, 10838.0] + - - [2048, 3425, 1, 512, 2048, 2048, 512, 512] + - [27, 11007.0] + - - [2048, 3426, 1, 512, 2048, 2048, 512, 512] + - [38, 10713.0] + - - [2048, 3427, 1, 512, 2048, 2048, 512, 512] + - [38, 10834.0] + - - [2048, 3428, 1, 512, 2048, 2048, 512, 512] + - [49, 10575.0] + - - [2048, 3430, 1, 512, 2048, 2048, 512, 512] + - [38, 10270.0] + - - [2048, 3431, 1, 512, 2048, 2048, 512, 512] + - [16, 11207.0] + - - [2048, 3432, 1, 512, 2048, 2048, 512, 512] + - [49, 10822.0] + - - [2048, 3438, 1, 512, 2048, 2048, 512, 512] + - [27, 11018.0] + - - [2048, 3439, 1, 512, 2048, 2048, 512, 512] + - [16, 11166.0] + - - [2048, 3440, 1, 512, 2048, 2048, 512, 512] + - [16, 10866.0] + - - [2048, 3443, 1, 512, 2048, 2048, 512, 512] + - [16, 11251.0] + - - [2048, 3445, 1, 512, 2048, 2048, 512, 512] + - [27, 10959.0] + - - [2048, 3447, 1, 512, 2048, 2048, 512, 512] + - [16, 10877.0] + - - [2048, 3448, 1, 512, 2048, 2048, 512, 512] + - [27, 10732.0] + - - [2048, 3450, 1, 512, 2048, 2048, 512, 512] + - [16, 10972.0] + - - [2048, 3451, 1, 512, 2048, 2048, 512, 512] + - [58, 10331.0] + - - [2048, 3452, 1, 512, 2048, 2048, 512, 512] + - [16, 10969.0] + - - [2048, 3453, 1, 512, 2048, 2048, 512, 512] + - [16, 11087.0] + - - [2048, 3455, 1, 512, 2048, 2048, 512, 512] + - [38, 10888.0] + - - [2048, 3457, 1, 512, 2048, 2048, 512, 512] + - [38, 10435.0] + - - [2048, 3458, 1, 512, 2048, 2048, 512, 512] + - [49, 10266.0] + - - [2048, 3459, 1, 512, 2048, 2048, 512, 512] + - [27, 10516.0] + - - [2048, 3460, 1, 512, 2048, 2048, 512, 512] + - [16, 10721.0] + - - [2048, 3461, 1, 512, 2048, 2048, 512, 512] + - [27, 10656.0] + - - [2048, 3462, 1, 512, 2048, 2048, 512, 512] + - [38, 10713.0] + - - [2048, 3467, 1, 512, 2048, 2048, 512, 512] + - [16, 10851.0] + - - [2048, 3468, 1, 512, 2048, 2048, 512, 512] + - [27, 10685.0] + - - [2048, 3470, 1, 512, 2048, 2048, 512, 512] + - [38, 9984.0] + - - [2048, 3471, 1, 512, 2048, 2048, 512, 512] + - [27, 10438.0] + - - [2048, 3472, 1, 512, 2048, 2048, 512, 512] + - [16, 10548.0] + - - [2048, 3475, 1, 512, 2048, 2048, 512, 512] + - [27, 10531.0] + - - [2048, 3477, 1, 512, 2048, 2048, 512, 512] + - [16, 10564.0] + - - [2048, 3478, 1, 512, 2048, 2048, 512, 512] + - [16, 10853.0] + - - [2048, 3479, 1, 512, 2048, 2048, 512, 512] + - [49, 10559.0] + - - [2048, 3480, 1, 512, 2048, 2048, 512, 512] + - [16, 10895.0] + - - [2048, 3481, 1, 512, 2048, 2048, 512, 512] + - [16, 10578.0] + - - [2048, 3483, 1, 512, 2048, 2048, 512, 512] + - [16, 10541.0] + - - [2048, 3484, 1, 512, 2048, 2048, 512, 512] + - [38, 10653.0] + - - [2048, 3487, 1, 512, 2048, 2048, 512, 512] + - [16, 10623.0] + - - [2048, 3489, 1, 512, 2048, 2048, 512, 512] + - [16, 11022.0] + - - [2048, 3490, 1, 512, 2048, 2048, 512, 512] + - [5, 10588.0] + - - [2048, 3491, 1, 512, 2048, 2048, 512, 512] + - [16, 10698.0] + - - [2048, 3493, 1, 512, 2048, 2048, 512, 512] + - [16, 10935.0] + - - [2048, 3494, 1, 512, 2048, 2048, 512, 512] + - [16, 10794.0] + - - [2048, 3495, 1, 512, 2048, 2048, 512, 512] + - [58, 10078.0] + - - [2048, 3497, 1, 512, 2048, 2048, 512, 512] + - [27, 10453.0] + - - [2048, 3498, 1, 512, 2048, 2048, 512, 512] + - [16, 10877.0] + - - [2048, 3499, 1, 512, 2048, 2048, 512, 512] + - [16, 10700.0] + - - [2048, 3501, 1, 512, 2048, 2048, 512, 512] + - [16, 10667.0] + - - [2048, 3503, 1, 512, 2048, 2048, 512, 512] + - [58, 10338.0] + - - [2048, 3507, 1, 512, 2048, 2048, 512, 512] + - [16, 10667.0] + - - [2048, 3508, 1, 512, 2048, 2048, 512, 512] + - [16, 10880.0] + - - [2048, 3509, 1, 512, 2048, 2048, 512, 512] + - [16, 10705.0] + - - [2048, 3511, 1, 512, 2048, 2048, 512, 512] + - [16, 10764.0] + - - [2048, 3514, 1, 512, 2048, 2048, 512, 512] + - [38, 10620.0] + - - [2048, 3515, 1, 512, 2048, 2048, 512, 512] + - [27, 10812.0] + - - [2048, 3517, 1, 512, 2048, 2048, 512, 512] + - [16, 10980.0] + - - [2048, 3518, 1, 512, 2048, 2048, 512, 512] + - [27, 10706.0] + - - [2048, 3519, 1, 512, 2048, 2048, 512, 512] + - [16, 10979.0] + - - [2048, 3520, 1, 512, 2048, 2048, 512, 512] + - [16, 10753.0] + - - [2048, 3523, 1, 512, 2048, 2048, 512, 512] + - [5, 10780.0] + - - [2048, 3528, 1, 512, 2048, 2048, 512, 512] + - [38, 10384.0] + - - [2048, 3529, 1, 512, 2048, 2048, 512, 512] + - [16, 10821.0] + - - [2048, 3530, 1, 512, 2048, 2048, 512, 512] + - [16, 11084.0] + - - [2048, 3532, 1, 512, 2048, 2048, 512, 512] + - [16, 10674.0] + - - [2048, 3533, 1, 512, 2048, 2048, 512, 512] + - [16, 10772.0] + - - [2048, 3534, 1, 512, 2048, 2048, 512, 512] + - [27, 10724.0] + - - [2048, 3538, 1, 512, 2048, 2048, 512, 512] + - [27, 10920.0] + - - [2048, 3539, 1, 512, 2048, 2048, 512, 512] + - [49, 10656.0] + - - [2048, 3541, 1, 512, 2048, 2048, 512, 512] + - [27, 10749.0] + - - [2048, 3547, 1, 512, 2048, 2048, 512, 512] + - [16, 11118.0] + - - [2048, 3548, 1, 512, 2048, 2048, 512, 512] + - [27, 10966.0] + - - [2048, 3552, 1, 512, 2048, 2048, 512, 512] + - [16, 10801.0] + - - [2048, 3564, 1, 512, 2048, 2048, 512, 512] + - [58, 10505.0] + - - [2048, 3575, 1, 512, 2048, 2048, 512, 512] + - [16, 10824.0] + - - [2048, 3598, 1, 512, 2048, 2048, 512, 512] + - [58, 10218.0] + - - [2048, 3599, 1, 512, 2048, 2048, 512, 512] + - [16, 10946.0] + - - [2048, 3608, 1, 512, 2048, 2048, 512, 512] + - [16, 10934.0] + - - [2048, 3780, 1, 512, 2048, 2048, 512, 512] + - [16, 10944.0] + - - [2048, 3796, 1, 512, 2048, 2048, 512, 512] + - [27, 11261.0] + - - [2048, 3822, 1, 512, 2048, 2048, 512, 512] + - [16, 11218.0] + - - [2048, 3840, 1, 512, 2048, 2048, 512, 512] + - [27, 11283.0] + - - [2048, 3859, 1, 512, 2048, 2048, 512, 512] + - [49, 10180.0] + - - [2048, 3870, 1, 512, 2048, 2048, 512, 512] + - [27, 11059.0] + - - [2048, 3876, 1, 512, 2048, 2048, 512, 512] + - [58, 10385.0] + - - [2048, 3906, 1, 512, 2048, 2048, 512, 512] + - [16, 10926.0] + - - [2048, 3910, 1, 512, 2048, 2048, 512, 512] + - [16, 11144.0] + - - [2048, 3925, 1, 512, 2048, 2048, 512, 512] + - [27, 10975.0] + - - [2048, 3942, 1, 512, 2048, 2048, 512, 512] + - [16, 11262.0] + - - [2048, 3944, 1, 512, 2048, 2048, 512, 512] + - [27, 11069.0] + - - [2048, 3955, 1, 512, 2048, 2048, 512, 512] + - [27, 11164.0] + - - [2048, 3968, 1, 512, 2048, 2048, 512, 512] + - [27, 11034.0] + - - [2048, 3969, 1, 512, 2048, 2048, 512, 512] + - [18, 10813.0] + - - [2048, 3976, 1, 512, 2048, 2048, 512, 512] + - [49, 10571.0] + - - [2048, 3977, 1, 512, 2048, 2048, 512, 512] + - [16, 10752.0] + - - [2048, 3978, 1, 512, 2048, 2048, 512, 512] + - [58, 10313.0] + - - [2048, 3990, 1, 512, 2048, 2048, 512, 512] + - [18, 10861.0] + - - [2048, 3995, 1, 512, 2048, 2048, 512, 512] + - [58, 10299.0] + - - [2048, 3996, 1, 512, 2048, 2048, 512, 512] + - [18, 10788.0] + - - [2048, 4005, 1, 512, 2048, 2048, 512, 512] + - [58, 10580.0] + - - [2048, 4012, 1, 512, 2048, 2048, 512, 512] + - [18, 10843.0] + - - [2048, 4020, 1, 512, 2048, 2048, 512, 512] + - [16, 11073.0] + - - [2048, 4026, 1, 512, 2048, 2048, 512, 512] + - [16, 10908.0] + - - [2048, 4030, 1, 512, 2048, 2048, 512, 512] + - [16, 11150.0] + - - [2048, 4032, 1, 512, 2048, 2048, 512, 512] + - [18, 10926.0] + - - [33708, 184, 1, 512, 33708, 33708, 512, 512] + - [37, 9055.0] + - - [33708, 208, 1, 512, 33708, 33708, 512, 512] + - [16, 9216.0] + - - [33708, 246, 1, 512, 33708, 33708, 512, 512] + - [16, 10777.0] + - - [33708, 264, 1, 512, 33708, 33708, 512, 512] + - [58, 8033.0] + - - [33708, 465, 1, 512, 33708, 33708, 512, 512] + - [16, 10727.0] + - - [33708, 468, 1, 512, 33708, 33708, 512, 512] + - [38, 10763.0] + - - [33708, 493, 1, 512, 33708, 33708, 512, 512] + - [38, 11325.0] + - - [33708, 540, 1, 512, 33708, 33708, 512, 512] + - [15, 9079.0] + - - [33708, 550, 1, 512, 33708, 33708, 512, 512] + - [38, 9837.0] + - - [33708, 560, 1, 512, 33708, 33708, 512, 512] + - [16, 10288.0] + - - [33708, 644, 1, 512, 33708, 33708, 512, 512] + - [58, 9937.0] + - - [33708, 714, 1, 512, 33708, 33708, 512, 512] + - [16, 11061.0] + - - [33708, 720, 1, 512, 33708, 33708, 512, 512] + - [58, 10914.0] + - - [33708, 781, 1, 512, 33708, 33708, 512, 512] + - [52, 7841.0] + - - [33708, 936, 1, 512, 33708, 33708, 512, 512] + - [43, 7579.0] + - - [33708, 980, 1, 512, 33708, 33708, 512, 512] + - [43, 7902.0] + - - [33708, 1232, 1, 512, 33708, 33708, 512, 512] + - [40, 9807.0] + - - [33708, 1290, 1, 512, 33708, 33708, 512, 512] + - [54, 8137.0] + - - [33708, 1350, 1, 512, 33708, 33708, 512, 512] + - [38, 10468.0] + - - [33708, 1424, 1, 512, 33708, 33708, 512, 512] + - [16, 7555.0] + - - [33708, 1458, 1, 512, 33708, 33708, 512, 512] + - [51, 10630.0] + - - [33708, 1462, 1, 512, 33708, 33708, 512, 512] + - [38, 10440.0] + - - [33708, 1520, 1, 512, 33708, 33708, 512, 512] + - [50, 8997.0] + - - [33708, 1596, 1, 512, 33708, 33708, 512, 512] + - [59, 9780.0] + - - [33708, 1599, 1, 512, 33708, 33708, 512, 512] + - [46, 8386.0] + - - [33708, 1615, 1, 512, 33708, 33708, 512, 512] + - [16, 7773.0] + - - [33708, 1680, 1, 512, 33708, 33708, 512, 512] + - [36, 9086.0] + - - [33708, 1917, 1, 512, 33708, 33708, 512, 512] + - [38, 8311.0] + - - [33708, 2205, 1, 512, 33708, 33708, 512, 512] + - [18, 10580.0] + - - [33708, 2418, 1, 512, 33708, 33708, 512, 512] + - [18, 10346.0] + - - [33708, 3776, 1, 512, 33708, 33708, 512, 512] + - [58, 9915.0] + - - [33708, 3780, 1, 512, 33708, 33708, 512, 512] + - [18, 9910.0] + - - [33708, 3796, 1, 512, 33708, 33708, 512, 512] + - [18, 9738.0] + - - [33708, 3822, 1, 512, 33708, 33708, 512, 512] + - [16, 9976.0] + - - [33708, 3835, 1, 512, 33708, 33708, 512, 512] + - [14, 10890.0] + - - [33708, 3840, 1, 512, 33708, 33708, 512, 512] + - [60, 11055.0] + - - [33708, 3859, 1, 512, 33708, 33708, 512, 512] + - [16, 9771.0] + - - [33708, 3870, 1, 512, 33708, 33708, 512, 512] + - [16, 9649.0] + - - [33708, 3876, 1, 512, 33708, 33708, 512, 512] + - [38, 9907.0] + - - [33708, 3906, 1, 512, 33708, 33708, 512, 512] + - [5, 10017.0] + - - [33708, 3910, 1, 512, 33708, 33708, 512, 512] + - [59, 10090.0] + - - [33708, 3925, 1, 512, 33708, 33708, 512, 512] + - [46, 10100.0] + - - [33708, 3942, 1, 512, 33708, 33708, 512, 512] + - [16, 9807.0] + - - [33708, 3944, 1, 512, 33708, 33708, 512, 512] + - [38, 11735.0] + - - [33708, 3955, 1, 512, 33708, 33708, 512, 512] + - [38, 9917.0] + - - [33708, 3968, 1, 512, 33708, 33708, 512, 512] + - [16, 10232.0] + - - [33708, 3976, 1, 512, 33708, 33708, 512, 512] + - [59, 9916.0] + - - [33708, 3977, 1, 512, 33708, 33708, 512, 512] + - [60, 10436.0] + - - [33708, 3978, 1, 512, 33708, 33708, 512, 512] + - [12, 9920.0] + - - [33708, 3990, 1, 512, 33708, 33708, 512, 512] + - [58, 9772.0] + - - [33708, 3996, 1, 512, 33708, 33708, 512, 512] + - [16, 9869.0] + - - [33708, 3999, 1, 512, 33708, 33708, 512, 512] + - [36, 10003.0] + - - [33708, 4005, 1, 512, 33708, 33708, 512, 512] + - [16, 9958.0] + - - [33708, 4012, 1, 512, 33708, 33708, 512, 512] + - [18, 9910.0] + - - [33708, 4020, 1, 512, 33708, 33708, 512, 512] + - [14, 10253.0] + - - [33708, 4026, 1, 512, 33708, 33708, 512, 512] + - [16, 10056.0] + - - [33708, 4030, 1, 512, 33708, 33708, 512, 512] + - [18, 9938.0] + - - [33708, 4032, 1, 512, 33708, 33708, 512, 512] + - [39, 9856.0] + - - [3072, 512, 1, 3072, 3072, 3072, 3072, 3072] + - [38, 10429.0] + - - [511, 8192, 1, 8192, 511, 511, 8192, 8192] + - [38, 8551.0] + - - [4096, 4096, 1, 4096, 4096, 4096, 4096, 4096] + - [49, 10556.0] + - - [8192, 8193, 1, 8192, 8192, 8192, 8192, 8192] + - [18, 10576.0] + - - [3072, 3072, 1, 3071, 3072, 3072, 3071, 3071] + - [2, 10849.0] + - - [8192, 8192, 1, 8193, 8192, 8192, 8193, 8193] + - [40, 12143.0] + - - [7681, 8192, 1, 8192, 7681, 7681, 8192, 8192] + - [18, 10636.0] + - - [7680, 8192, 1, 8193, 7680, 7680, 8193, 8193] + - [29, 12385.0] + - - [513, 4096, 1, 4096, 513, 513, 4096, 4096] + - [60, 7838.0] + - - [3073, 512, 1, 3072, 3073, 3073, 3072, 3072] + - [16, 10299.0] + - - [7680, 8192, 1, 8192, 7680, 7680, 8192, 8192] + - [60, 10815.0] + - - [4096, 4096, 1, 4097, 4096, 4096, 4097, 4097] + - [18, 11451.0] + - - [8192, 8191, 1, 8192, 8192, 8192, 8192, 8192] + - [18, 10881.0] + - - [8192, 512, 1, 8193, 8192, 8192, 8193, 8193] + - [29, 8628.0] + - - [2880, 3071, 1, 3072, 2880, 2880, 3072, 3072] + - [59, 9898.0] + - - [2880, 3072, 1, 3072, 2880, 2880, 3072, 3072] + - [42, 8917.0] + - - [4096, 511, 1, 4096, 4096, 4096, 4096, 4096] + - [18, 8725.0] + - - [512, 3072, 1, 3072, 512, 512, 3072, 3072] + - [58, 10317.0] + - - [512, 8191, 1, 8192, 512, 512, 8192, 8192] + - [40, 8498.0] + - - [4096, 4095, 1, 4096, 4096, 4096, 4096, 4096] + - [15, 9233.0] + - - [8192, 511, 1, 8192, 8192, 8192, 8192, 8192] + - [49, 7805.0] + - - [8192, 512, 1, 8192, 8192, 8192, 8192, 8192] + - [40, 6145.0] + - - [511, 3072, 1, 3072, 511, 511, 3072, 3072] + - [58, 10280.0] + - - [7680, 8193, 1, 8192, 7680, 7680, 8192, 8192] + - [60, 10598.0] + - - [2048, 2048, 1, 2048, 2048, 2048, 2048, 2048] + - [16, 11301.0] + - - [3072, 512, 1, 3073, 3072, 3072, 3073, 3073] + - [45, 10925.0] + - - [513, 8192, 1, 8192, 513, 513, 8192, 8192] + - [38, 6579.0] + - - [7679, 8192, 1, 8192, 7679, 7679, 8192, 8192] + - [18, 10663.0] + - - [3840, 4096, 1, 4097, 3840, 3840, 4097, 4097] + - [29, 11503.0] + - - [512, 3072, 1, 3071, 512, 512, 3071, 3071] + - [45, 10905.0] + - - [7680, 8192, 1, 8191, 7680, 7680, 8191, 8191] + - [40, 12033.0] + - - [3072, 511, 1, 3072, 3072, 3072, 3072, 3072] + - [38, 10319.0] + - - [8193, 8192, 1, 8192, 8193, 8193, 8192, 8192] + - [18, 10808.0] + - - [512, 4096, 1, 4095, 512, 512, 4095, 4095] + - [49, 10083.0] + - - [512, 3071, 1, 3072, 512, 512, 3072, 3072] + - [58, 10162.0] + - - [3073, 3072, 1, 3072, 3073, 3073, 3072, 3072] + - [62, 7649.0] + - - [512, 3073, 1, 3072, 512, 512, 3072, 3072] + - [16, 9896.0] + - - [4096, 4096, 1, 4095, 4096, 4096, 4095, 4095] + - [1, 11660.0] + - - [1920, 2048, 1, 2047, 1920, 1920, 2047, 2047] + - [23, 11998.0] + - - [1920, 2049, 1, 2048, 1920, 1920, 2048, 2048] + - [38, 10468.0] + - - [512, 8192, 1, 8191, 512, 512, 8191, 8191] + - [60, 10297.0] + - - [3840, 4096, 1, 4096, 3840, 3840, 4096, 4096] + - [58, 9594.0] + - - [8191, 512, 1, 8192, 8191, 8191, 8192, 8192] + - [18, 6101.0] + - - [2881, 3072, 1, 3072, 2881, 2881, 3072, 3072] + - [42, 8071.0] + - - [512, 4096, 1, 4096, 512, 512, 4096, 4096] + - [42, 8543.0] + - - [3841, 4096, 1, 4096, 3841, 3841, 4096, 4096] + - [38, 8830.0] + - - [2880, 3072, 1, 3073, 2880, 2880, 3073, 3073] + - [51, 10682.0] + - - [4095, 512, 1, 4096, 4095, 4095, 4096, 4096] + - [18, 8864.0] + - - [1919, 2048, 1, 2048, 1919, 1919, 2048, 2048] + - [16, 11457.0] + - - [1920, 2048, 1, 2048, 1920, 1920, 2048, 2048] + - [38, 11461.0] + - - [8192, 8192, 1, 8192, 8192, 8192, 8192, 8192] + - [18, 10783.0] + - - [511, 4096, 1, 4096, 511, 511, 4096, 4096] + - [58, 8376.0] + - - [8192, 513, 1, 8192, 8192, 8192, 8192, 8192] + - [62, 6255.0] + - - [513, 3072, 1, 3072, 513, 513, 3072, 3072] + - [18, 9167.0] + - - [7680, 8191, 1, 8192, 7680, 7680, 8192, 8192] + - [18, 10635.0] + - - [512, 4097, 1, 4096, 512, 512, 4096, 4096] + - [62, 8286.0] + - - [2047, 2048, 1, 2048, 2047, 2047, 2048, 2048] + - [16, 11308.0] + - - [2049, 2048, 1, 2048, 2049, 2049, 2048, 2048] + - [18, 10751.0] + - - [3840, 4095, 1, 4096, 3840, 3840, 4096, 4096] + - [20, 9678.0] + - - [2880, 3072, 1, 3071, 2880, 2880, 3071, 3071] + - [1, 11517.0] + - - [3072, 3072, 1, 3073, 3072, 3072, 3073, 3073] + - [29, 10630.0] + - - [2880, 3073, 1, 3072, 2880, 2880, 3072, 3072] + - [56, 8018.0] + - - [4096, 513, 1, 4096, 4096, 4096, 4096, 4096] + - [20, 7451.0] + - - [4097, 512, 1, 4096, 4097, 4097, 4096, 4096] + - [18, 8364.0] + - - [8192, 512, 1, 8191, 8192, 8192, 8191, 8191] + - [31, 9821.0] + - - [1921, 2048, 1, 2048, 1921, 1921, 2048, 2048] + - [16, 10604.0] + - - [512, 3072, 1, 3073, 512, 512, 3073, 3073] + - [45, 10861.0] + - - [2048, 2049, 1, 2048, 2048, 2048, 2048, 2048] + - [16, 10553.0] + - - [3072, 512, 1, 3071, 3072, 3072, 3071, 3071] + - [45, 11058.0] + - - [3071, 3072, 1, 3072, 3071, 3071, 3072, 3072] + - [37, 9022.0] + - - [3840, 4097, 1, 4096, 3840, 3840, 4096, 4096] + - [42, 9635.0] + - - [2048, 2047, 1, 2048, 2048, 2048, 2048, 2048] + - [16, 11457.0] + - - [2879, 3072, 1, 3072, 2879, 2879, 3072, 3072] + - [62, 8463.0] + - - [3072, 513, 1, 3072, 3072, 3072, 3072, 3072] + - [38, 8940.0] + - - [512, 4095, 1, 4096, 512, 512, 4096, 4096] + - [59, 8737.0] + - - [3071, 512, 1, 3072, 3071, 3071, 3072, 3072] + - [38, 9992.0] + - - [4096, 512, 1, 4096, 4096, 4096, 4096, 4096] + - [18, 8860.0] + - - [4097, 4096, 1, 4096, 4097, 4097, 4096, 4096] + - [60, 8510.0] + - - [2048, 2048, 1, 2047, 2048, 2048, 2047, 2047] + - [23, 11675.0] + - - [3839, 4096, 1, 4096, 3839, 3839, 4096, 4096] + - [42, 9724.0] + - - [512, 4096, 1, 4097, 512, 512, 4097, 4097] + - [27, 10064.0] + - - [3072, 3073, 1, 3072, 3072, 3072, 3072, 3072] + - [43, 8578.0] + - - [2048, 2048, 1, 2049, 2048, 2048, 2049, 2049] + - [23, 11518.0] + - - [8191, 8192, 1, 8192, 8191, 8191, 8192, 8192] + - [18, 10822.0] + - - [3072, 3071, 1, 3072, 3072, 3072, 3072, 3072] + - [59, 9954.0] + - - [4096, 512, 1, 4097, 4096, 4096, 4097, 4097] + - [5, 10049.0] + - - [3840, 4096, 1, 4095, 3840, 3840, 4095, 4095] + - [7, 11400.0] + - - [1920, 2047, 1, 2048, 1920, 1920, 2048, 2048] + - [38, 11367.0] + - - [8192, 8192, 1, 8191, 8192, 8192, 8191, 8191] + - [29, 12105.0] + - - [3072, 3072, 1, 3072, 3072, 3072, 3072, 3072] + - [62, 7850.0] + - - [512, 8193, 1, 8192, 512, 512, 8192, 8192] + - [42, 7275.0] + - - [4096, 512, 1, 4095, 4096, 4096, 4095, 4095] + - [5, 10018.0] + - - [8193, 512, 1, 8192, 8193, 8193, 8192, 8192] + - [62, 8433.0] + - - [4095, 4096, 1, 4096, 4095, 4095, 4096, 4096] + - [20, 8328.0] + - - [4096, 4097, 1, 4096, 4096, 4096, 4096, 4096] + - [40, 9110.0] + - - [512, 8192, 1, 8192, 512, 512, 8192, 8192] + - [58, 8920.0] + - - [512, 8192, 1, 8193, 512, 512, 8193, 8193] + - [18, 8674.0] + - - [1920, 2048, 1, 2049, 1920, 1920, 2049, 2049] + - [45, 11804.0] + - - [479, 3072, 1, 3072, 479, 479, 3072, 3072] + - [38, 9746.0] + - - [479, 4096, 1, 4096, 479, 479, 4096, 4096] + - [59, 8507.0] + - - [479, 8192, 1, 8192, 479, 479, 8192, 8192] + - [62, 6727.0] + - - [480, 3072, 1, 3071, 480, 480, 3071, 3071] + - [23, 10217.0] + - - [480, 3072, 1, 3073, 480, 480, 3073, 3073] + - [23, 10157.0] + - - [480, 3073, 1, 3072, 480, 480, 3072, 3072] + - [38, 9882.0] + - - [480, 4095, 1, 4096, 480, 480, 4096, 4096] + - [59, 8584.0] + - - [480, 4096, 1, 4095, 480, 480, 4095, 4095] + - [49, 9599.0] + - - [480, 4096, 1, 4097, 480, 480, 4097, 4097] + - [27, 9620.0] + - - [480, 4097, 1, 4096, 480, 480, 4096, 4096] + - [58, 9151.0] + - - [480, 8191, 1, 8192, 480, 480, 8192, 8192] + - [52, 8078.0] + - - [480, 8192, 1, 8191, 480, 480, 8191, 8191] + - [32, 8454.0] + - - [480, 8192, 1, 8193, 480, 480, 8193, 8193] + - [58, 9709.0] + - - [480, 8193, 1, 8192, 480, 480, 8192, 8192] + - [42, 6894.0] + - - [481, 3072, 1, 3072, 481, 481, 3072, 3072] + - [58, 9708.0] + - - [481, 4096, 1, 4096, 481, 481, 4096, 4096] + - [58, 8697.0] + - - [481, 8192, 1, 8192, 481, 481, 8192, 8192] + - [62, 6735.0] + - - [3072, 479, 1, 3072, 3072, 3072, 3072, 3072] + - [58, 9641.0] + - - [3072, 480, 1, 3071, 3072, 3072, 3071, 3071] + - [23, 10210.0] + - - [3072, 480, 1, 3073, 3072, 3072, 3073, 3073] + - [23, 10329.0] + - - [3072, 481, 1, 3072, 3072, 3072, 3072, 3072] + - [58, 10005.0] + - - [3073, 480, 1, 3072, 3073, 3073, 3072, 3072] + - [16, 9992.0] + - - [480, 3072, 1, 3072, 480, 480, 3072, 3072] + - [58, 9959.0] + - - [480, 4096, 1, 4096, 480, 480, 4096, 4096] + - [59, 8547.0] + - - [480, 8192, 1, 8192, 480, 480, 8192, 8192] + - [49, 7109.0] + - - [3072, 480, 1, 3072, 3072, 3072, 3072, 3072] + - [38, 9740.0] + - - [4096, 480, 1, 4096, 4096, 4096, 4096, 4096] + - [38, 8100.0] + - - [8192, 480, 1, 8192, 8192, 8192, 8192, 8192] + - [63, 5877.0] + - - [1024, 3840, 1, 1024, 1024, 1024, 1024, 1024] + - [18, 10994.0] + - - [1024, 3840, 1, 4096, 1024, 1024, 4096, 4096] + - [58, 8775.0] + - - [1024, 3968, 1, 1024, 1024, 1024, 1024, 1024] + - [16, 10472.0] + - - [1024, 3968, 1, 4096, 1024, 1024, 4096, 4096] + - [5, 8148.0] + - - [1024, 7200, 1, 1024, 1024, 1024, 1024, 1024] + - [58, 10796.0] + - - [1024, 7200, 1, 4096, 1024, 1024, 4096, 4096] + - [16, 9475.0] + - - [1024, 8160, 1, 1024, 1024, 1024, 1024, 1024] + - [38, 11297.0] + - - [1024, 8160, 1, 4096, 1024, 1024, 4096, 4096] + - [62, 9352.0] + - - [1024, 9520, 1, 1024, 1024, 1024, 1024, 1024] + - [16, 10365.0] + - - [1024, 9520, 1, 4096, 1024, 1024, 4096, 4096] + - [38, 8375.0] + - - [1024, 10200, 1, 1024, 1024, 1024, 1024, 1024] + - [38, 10773.0] + - - [1024, 10200, 1, 4096, 1024, 1024, 4096, 4096] + - [57, 9437.0] + - - [4096, 3840, 1, 1024, 4096, 4096, 1024, 1024] + - [18, 8828.0] + - - [4096, 3968, 1, 1024, 4096, 4096, 1024, 1024] + - [16, 9189.0] + - - [4096, 7200, 1, 1024, 4096, 4096, 1024, 1024] + - [16, 8877.0] + - - [4096, 8160, 1, 1024, 4096, 4096, 1024, 1024] + - [16, 9365.0] + - - [4096, 9520, 1, 1024, 4096, 4096, 1024, 1024] + - [40, 9660.0] + - - [4096, 10200, 1, 1024, 4096, 4096, 1024, 1024] + - [16, 10127.0] + - - [42720, 3968, 1, 1024, 42720, 42720, 1024, 1024] + - [60, 11194.0] + - - [42720, 7200, 1, 1024, 42720, 42720, 1024, 1024] + - [60, 11450.0] + - - [42720, 9520, 1, 1024, 42720, 42720, 1024, 1024] + - [60, 11709.0] + - - [2048, 960, 1, 2048, 2048, 2048, 2048, 2048] + - [16, 9341.0] + - - [2048, 960, 1, 74, 2048, 2048, 74, 74] + - [24, 6239.0] + - - [1600, 1024, 1, 960, 1600, 1600, 960, 960] + - [45, 9573.0] + - - [2048, 2048, 1, 960, 2048, 2048, 960, 960] + - [23, 11677.0] + - - [4096, 1024, 1, 257, 4096, 4096, 257, 257] + - [27, 10323.0] + - - [10240, 8976, 1, 256, 10240, 10240, 256, 256] + - [16, 10954.0] + - - [1024, 1600, 1, 1024, 1024, 1024, 1024, 1024] + - [15, 8609.0] + - - [1024, 1600, 1, 560, 1024, 1024, 560, 560] + - [44, 9876.0] + - - [10496, 8976, 1, 256, 10496, 10496, 256, 256] + - [38, 10961.0] + - - [11264, 8976, 1, 256, 11264, 11264, 256, 256] + - [16, 11031.0] + - - [11776, 8976, 1, 256, 11776, 11776, 256, 256] + - [7, 10127.0] + - - [12544, 8976, 1, 256, 12544, 12544, 256, 256] + - [58, 10627.0] + - - [1280, 8976, 1, 256, 1280, 1280, 256, 256] + - [50, 9643.0] + - - [13312, 8976, 1, 256, 13312, 13312, 256, 256] + - [18, 10518.0] + - - [13568, 8976, 1, 256, 13568, 13568, 256, 256] + - [58, 10695.0] + - - [13824, 8976, 1, 256, 13824, 13824, 256, 256] + - [59, 10120.0] + - - [15104, 8976, 1, 256, 15104, 15104, 256, 256] + - [38, 10780.0] + - - [15360, 8976, 1, 256, 15360, 15360, 256, 256] + - [5, 8169.0] + - - [15872, 8976, 1, 256, 15872, 15872, 256, 256] + - [3, 10361.0] + - - [16128, 8976, 1, 256, 16128, 16128, 256, 256] + - [3, 10484.0] + - - [17152, 8976, 1, 256, 17152, 17152, 256, 256] + - [58, 8881.0] + - - [1792, 8976, 1, 256, 1792, 1792, 256, 256] + - [38, 10085.0] + - - [18176, 8976, 1, 256, 18176, 18176, 256, 256] + - [23, 10179.0] + - - [18688, 8976, 1, 256, 18688, 18688, 256, 256] + - [60, 9253.0] + - - [18944, 8976, 1, 256, 18944, 18944, 256, 256] + - [16, 9011.0] + - - [19712, 8976, 1, 256, 19712, 19712, 256, 256] + - [60, 11362.0] + - - [19968, 8976, 1, 256, 19968, 19968, 256, 256] + - [16, 9052.0] + - - [20480, 8976, 1, 256, 20480, 20480, 256, 256] + - [18, 9472.0] + - - [2048, 1536, 1, 512, 2048, 2048, 512, 512] + - [14, 9461.0] + - - [2048, 1536, 1, 768, 2048, 2048, 768, 768] + - [40, 10833.0] + - - [2048, 684, 1, 512, 2048, 2048, 512, 512] + - [58, 8412.0] + - - [2048, 684, 1, 768, 2048, 2048, 768, 768] + - [15, 8430.0] + - - [2048, 8976, 1, 256, 2048, 2048, 256, 256] + - [27, 11331.0] + - - [20992, 8976, 1, 256, 20992, 20992, 256, 256] + - [16, 9149.0] + - - [21248, 8976, 1, 256, 21248, 21248, 256, 256] + - [45, 10024.0] + - - [2304, 8976, 1, 256, 2304, 2304, 256, 256] + - [58, 11494.0] + - - [23552, 8976, 1, 256, 23552, 23552, 256, 256] + - [5, 9478.0] + - - [2560, 8976, 1, 256, 2560, 2560, 256, 256] + - [38, 11544.0] + - - [256, 10496, 1, 1024, 256, 256, 1024, 1024] + - [58, 10389.0] + - - [256, 11264, 1, 1024, 256, 256, 1024, 1024] + - [16, 10869.0] + - - [256, 11520, 1, 1024, 256, 256, 1024, 1024] + - [58, 11026.0] + - - [256, 11776, 1, 1024, 256, 256, 1024, 1024] + - [40, 10608.0] + - - [256, 12544, 1, 1024, 256, 256, 1024, 1024] + - [40, 11080.0] + - - [256, 13312, 1, 1024, 256, 256, 1024, 1024] + - [58, 10868.0] + - - [256, 14336, 1, 1024, 256, 256, 1024, 1024] + - [16, 10818.0] + - - [256, 14592, 1, 1024, 256, 256, 1024, 1024] + - [60, 11158.0] + - - [256, 14848, 1, 1024, 256, 256, 1024, 1024] + - [40, 11214.0] + - - [256, 15104, 1, 1024, 256, 256, 1024, 1024] + - [58, 11299.0] + - - [256, 16128, 1, 1024, 256, 256, 1024, 1024] + - [58, 10038.0] + - - [256, 18176, 1, 1024, 256, 256, 1024, 1024] + - [58, 11159.0] + - - [256, 18944, 1, 1024, 256, 256, 1024, 1024] + - [58, 11476.0] + - - [256, 19200, 1, 1024, 256, 256, 1024, 1024] + - [58, 11409.0] + - - [256, 20480, 1, 1024, 256, 256, 1024, 1024] + - [16, 11550.0] + - - [256, 20992, 1, 1024, 256, 256, 1024, 1024] + - [38, 4325.0] + - - [256, 21248, 1, 1024, 256, 256, 1024, 1024] + - [37, 4127.0] + - - [256, 21504, 1, 1024, 256, 256, 1024, 1024] + - [37, 4083.0] + - - [256, 22016, 1, 1024, 256, 256, 1024, 1024] + - [38, 10927.0] + - - [256, 22344, 1, 1024, 256, 256, 1024, 1024] + - [16, 11093.0] + - - [256, 23296, 1, 1024, 256, 256, 1024, 1024] + - [58, 11286.0] + - - [256, 23552, 1, 1024, 256, 256, 1024, 1024] + - [58, 11336.0] + - - [256, 31488, 1, 1024, 256, 256, 1024, 1024] + - [38, 11764.0] + - - [256, 33536, 1, 1024, 256, 256, 1024, 1024] + - [60, 11469.0] + - - [256, 44505, 1, 1024, 256, 256, 1024, 1024] + - [49, 7782.0] + - - [256, 4608, 1, 1024, 256, 256, 1024, 1024] + - [18, 8931.0] + - - [256, 4864, 1, 1024, 256, 256, 1024, 1024] + - [60, 10392.0] + - - [256, 5376, 1, 1024, 256, 256, 1024, 1024] + - [58, 9025.0] + - - [256, 5888, 1, 1024, 256, 256, 1024, 1024] + - [58, 9854.0] + - - [256, 6144, 1, 1024, 256, 256, 1024, 1024] + - [38, 10273.0] + - - [256, 6400, 1, 1024, 256, 256, 1024, 1024] + - [16, 10471.0] + - - [256, 6656, 1, 1024, 256, 256, 1024, 1024] + - [40, 9813.0] + - - [256, 7168, 1, 1024, 256, 256, 1024, 1024] + - [40, 10507.0] + - - [256, 7424, 1, 1024, 256, 256, 1024, 1024] + - [60, 10738.0] + - - [256, 7936, 1, 1024, 256, 256, 1024, 1024] + - [58, 9764.0] + - - [256, 8192, 1, 1024, 256, 256, 1024, 1024] + - [58, 10104.0] + - - [256, 8448, 1, 1024, 256, 256, 1024, 1024] + - [16, 10388.0] + - - [256, 8960, 1, 1024, 256, 256, 1024, 1024] + - [16, 10765.0] + - - [256, 9984, 1, 1024, 256, 256, 1024, 1024] + - [40, 11103.0] + - - [2816, 8976, 1, 256, 2816, 2816, 256, 256] + - [16, 11562.0] + - - [28672, 8976, 1, 256, 28672, 28672, 256, 256] + - [18, 9518.0] + - - [3072, 8976, 1, 256, 3072, 3072, 256, 256] + - [16, 11626.0] + - - [31488, 8976, 1, 256, 31488, 31488, 256, 256] + - [18, 10000.0] + - - [3328, 8976, 1, 256, 3328, 3328, 256, 256] + - [16, 11652.0] + - - [33536, 8976, 1, 256, 33536, 33536, 256, 256] + - [34, 10489.0] + - - [3840, 8976, 1, 256, 3840, 3840, 256, 256] + - [27, 11686.0] + - - [4096, 8976, 1, 256, 4096, 4096, 256, 256] + - [16, 11736.0] + - - [4352, 8976, 1, 256, 4352, 4352, 256, 256] + - [16, 11766.0] + - - [44505, 8976, 1, 256, 44505, 44505, 256, 256] + - [59, 10392.0] + - - [4608, 8976, 1, 256, 4608, 4608, 256, 256] + - [18, 11578.0] + - - [4864, 8976, 1, 256, 4864, 4864, 256, 256] + - [58, 11645.0] + - - [5120, 8976, 1, 256, 5120, 5120, 256, 256] + - [18, 10378.0] + - - [5376, 8976, 1, 256, 5376, 5376, 256, 256] + - [38, 11637.0] + - - [5632, 8976, 1, 256, 5632, 5632, 256, 256] + - [38, 11709.0] + - - [5888, 8976, 1, 256, 5888, 5888, 256, 256] + - [5, 11623.0] + - - [6144, 8976, 1, 256, 6144, 6144, 256, 256] + - [16, 11680.0] + - - [6400, 8976, 1, 256, 6400, 6400, 256, 256] + - [16, 11764.0] + - - [684, 8976, 1, 256, 684, 684, 256, 256] + - [13, 8785.0] + - - [7168, 8976, 1, 256, 7168, 7168, 256, 256] + - [16, 10586.0] + - - [7936, 8976, 1, 256, 7936, 7936, 256, 256] + - [16, 11884.0] + - - [8192, 8976, 1, 256, 8192, 8192, 256, 256] + - [29, 8160.0] + - - [8448, 8976, 1, 256, 8448, 8448, 256, 256] + - [58, 10990.0] + - - [8960, 8976, 1, 256, 8960, 8960, 256, 256] + - [18, 10901.0] + - - [9472, 8976, 1, 256, 9472, 9472, 256, 256] + - [16, 10883.0] + - - [9728, 8976, 1, 256, 9728, 9728, 256, 256] + - [38, 10932.0] + - - [9984, 8976, 1, 256, 9984, 9984, 256, 256] + - [16, 11121.0] + - - [512, 32768, 1, 13, 512, 512, 13, 13] + - [39, 2906.0] + - - [256, 32768, 1, 512, 256, 256, 512, 512] + - [16, 11453.0] + - - [128, 32768, 1, 512, 128, 128, 512, 512] + - [58, 11334.0] + - - [1024, 32768, 1, 479, 1024, 1024, 479, 479] + - [7, 11731.0] + - - [1024, 32768, 1, 1024, 1024, 1024, 1024, 1024] + - [16, 8199.0] + - - [512, 32768, 1, 1024, 512, 512, 1024, 1024] + - [38, 6431.0] + - - [1023, 2048, 1, 4096, 1023, 1023, 4096, 4096] + - [38, 10303.0] + - - [1025, 2048, 1, 4096, 1025, 1025, 4096, 4096] + - [40, 9400.0] + - - [1024, 2047, 1, 4096, 1024, 1024, 4096, 4096] + - [38, 10068.0] + - - [1024, 2049, 1, 4096, 1024, 1024, 4096, 4096] + - [18, 9323.0] + - - [1024, 2048, 1, 4095, 1024, 1024, 4095, 4095] + - [23, 10778.0] + - - [1024, 2048, 1, 4097, 1024, 1024, 4097, 4097] + - [45, 10676.0] + - - [1023, 3072, 1, 1024, 1023, 1023, 1024, 1024] + - [40, 10685.0] + - - [1025, 3072, 1, 1024, 1025, 1025, 1024, 1024] + - [16, 9843.0] + - - [1024, 3071, 1, 1024, 1024, 1024, 1024, 1024] + - [18, 10813.0] + - - [1024, 3073, 1, 1024, 1024, 1024, 1024, 1024] + - [16, 10715.0] + - - [1024, 3072, 1, 1023, 1024, 1024, 1023, 1023] + - [51, 10942.0] + - - [1024, 3072, 1, 1025, 1024, 1024, 1025, 1025] + - [7, 11100.0] + - - [3071, 512, 1, 1024, 3071, 3071, 1024, 1024] + - [16, 10009.0] + - - [3073, 512, 1, 1024, 3073, 3073, 1024, 1024] + - [58, 9967.0] + - - [3072, 511, 1, 1024, 3072, 3072, 1024, 1024] + - [38, 9982.0] + - - [3072, 513, 1, 1024, 3072, 3072, 1024, 1024] + - [16, 8690.0] + - - [3072, 512, 1, 1023, 3072, 3072, 1023, 1023] + - [23, 10646.0] + - - [3072, 512, 1, 1025, 3072, 3072, 1025, 1025] + - [58, 10052.0] + - - [128, 32768, 1, 256, 128, 128, 256, 256] + - [16, 9714.0] + - - [1024, 4096, 1, 480, 1024, 1024, 480, 480] + - [1, 11207.0] + - - [512, 4096, 1, 1024, 512, 512, 1024, 1024] + - [38, 10172.0] + - - [512, 55296, 1, 13, 512, 512, 13, 13] + - [7, 2486.0] + - - [256, 55296, 1, 512, 256, 256, 512, 512] + - [60, 11750.0] + - - [128, 55296, 1, 256, 128, 128, 256, 256] + - [38, 11270.0] + - - [1024, 6912, 1, 480, 1024, 1024, 480, 480] + - [1, 11975.0] + - - [1024, 6912, 1, 1024, 1024, 1024, 1024, 1024] + - [18, 11449.0] + - - [512, 6912, 1, 1024, 512, 512, 1024, 1024] + - [58, 11017.0] + - - [256, 6912, 1, 512, 256, 256, 512, 512] + - [60, 9489.0] + - - [1151, 1152, 1, 1152, 1151, 1151, 1152, 1152] + - [49, 8850.0] + - - [1153, 1152, 1, 1152, 1153, 1153, 1152, 1152] + - [38, 8886.0] + - - [1152, 1151, 1, 1152, 1152, 1152, 1152, 1152] + - [58, 8805.0] + - - [1152, 1153, 1, 1152, 1152, 1152, 1152, 1152] + - [27, 8856.0] + - - [1152, 1152, 1, 1151, 1152, 1152, 1151, 1151] + - [1, 9179.0] + - - [1152, 1152, 1, 1153, 1152, 1152, 1153, 1153] + - [44, 9268.0] + - - [1535, 1536, 1, 1536, 1535, 1535, 1536, 1536] + - [18, 10426.0] + - - [1537, 1536, 1, 1536, 1537, 1537, 1536, 1536] + - [60, 10417.0] + - - [1536, 1535, 1, 1536, 1536, 1536, 1536, 1536] + - [18, 10405.0] + - - [1536, 1537, 1, 1536, 1536, 1536, 1536, 1536] + - [58, 10115.0] + - - [1536, 1536, 1, 1535, 1536, 1536, 1535, 1535] + - [29, 10654.0] + - - [1536, 1536, 1, 1537, 1536, 1536, 1537, 1537] + - [29, 10652.0] + - - [1919, 1920, 1, 1920, 1919, 1919, 1920, 1920] + - [49, 10940.0] + - - [1921, 1920, 1, 1920, 1921, 1921, 1920, 1920] + - [5, 10932.0] + - - [1920, 1919, 1, 1920, 1920, 1920, 1920, 1920] + - [27, 10983.0] + - - [1920, 1921, 1, 1920, 1920, 1920, 1920, 1920] + - [58, 10905.0] + - - [1920, 1920, 1, 1919, 1920, 1920, 1919, 1919] + - [1, 11270.0] + - - [1920, 1920, 1, 1921, 1920, 1920, 1921, 1921] + - [45, 11396.0] + - - [2303, 2304, 1, 2304, 2303, 2303, 2304, 2304] + - [16, 11114.0] + - - [2305, 2304, 1, 2304, 2305, 2305, 2304, 2304] + - [60, 10823.0] + - - [2304, 2303, 1, 2304, 2304, 2304, 2304, 2304] + - [58, 11137.0] + - - [2304, 2305, 1, 2304, 2304, 2304, 2304, 2304] + - [60, 10794.0] + - - [2304, 2304, 1, 2303, 2304, 2304, 2303, 2303] + - [45, 11514.0] + - - [2304, 2304, 1, 2305, 2304, 2304, 2305, 2305] + - [23, 11504.0] + - - [2687, 2688, 1, 2688, 2687, 2687, 2688, 2688] + - [49, 11202.0] + - - [2689, 2688, 1, 2688, 2689, 2689, 2688, 2688] + - [27, 10710.0] + - - [2688, 2687, 1, 2688, 2688, 2688, 2688, 2688] + - [27, 11052.0] + - - [2688, 2689, 1, 2688, 2688, 2688, 2688, 2688] + - [49, 10746.0] + - - [2688, 2688, 1, 2687, 2688, 2688, 2687, 2687] + - [49, 11159.0] + - - [2688, 2688, 1, 2689, 2688, 2688, 2689, 2689] + - [49, 11176.0] + - - [3455, 3456, 1, 3456, 3455, 3455, 3456, 3456] + - [27, 10811.0] + - - [3457, 3456, 1, 3456, 3457, 3457, 3456, 3456] + - [27, 10542.0] + - - [3456, 3455, 1, 3456, 3456, 3456, 3456, 3456] + - [27, 11861.0] + - - [3456, 3457, 1, 3456, 3456, 3456, 3456, 3456] + - [27, 10392.0] + - - [3456, 3456, 1, 3455, 3456, 3456, 3455, 3455] + - [27, 10914.0] + - - [3456, 3456, 1, 3457, 3456, 3456, 3457, 3457] + - [18, 11825.0] + - - [3839, 3840, 1, 3840, 3839, 3839, 3840, 3840] + - [38, 10134.0] + - - [3841, 3840, 1, 3840, 3841, 3841, 3840, 3840] + - [29, 10870.0] + - - [3840, 3839, 1, 3840, 3840, 3840, 3840, 3840] + - [38, 10066.0] + - - [3840, 3841, 1, 3840, 3840, 3840, 3840, 3840] + - [59, 10178.0] + - - [3840, 3840, 1, 3839, 3840, 3840, 3839, 3839] + - [18, 11299.0] + - - [3840, 3840, 1, 3841, 3840, 3840, 3841, 3841] + - [27, 12041.0] + - - [4223, 4224, 1, 4224, 4223, 4223, 4224, 4224] + - [38, 11175.0] + - - [4225, 4224, 1, 4224, 4225, 4225, 4224, 4224] + - [27, 10940.0] + - - [4224, 4223, 1, 4224, 4224, 4224, 4224, 4224] + - [58, 11268.0] + - - [4224, 4225, 1, 4224, 4224, 4224, 4224, 4224] + - [47, 10884.0] + - - [4224, 4224, 1, 4223, 4224, 4224, 4223, 4223] + - [49, 11475.0] + - - [4224, 4224, 1, 4225, 4224, 4224, 4225, 4225] + - [27, 11450.0] + - - [4607, 4608, 1, 4608, 4607, 4607, 4608, 4608] + - [16, 10872.0] + - - [4609, 4608, 1, 4608, 4609, 4609, 4608, 4608] + - [16, 10621.0] + - - [4608, 4607, 1, 4608, 4608, 4608, 4608, 4608] + - [58, 11802.0] + - - [4608, 4609, 1, 4608, 4608, 4608, 4608, 4608] + - [58, 10664.0] + - - [4608, 4608, 1, 4607, 4608, 4608, 4607, 4607] + - [18, 11602.0] + - - [4608, 4608, 1, 4609, 4608, 4608, 4609, 4609] + - [18, 11623.0] + - - [4991, 4992, 1, 4992, 4991, 4991, 4992, 4992] + - [58, 11280.0] + - - [4993, 4992, 1, 4992, 4993, 4993, 4992, 4992] + - [40, 10953.0] + - - [4992, 4991, 1, 4992, 4992, 4992, 4992, 4992] + - [58, 11189.0] + - - [4992, 4993, 1, 4992, 4992, 4992, 4992, 4992] + - [58, 11043.0] + - - [4992, 4992, 1, 4991, 4992, 4992, 4991, 4991] + - [49, 12062.0] + - - [4992, 4992, 1, 4993, 4992, 4992, 4993, 4993] + - [25, 11510.0] + - - [5375, 5376, 1, 5376, 5375, 5375, 5376, 5376] + - [60, 11156.0] + - - [5377, 5376, 1, 5376, 5377, 5377, 5376, 5376] + - [38, 10992.0] + - - [5376, 5375, 1, 5376, 5376, 5376, 5376, 5376] + - [58, 12048.0] + - - [5376, 5377, 1, 5376, 5376, 5376, 5376, 5376] + - [16, 11420.0] + - - [5376, 5376, 1, 5375, 5376, 5376, 5375, 5375] + - [40, 11716.0] + - - [5376, 5376, 1, 5377, 5376, 5376, 5377, 5377] + - [40, 11658.0] + - - [5759, 5760, 1, 5760, 5759, 5759, 5760, 5760] + - [49, 11477.0] + - - [5761, 5760, 1, 5760, 5761, 5761, 5760, 5760] + - [40, 11206.0] + - - [5760, 5759, 1, 5760, 5760, 5760, 5760, 5760] + - [60, 11518.0] + - - [5760, 5761, 1, 5760, 5760, 5760, 5760, 5760] + - [40, 11368.0] + - - [5760, 5760, 1, 5759, 5760, 5760, 5759, 5759] + - [29, 11663.0] + - - [5760, 5760, 1, 5761, 5760, 5760, 5761, 5761] + - [40, 11625.0] + - - [6143, 6144, 1, 6144, 6143, 6143, 6144, 6144] + - [18, 10526.0] + - - [6145, 6144, 1, 6144, 6145, 6145, 6144, 6144] + - [18, 10619.0] + - - [6144, 6143, 1, 6144, 6144, 6144, 6144, 6144] + - [18, 10678.0] + - - [6144, 6145, 1, 6144, 6144, 6144, 6144, 6144] + - [18, 10249.0] + - - [6144, 6144, 1, 6143, 6144, 6144, 6143, 6143] + - [51, 11900.0] + - - [6144, 6144, 1, 6145, 6144, 6144, 6145, 6145] + - [38, 12039.0] + - - [6527, 6528, 1, 6528, 6527, 6527, 6528, 6528] + - [58, 11607.0] + - - [6529, 6528, 1, 6528, 6529, 6529, 6528, 6528] + - [58, 11473.0] + - - [6528, 6527, 1, 6528, 6528, 6528, 6528, 6528] + - [58, 11657.0] + - - [6528, 6529, 1, 6528, 6528, 6528, 6528, 6528] + - [38, 11756.0] + - - [6528, 6528, 1, 6527, 6528, 6528, 6527, 6527] + - [49, 11659.0] + - - [6528, 6528, 1, 6529, 6528, 6528, 6529, 6529] + - [49, 11802.0] + - - [6911, 6912, 1, 6912, 6911, 6911, 6912, 6912] + - [60, 11830.0] + - - [6913, 6912, 1, 6912, 6913, 6913, 6912, 6912] + - [60, 11588.0] + - - [6912, 6911, 1, 6912, 6912, 6912, 6912, 6912] + - [40, 11845.0] + - - [6912, 6913, 1, 6912, 6912, 6912, 6912, 6912] + - [58, 11466.0] + - - [6912, 6912, 1, 6911, 6912, 6912, 6911, 6911] + - [25, 12093.0] + - - [6912, 6912, 1, 6913, 6912, 6912, 6913, 6913] + - [29, 12117.0] + - - [7295, 7296, 1, 7296, 7295, 7295, 7296, 7296] + - [60, 12119.0] + - - [7297, 7296, 1, 7296, 7297, 7297, 7296, 7296] + - [58, 11641.0] + - - [7296, 7295, 1, 7296, 7296, 7296, 7296, 7296] + - [51, 11762.0] + - - [7296, 7297, 1, 7296, 7296, 7296, 7296, 7296] + - [60, 11790.0] + - - [7296, 7296, 1, 7295, 7296, 7296, 7295, 7295] + - [5, 11951.0] + - - [7296, 7296, 1, 7297, 7296, 7296, 7297, 7297] + - [29, 11979.0] + - - [7679, 7680, 1, 7680, 7679, 7679, 7680, 7680] + - [18, 11346.0] + - - [7681, 7680, 1, 7680, 7681, 7681, 7680, 7680] + - [29, 11859.0] + - - [7680, 7679, 1, 7680, 7680, 7680, 7680, 7680] + - [60, 11756.0] + - - [7680, 7681, 1, 7680, 7680, 7680, 7680, 7680] + - [60, 11452.0] + - - [7680, 7680, 1, 7679, 7680, 7680, 7679, 7679] + - [40, 12066.0] + - - [7680, 7680, 1, 7681, 7680, 7680, 7681, 7681] + - [51, 12046.0] + - - [1152, 1152, 1, 1152, 1152, 1152, 1152, 1152] + - [13, 8578.0] + - - [1536, 1536, 1, 1536, 1536, 1536, 1536, 1536] + - [18, 10604.0] + - - [1920, 1920, 1, 1920, 1920, 1920, 1920, 1920] + - [58, 10948.0] + - - [2304, 2304, 1, 2304, 2304, 2304, 2304, 2304] + - [5, 11251.0] + - - [2688, 2688, 1, 2688, 2688, 2688, 2688, 2688] + - [49, 11140.0] + - - [3456, 3456, 1, 3456, 3456, 3456, 3456, 3456] + - [27, 10725.0] + - - [3840, 3840, 1, 3840, 3840, 3840, 3840, 3840] + - [58, 10311.0] + - - [4224, 4224, 1, 4224, 4224, 4224, 4224, 4224] + - [27, 11267.0] + - - [4608, 4608, 1, 4608, 4608, 4608, 4608, 4608] + - [38, 10852.0] + - - [4992, 4992, 1, 4992, 4992, 4992, 4992, 4992] + - [49, 12138.0] + - - [5376, 5376, 1, 5376, 5376, 5376, 5376, 5376] + - [60, 11426.0] + - - [5760, 5760, 1, 5760, 5760, 5760, 5760, 5760] + - [38, 11411.0] + - - [6144, 6144, 1, 6144, 6144, 6144, 6144, 6144] + - [38, 10632.0] + - - [6528, 6528, 1, 6528, 6528, 6528, 6528, 6528] + - [58, 12076.0] + - - [6912, 6912, 1, 6912, 6912, 6912, 6912, 6912] + - [60, 11818.0] + - - [7296, 7296, 1, 7296, 7296, 7296, 7296, 7296] + - [38, 12015.0] + - - [7680, 7680, 1, 7680, 7680, 7680, 7680, 7680] + - [60, 11731.0] + - - [256, 128, 49, 1152, 256, 256, 1152, 1152] + - [58, 10226.0] + - - [256, 128, 121, 120, 256, 256, 120, 120] + - [22, 8368.0] + - - [256, 128, 169, 120, 256, 256, 120, 120] + - [22, 8906.0] + - - [256, 128, 36, 120, 256, 256, 120, 120] + - [0, 7538.0] + - - [256, 128, 49, 120, 256, 256, 120, 120] + - [44, 7462.0] + - - [256, 128, 64, 120, 256, 256, 120, 120] + - [44, 7186.0] + - - [256, 128, 36, 12000, 256, 256, 12000, 12000] + - [38, 8942.0] + - - [256, 128, 49, 1216, 256, 256, 1216, 1216] + - [27, 10750.0] + - - [256, 128, 121, 18, 256, 256, 18, 18] + - [0, 2741.0] + - - [256, 128, 169, 18, 256, 256, 18, 18] + - [0, 3090.0] + - - [256, 128, 36, 18, 256, 256, 18, 18] + - [15, 2397.0] + - - [256, 128, 49, 18, 256, 256, 18, 18] + - [4, 2396.0] + - - [256, 128, 64, 18, 256, 256, 18, 18] + - [15, 2658.0] + - - [256, 128, 36, 1800, 256, 256, 1800, 1800] + - [16, 9906.0] + - - [256, 128, 121, 19, 256, 256, 19, 19] + - [4, 3157.0] + - - [256, 128, 169, 19, 256, 256, 19, 19] + - [11, 2792.0] + - - [256, 128, 36, 19, 256, 256, 19, 19] + - [0, 2379.0] + - - [256, 128, 49, 19, 256, 256, 19, 19] + - [15, 2538.0] + - - [256, 128, 64, 19, 256, 256, 19, 19] + - [15, 2737.0] + - - [256, 128, 36, 1900, 256, 256, 1900, 1900] + - [49, 9875.0] + - - [256, 128, 49, 480, 256, 256, 480, 480] + - [1, 10067.0] + - - [256, 128, 81, 480, 256, 256, 480, 480] + - [27, 8142.0] + - - [256, 128, 64, 5880, 256, 256, 5880, 5880] + - [58, 9388.0] + - - [256, 128, 49, 72, 256, 256, 72, 72] + - [2, 5984.0] + - - [256, 128, 81, 72, 256, 256, 72, 72] + - [44, 5987.0] + - - [256, 128, 49, 76, 256, 256, 76, 76] + - [28, 5347.0] + - - [256, 128, 81, 76, 256, 256, 76, 76] + - [44, 5777.0] + - - [256, 128, 49, 7680, 256, 256, 7680, 7680] + - [21, 3512.0] + - - [256, 128, 64, 882, 256, 256, 882, 882] + - [5, 9866.0] + - - [256, 128, 64, 931, 256, 256, 931, 931] + - [5, 9818.0] + - - [256, 256, 49, 1152, 256, 256, 1152, 1152] + - [18, 10896.0] + - - [256, 256, 36, 12000, 256, 256, 12000, 12000] + - [31, 8650.0] + - - [256, 256, 49, 1216, 256, 256, 1216, 1216] + - [51, 11642.0] + - - [256, 256, 36, 1800, 256, 256, 1800, 1800] + - [7, 10794.0] + - - [256, 256, 36, 1900, 256, 256, 1900, 1900] + - [7, 10809.0] + - - [256, 256, 64, 5880, 256, 256, 5880, 5880] + - [0, 10277.0] + - - [256, 256, 49, 7680, 256, 256, 7680, 7680] + - [32, 7876.0] + - - [256, 256, 64, 882, 256, 256, 882, 882] + - [49, 11172.0] + - - [256, 256, 64, 931, 256, 256, 931, 931] + - [49, 11361.0] + - - [340, 256, 49, 1152, 340, 340, 1152, 1152] + - [49, 10135.0] + - - [340, 256, 36, 120, 340, 340, 120, 120] + - [2, 8660.0] + - - [340, 256, 49, 120, 340, 340, 120, 120] + - [12, 8988.0] + - - [340, 256, 64, 120, 340, 340, 120, 120] + - [1, 9092.0] + - - [340, 256, 36, 12000, 340, 340, 12000, 12000] + - [60, 8409.0] + - - [340, 256, 49, 1216, 340, 340, 1216, 1216] + - [23, 10441.0] + - - [340, 256, 36, 18, 340, 340, 18, 18] + - [2, 2869.0] + - - [340, 256, 49, 18, 340, 340, 18, 18] + - [28, 3039.0] + - - [340, 256, 64, 18, 340, 340, 18, 18] + - [11, 3199.0] + - - [340, 256, 36, 1800, 340, 340, 1800, 1800] + - [1, 10422.0] + - - [340, 256, 36, 19, 340, 340, 19, 19] + - [2, 3038.0] + - - [340, 256, 49, 19, 340, 340, 19, 19] + - [6, 3143.0] + - - [340, 256, 64, 19, 340, 340, 19, 19] + - [6, 3310.0] + - - [340, 256, 36, 1900, 340, 340, 1900, 1900] + - [23, 10364.0] + - - [340, 256, 64, 5880, 340, 340, 5880, 5880] + - [0, 9242.0] + - - [340, 256, 49, 7680, 340, 340, 7680, 7680] + - [20, 5954.0] + - - [340, 256, 64, 882, 340, 340, 882, 882] + - [7, 10158.0] + - - [340, 256, 64, 931, 340, 340, 931, 931] + - [51, 10164.0] + - - [510, 256, 49, 120, 510, 510, 120, 120] + - [46, 9785.0] + - - [510, 256, 64, 120, 510, 510, 120, 120] + - [46, 10019.0] + - - [510, 256, 49, 18, 510, 510, 18, 18] + - [50, 3500.0] + - - [510, 256, 64, 18, 510, 510, 18, 18] + - [53, 3557.0] + - - [510, 256, 49, 19, 510, 510, 19, 19] + - [44, 3626.0] + - - [510, 256, 64, 19, 510, 510, 19, 19] + - [50, 3750.0] + - - [510, 256, 36, 480, 510, 510, 480, 480] + - [1, 11253.0] + - - [510, 256, 36, 72, 510, 510, 72, 72] + - [46, 7231.0] + - - [510, 256, 36, 76, 510, 510, 76, 76] + - [35, 7433.0] + - - [510, 512, 36, 1080, 510, 510, 1080, 1080] + - [45, 12072.0] + - - [510, 512, 36, 162, 510, 510, 162, 162] + - [27, 10355.0] + - - [510, 512, 36, 171, 510, 510, 171, 171] + - [45, 10668.0] + - - [510, 512, 49, 1920, 510, 510, 1920, 1920] + - [51, 10168.0] + - - [510, 512, 64, 1920, 510, 510, 1920, 1920] + - [4, 9444.0] + - - [510, 512, 49, 288, 510, 510, 288, 288] + - [45, 11432.0] + - - [510, 512, 64, 288, 510, 510, 288, 288] + - [23, 11718.0] + - - [510, 512, 36, 3000, 510, 510, 3000, 3000] + - [1, 10777.0] + - - [510, 512, 49, 304, 510, 510, 304, 304] + - [23, 11609.0] + - - [510, 512, 64, 304, 510, 510, 304, 304] + - [45, 11718.0] + - - [510, 512, 36, 450, 510, 510, 450, 450] + - [27, 11535.0] + - - [510, 512, 36, 475, 510, 510, 475, 475] + - [1, 11612.0] + - - [510, 512, 49, 480, 510, 510, 480, 480] + - [23, 11843.0] + - - [510, 512, 64, 480, 510, 510, 480, 480] + - [23, 11939.0] + - - [510, 512, 49, 72, 510, 510, 72, 72] + - [46, 8944.0] + - - [510, 512, 64, 72, 510, 510, 72, 72] + - [46, 9118.0] + - - [510, 512, 49, 76, 510, 510, 76, 76] + - [39, 8905.0] + - - [510, 512, 64, 76, 510, 510, 76, 76] + - [27, 9269.0] + - - [512, 256, 81, 1080, 512, 512, 1080, 1080] + - [49, 11732.0] + - - [512, 256, 25, 12000, 512, 512, 12000, 12000] + - [36, 8730.0] + - - [512, 256, 81, 162, 512, 512, 162, 162] + - [5, 10481.0] + - - [512, 256, 81, 171, 512, 512, 171, 171] + - [5, 10813.0] + - - [512, 256, 25, 1800, 512, 512, 1800, 1800] + - [29, 12024.0] + - - [512, 256, 25, 1900, 512, 512, 1900, 1900] + - [7, 12034.0] + - - [512, 256, 121, 1920, 512, 512, 1920, 1920] + - [18, 7685.0] + - - [512, 256, 169, 1920, 512, 512, 1920, 1920] + - [29, 11275.0] + - - [512, 256, 49, 1920, 512, 512, 1920, 1920] + - [40, 11258.0] + - - [512, 256, 121, 288, 512, 512, 288, 288] + - [5, 11689.0] + - - [512, 256, 169, 288, 512, 512, 288, 288] + - [1, 11880.0] + - - [512, 256, 49, 288, 512, 512, 288, 288] + - [1, 11566.0] + - - [512, 256, 25, 3000, 512, 512, 3000, 3000] + - [51, 12168.0] + - - [512, 256, 81, 3000, 512, 512, 3000, 3000] + - [1, 10411.0] + - - [512, 256, 121, 304, 512, 512, 304, 304] + - [5, 11712.0] + - - [512, 256, 169, 304, 512, 512, 304, 304] + - [5, 11849.0] + - - [512, 256, 49, 304, 512, 512, 304, 304] + - [1, 11572.0] + - - [512, 256, 25, 450, 512, 512, 450, 450] + - [1, 11618.0] + - - [512, 256, 81, 450, 512, 512, 450, 450] + - [5, 11476.0] + - - [512, 256, 25, 475, 512, 512, 475, 475] + - [1, 11516.0] + - - [512, 256, 81, 475, 512, 512, 475, 475] + - [27, 11519.0] + - - [512, 256, 121, 480, 512, 512, 480, 480] + - [1, 12053.0] + - - [512, 256, 169, 480, 512, 512, 480, 480] + - [51, 12093.0] + - - [512, 256, 49, 5880, 512, 512, 5880, 5880] + - [60, 12146.0] + - - [512, 256, 121, 72, 512, 512, 72, 72] + - [24, 10381.0] + - - [512, 256, 169, 72, 512, 512, 72, 72] + - [0, 9630.0] + - - [512, 256, 121, 76, 512, 512, 76, 76] + - [24, 9971.0] + - - [512, 256, 169, 76, 512, 512, 76, 76] + - [1, 9225.0] + - - [512, 256, 49, 882, 512, 512, 882, 882] + - [29, 11697.0] + - - [512, 256, 49, 931, 512, 512, 931, 931] + - [7, 11701.0] + - - [2304, 512, 1, 100, 2304, 2304, 100, 100] + - [0, 7843.0] + - - [2304, 512, 1, 361, 2304, 2304, 361, 361] + - [45, 9150.0] + - - [4608, 510, 1, 100, 4608, 4608, 100, 100] + - [53, 7079.0] + - - [4608, 510, 1, 361, 4608, 4608, 361, 361] + - [45, 9599.0] + - - [8192, 7680, 1, 8192, 8192, 8192, 8192, 8192] + - [60, 12062.0] + - - [4096, 3840, 1, 4096, 4096, 4096, 4096, 4096] + - [18, 12105.0] + - - [2048, 1920, 1, 2048, 2048, 2048, 2048, 2048] + - [16, 11800.0] + - - [30522, 616, 1, 1024, 30522, 30522, 1024, 1024] + - [58, 11432.0] + - - [128, 128, 128, 64, 128, 128, 64, 64] + - [22, 7279.0] + - - [128, 128, 160, 64, 128, 128, 64, 64] + - [50, 8248.0] + - - [1024, 1280, 1, 1024, 1024, 1024, 1024, 1024] + - [18, 11440.0] + - - [1024, 1280, 1, 4096, 1024, 1024, 4096, 4096] + - [18, 11962.0] + - - [4096, 1280, 1, 1024, 4096, 4096, 1024, 1024] + - [18, 11856.0] + - - [30522, 200, 1, 1024, 30522, 30522, 1024, 1024] + - [58, 9179.0] + - - [128, 128, 624, 64, 128, 128, 64, 64] + - [59, 9499.0] + - - [1024, 4992, 1, 1024, 1024, 1024, 1024, 1024] + - [16, 11584.0] + - - [1024, 4992, 1, 4096, 1024, 1024, 4096, 4096] + - [18, 11787.0] + - - [4096, 4992, 1, 1024, 4096, 4096, 1024, 1024] + - [16, 11975.0] + - - [30522, 780, 1, 1024, 30522, 30522, 1024, 1024] + - [58, 10470.0] + - - [30522, 308, 1, 1024, 30522, 30522, 1024, 1024] + - [58, 9473.0] + - - [128, 128, 640, 64, 128, 128, 64, 64] + - [28, 9423.0] + - - [1024, 5120, 1, 1024, 1024, 1024, 1024, 1024] + - [18, 11944.0] + - - [1024, 5120, 1, 4096, 1024, 1024, 4096, 4096] + - [40, 12040.0] + - - [4096, 5120, 1, 1024, 4096, 4096, 1024, 1024] + - [18, 12202.0] + - - [30522, 800, 1, 1024, 30522, 30522, 1024, 1024] + - [58, 10836.0] + - - [128, 128, 656, 64, 128, 128, 64, 64] + - [50, 9672.0] + - - [1024, 5248, 1, 1024, 1024, 1024, 1024, 1024] + - [16, 11564.0] + - - [1024, 5248, 1, 4096, 1024, 1024, 4096, 4096] + - [18, 11252.0] + - - [4096, 5248, 1, 1024, 4096, 4096, 1024, 1024] + - [16, 12012.0] + - - [30522, 820, 1, 1024, 30522, 30522, 1024, 1024] + - [58, 10988.0] + - - [512, 512, 80, 64, 512, 512, 64, 64] + - [5, 10562.0] + - - [1024, 2560, 1, 1024, 1024, 1024, 1024, 1024] + - [18, 11623.0] + - - [1024, 2560, 1, 4096, 1024, 1024, 4096, 4096] + - [18, 12167.0] + - - [4096, 2560, 1, 1024, 4096, 4096, 1024, 1024] + - [18, 12126.0] + - - [30522, 385, 1, 1024, 30522, 30522, 1024, 1024] + - [58, 8945.0] + - - [30522, 462, 1, 1024, 30522, 30522, 1024, 1024] + - [38, 10632.0] + - - [128, 128, 144, 64, 128, 128, 64, 64] + - [22, 7402.0] + - - [1024, 1152, 1, 1024, 1024, 1024, 1024, 1024] + - [60, 10160.0] + - - [1024, 1152, 1, 4096, 1024, 1024, 4096, 4096] + - [60, 10721.0] + - - [4096, 1152, 1, 1024, 4096, 4096, 1024, 1024] + - [16, 11390.0] + - - [30522, 180, 1, 1024, 30522, 30522, 1024, 1024] + - [38, 8235.0] + - - [1024, 8192, 1, 1024, 1024, 1024, 1024, 1024] + - [18, 11957.0] + - - [1024, 8192, 1, 4096, 1024, 1024, 4096, 4096] + - [40, 11941.0] + - - [1024, 9600, 1, 1024, 1024, 1024, 1024, 1024] + - [16, 11965.0] + - - [1024, 9600, 1, 4096, 1024, 1024, 4096, 4096] + - [40, 11517.0] + - - [4096, 8192, 1, 1024, 4096, 4096, 1024, 1024] + - [18, 12171.0] + - - [4096, 9600, 1, 1024, 4096, 4096, 1024, 1024] + - [16, 12122.0] + - - [33712, 8192, 1, 1024, 33712, 33712, 1024, 1024] + - [40, 12418.0] + - - [33712, 9600, 1, 1024, 33712, 33712, 1024, 1024] + - [40, 12242.0] + - - [1024, 10064, 1, 1024, 1024, 1024, 1024, 1024] + - [18, 11879.0] + - - [1024, 10064, 1, 4096, 1024, 1024, 4096, 4096] + - [40, 11786.0] + - - [1024, 10080, 1, 1024, 1024, 1024, 1024, 1024] + - [40, 11909.0] + - - [1024, 10080, 1, 4096, 1024, 1024, 4096, 4096] + - [40, 11976.0] + - - [1024, 6528, 1, 1024, 1024, 1024, 1024, 1024] + - [16, 11590.0] + - - [1024, 6528, 1, 4096, 1024, 1024, 4096, 4096] + - [40, 11288.0] + - - [1024, 7104, 1, 1024, 1024, 1024, 1024, 1024] + - [38, 11502.0] + - - [1024, 7104, 1, 4096, 1024, 1024, 4096, 4096] + - [40, 11243.0] + - - [1024, 8064, 1, 1024, 1024, 1024, 1024, 1024] + - [18, 11684.0] + - - [1024, 8064, 1, 4096, 1024, 1024, 4096, 4096] + - [18, 11786.0] + - - [1024, 9216, 1, 1024, 1024, 1024, 1024, 1024] + - [16, 11911.0] + - - [1024, 9216, 1, 4096, 1024, 1024, 4096, 4096] + - [40, 11667.0] + - - [4096, 10064, 1, 1024, 4096, 4096, 1024, 1024] + - [16, 12023.0] + - - [4096, 10080, 1, 1024, 4096, 4096, 1024, 1024] + - [16, 12049.0] + - - [4096, 6528, 1, 1024, 4096, 4096, 1024, 1024] + - [16, 12157.0] + - - [4096, 7104, 1, 1024, 4096, 4096, 1024, 1024] + - [18, 12043.0] + - - [4096, 8064, 1, 1024, 4096, 4096, 1024, 1024] + - [18, 11987.0] + - - [4096, 9216, 1, 1024, 4096, 4096, 1024, 1024] + - [18, 12185.0] + - - [42720, 10080, 1, 1024, 42720, 42720, 1024, 1024] + - [60, 12239.0] + - - [42720, 6528, 1, 1024, 42720, 42720, 1024, 1024] + - [38, 12206.0] + - - [42720, 7104, 1, 1024, 42720, 42720, 1024, 1024] + - [60, 12313.0] + - - [1024, 32768, 1, 480, 1024, 1024, 480, 480] + - [1, 12400.0] + - - [30592, 1024, 1, 2048, 30592, 30592, 2048, 2048] + - [40, 12317.0] + - - [6144, 1024, 1, 2048, 6144, 6144, 2048, 2048] + - [18, 11825.0] + - - [8192, 1024, 1, 2048, 8192, 8192, 2048, 2048] + - [18, 12136.0] + - - [30592, 8192, 1, 1024, 30592, 30592, 1024, 1024] + - [60, 12407.0] + - - [3072, 8192, 1, 1024, 3072, 3072, 1024, 1024] + - [40, 12121.0] + - - [512, 512, 256, 64, 512, 512, 64, 64] + - [0, 8699.0] + - - [30592, 2048, 1, 1024, 30592, 30592, 1024, 1024] + - [60, 12358.0] + - - [30592, 4096, 1, 1024, 30592, 30592, 1024, 1024] + - [60, 12418.0] + - - [3072, 4096, 1, 1024, 3072, 3072, 1024, 1024] + - [16, 11923.0] + - - [1920, 2048, 1, 2560, 1920, 1920, 2560, 2560] + - [16, 11864.0] + - - [2560, 2048, 1, 2560, 2560, 2560, 2560, 2560] + - [40, 12308.0] + - - [2560, 2048, 1, 640, 2560, 2560, 640, 640] + - [29, 11986.0] + - - [7680, 2048, 1, 2560, 7680, 7680, 2560, 2560] + - [18, 12262.0] + - - [512, 512, 40, 64, 512, 512, 64, 64] + - [11, 10085.0] + - - [1536, 4096, 1, 1536, 1536, 1536, 1536, 1536] + - [40, 11814.0] + - - [1536, 4096, 1, 6144, 1536, 1536, 6144, 6144] + - [40, 11930.0] + - - [4608, 4096, 1, 1536, 4608, 4608, 1536, 1536] + - [40, 12279.0] + - - [50304, 4096, 1, 1536, 50304, 50304, 1536, 1536] + - [40, 12448.0] + - - [6144, 4096, 1, 1536, 6144, 6144, 1536, 1536] + - [18, 12254.0] + - - [1024, 1024, 64, 96, 1024, 1024, 96, 96] + - [16, 11089.0] + - - [1536, 8192, 1, 1536, 1536, 1536, 1536, 1536] + - [16, 11918.0] + - - [1536, 8192, 1, 6144, 1536, 1536, 6144, 6144] + - [58, 11948.0] + - - [4608, 8192, 1, 1536, 4608, 4608, 1536, 1536] + - [40, 12364.0] + - - [50304, 8192, 1, 1536, 50304, 50304, 1536, 1536] + - [51, 12465.0] + - - [6144, 8192, 1, 1536, 6144, 6144, 1536, 1536] + - [7, 12434.0] + - - [1024, 1024, 128, 96, 1024, 1024, 96, 96] + - [16, 11078.0] + - - [1024, 16384, 1, 1024, 1024, 1024, 1024, 1024] + - [40, 12003.0] + - - [1024, 16384, 1, 4096, 1024, 1024, 4096, 4096] + - [40, 12073.0] + - - [3072, 16384, 1, 1024, 3072, 3072, 1024, 1024] + - [60, 12249.0] + - - [4096, 16384, 1, 1024, 4096, 4096, 1024, 1024] + - [18, 12171.0] + - - [50304, 16384, 1, 1024, 50304, 50304, 1024, 1024] + - [51, 12399.0] + - - [1024, 1024, 256, 64, 1024, 1024, 64, 64] + - [0, 8598.0] + - - [50304, 2048, 1, 1024, 50304, 50304, 1024, 1024] + - [60, 12356.0] + - - [1024, 1024, 32, 64, 1024, 1024, 64, 64] + - [0, 10106.0] + - - [50304, 4096, 1, 1024, 50304, 50304, 1024, 1024] + - [60, 12401.0] + - - [1024, 1024, 64, 64, 1024, 1024, 64, 64] + - [0, 10016.0] + - - [50304, 8192, 1, 1024, 50304, 50304, 1024, 1024] + - [60, 11939.0] + - - [1024, 1024, 128, 64, 1024, 1024, 64, 64] + - [26, 6068.0] + - - [30528, 8192, 1, 1024, 30528, 30528, 1024, 1024] + - [60, 11634.0] + - - [128, 128, 1024, 64, 128, 128, 64, 64] + - [57, 6956.0] + - - [1024, 3456, 1, 1024, 1024, 1024, 1024, 1024] + - [38, 10354.0] + - - [1024, 3456, 1, 480, 1024, 1024, 480, 480] + - [5, 11082.0] + - - [512, 3456, 1, 1024, 512, 512, 1024, 1024] + - [18, 10022.0] + - - [512, 3456, 1, 13, 512, 512, 13, 13] + - [6, 1638.0] + - - [512, 4096, 1, 13, 512, 512, 13, 13] + - [0, 2147.0] + - - [512, 6912, 1, 13, 512, 512, 13, 13] + - [16, 2142.0] + - - [30528, 640, 1, 1024, 30528, 30528, 1024, 1024] + - [58, 6159.0] + - - [30528, 1280, 1, 1024, 30528, 30528, 1024, 1024] + - [43, 8380.0] + - - [30528, 1600, 1, 1024, 30528, 30528, 1024, 1024] + - [60, 8829.0] + - - [1024, 10240, 1, 1024, 1024, 1024, 1024, 1024] + - [16, 10244.0] + - - [4096, 10240, 1, 1024, 4096, 4096, 1024, 1024] + - [50, 10048.0] + - - [1024, 10240, 1, 4096, 1024, 1024, 4096, 4096] + - [38, 8614.0] + - - [128, 128, 1280, 64, 128, 128, 64, 64] + - [26, 5162.0] + - - [1024, 10496, 1, 4096, 1024, 1024, 4096, 4096] + - [38, 8493.0] + - - [30528, 1640, 1, 1024, 30528, 30528, 1024, 1024] + - [58, 9036.0] + - - [4096, 10496, 1, 1024, 4096, 4096, 1024, 1024] + - [16, 9708.0] + - - [1024, 10496, 1, 1024, 1024, 1024, 1024, 1024] + - [16, 9851.0] + - - [128, 128, 1312, 64, 128, 128, 64, 64] + - [15, 6886.0] + - - [30528, 160, 1, 1024, 30528, 30528, 1024, 1024] + - [60, 7061.0] + - - [30528, 240, 1, 1024, 30528, 30528, 1024, 1024] + - [60, 6646.0] + - - [1024, 6144, 1, 1024, 1024, 1024, 1024, 1024] + - [16, 10929.0] + - - [4096, 6144, 1, 1024, 4096, 4096, 1024, 1024] + - [58, 10774.0] + - - [1024, 6144, 1, 4096, 1024, 1024, 4096, 4096] + - [40, 7218.0] + - - [512, 512, 192, 64, 512, 512, 64, 64] + - [63, 7486.0] + - - [1024, 10224, 1, 1024, 1024, 1024, 1024, 1024] + - [16, 9735.0] + - - [1024, 10192, 1, 1024, 1024, 1024, 1024, 1024] + - [58, 10656.0] + - - [1024, 10208, 1, 1024, 1024, 1024, 1024, 1024] + - [16, 10065.0] + - - [1024, 10224, 1, 4096, 1024, 1024, 4096, 4096] + - [38, 8787.0] + - - [4096, 10224, 1, 1024, 4096, 4096, 1024, 1024] + - [16, 9839.0] + - - [3072, 10224, 1, 1024, 3072, 3072, 1024, 1024] + - [16, 9380.0] + - - [3072, 10240, 1, 1024, 3072, 3072, 1024, 1024] + - [16, 9390.0] + - - [1024, 10192, 1, 4096, 1024, 1024, 4096, 4096] + - [38, 8419.0] + - - [4096, 10192, 1, 1024, 4096, 4096, 1024, 1024] + - [16, 9823.0] + - - [3072, 10192, 1, 1024, 3072, 3072, 1024, 1024] + - [59, 10539.0] + - - [3072, 10200, 1, 1024, 3072, 3072, 1024, 1024] + - [16, 8891.0] + - - [1024, 10184, 1, 1024, 1024, 1024, 1024, 1024] + - [16, 9922.0] + - - [3072, 10208, 1, 1024, 3072, 3072, 1024, 1024] + - [16, 9420.0] + - - [1024, 10208, 1, 4096, 1024, 1024, 4096, 4096] + - [38, 8613.0] + - - [4096, 10208, 1, 1024, 4096, 4096, 1024, 1024] + - [40, 9763.0] + - - [2048, 10224, 1, 1024, 2048, 2048, 1024, 1024] + - [16, 7680.0] + - - [2048, 10240, 1, 1024, 2048, 2048, 1024, 1024] + - [16, 8065.0] + - - [1024, 10120, 1, 1024, 1024, 1024, 1024, 1024] + - [16, 9951.0] + - - [2048, 10192, 1, 1024, 2048, 2048, 1024, 1024] + - [49, 10139.0] + - - [1024, 10152, 1, 1024, 1024, 1024, 1024, 1024] + - [16, 10198.0] + - - [3072, 10080, 1, 1024, 3072, 3072, 1024, 1024] + - [16, 9227.0] + - - [256, 256, 25, 12544, 256, 256, 12544, 12544] + - [10, 7208.0] + - - [256, 256, 49, 3200, 256, 256, 3200, 3200] + - [60, 10559.0] + - - [256, 256, 25, 6272, 256, 256, 6272, 6272] + - [20, 9652.0] + - - [256, 256, 49, 6400, 256, 256, 6400, 6400] + - [62, 5554.0] + - - [512, 512, 49, 1152, 512, 512, 1152, 1152] + - [27, 11679.0] + - - [512, 512, 25, 2048, 512, 512, 2048, 2048] + - [12, 6240.0] + - - [512, 512, 49, 2304, 512, 512, 2304, 2304] + - [58, 7474.0] + - - [512, 512, 25, 4096, 512, 512, 4096, 4096] + - [9, 8709.0] + - - [128, 128, 2048, 64, 128, 128, 64, 64] + - [19, 5090.0] + - - [30528, 2560, 1, 1024, 30528, 30528, 1024, 1024] + - [60, 9930.0] + - - [128, 128, 1536, 64, 128, 128, 64, 64] + - [8, 5099.0] + - - [1024, 12288, 1, 1024, 1024, 1024, 1024, 1024] + - [58, 10519.0] + - - [1024, 12288, 1, 4096, 1024, 1024, 4096, 4096] + - [20, 9533.0] + - - [30528, 1920, 1, 1024, 30528, 30528, 1024, 1024] + - [58, 11160.0] + - - [4096, 12288, 1, 1024, 4096, 4096, 1024, 1024] + - [16, 10163.0] + - - [128, 128, 81, 12544, 128, 128, 12544, 12544] + - [19, 3973.0] + - - [128, 128, 121, 9216, 128, 128, 9216, 9216] + - [57, 4583.0] + - - [128, 128, 169, 6400, 128, 128, 6400, 6400] + - [42, 4517.0] + - - [256, 256, 36, 4096, 256, 256, 4096, 4096] + - [42, 7283.0] + - - [256, 256, 49, 2304, 256, 256, 2304, 2304] + - [20, 9448.0] + - - [256, 256, 64, 2304, 256, 256, 2304, 2304] + - [20, 9567.0] + - - [256, 256, 81, 4096, 256, 256, 4096, 4096] + - [42, 4984.0] + - - [256, 256, 121, 2304, 256, 256, 2304, 2304] + - [20, 8063.0] + - - [256, 256, 169, 2304, 256, 256, 2304, 2304] + - [38, 6181.0] + - - [512, 512, 81, 1024, 512, 512, 1024, 1024] + - [35, 7519.0] + - - [512, 512, 121, 1024, 512, 512, 1024, 1024] + - [16, 7087.0] + - - [512, 512, 169, 1024, 512, 512, 1024, 1024] + - [39, 7710.0] + - - [512, 512, 36, 1024, 512, 512, 1024, 1024] + - [58, 7617.0] + - - [512, 512, 49, 1024, 512, 512, 1024, 1024] + - [43, 8559.0] + - - [512, 512, 64, 1024, 512, 512, 1024, 1024] + - [17, 7843.0] + - - [128, 128, 192, 64, 128, 128, 64, 64] + - [13, 5013.0] + - - [768, 2048, 1, 768, 768, 768, 768, 768] + - [38, 10060.0] + - - [3072, 2048, 1, 768, 3072, 3072, 768, 768] + - [27, 11092.0] + - - [768, 2048, 1, 3072, 768, 768, 3072, 3072] + - [58, 10698.0] + - - [384, 384, 144, 64, 384, 384, 64, 64] + - [57, 7151.0] + - - [768, 4608, 1, 768, 768, 768, 768, 768] + - [16, 11196.0] + - - [3072, 4608, 1, 768, 3072, 3072, 768, 768] + - [18, 11448.0] + - - [768, 4608, 1, 3072, 768, 768, 3072, 3072] + - [60, 9637.0] + - - [512, 512, 48, 64, 512, 512, 64, 64] + - [15, 7970.0] + - - [128, 128, 256, 64, 128, 128, 64, 64] + - [4, 6129.0] + - - [384, 384, 192, 64, 384, 384, 64, 64] + - [0, 7324.0] + - - [1024, 4608, 1, 1024, 1024, 1024, 1024, 1024] + - [16, 10576.0] + - - [4096, 4608, 1, 1024, 4096, 4096, 1024, 1024] + - [17, 8126.0] + - - [1024, 4608, 1, 4096, 1024, 1024, 4096, 4096] + - [5, 8321.0] + - - [256, 256, 36, 432, 256, 256, 432, 432] + - [45, 8722.0] + - - [256, 256, 36, 456, 256, 256, 456, 456] + - [23, 9337.0] + - - [256, 256, 36, 504, 256, 256, 504, 504] + - [25, 9663.0] + - - [256, 256, 49, 1120, 256, 256, 1120, 1120] + - [18, 11365.0] + - - [256, 256, 36, 442, 256, 256, 442, 442] + - [44, 9979.0] + - - [256, 256, 49, 950, 256, 256, 950, 950] + - [18, 11089.0] + - - [256, 256, 64, 616, 256, 256, 616, 616] + - [5, 10883.0] + - - [256, 256, 64, 660, 256, 256, 660, 660] + - [49, 11005.0] + - - [256, 256, 36, 408, 256, 256, 408, 408] + - [44, 10175.0] + - - [256, 256, 49, 1008, 256, 256, 1008, 1008] + - [60, 11390.0] + - - [256, 256, 36, 462, 256, 256, 462, 462] + - [1, 9971.0] + - - [256, 256, 36, 468, 256, 256, 468, 468] + - [44, 10177.0] + - - [256, 256, 36, 494, 256, 256, 494, 494] + - [44, 10220.0] + - - [512, 512, 64, 48, 512, 512, 48, 48] + - [1, 7596.0] + - - [256, 256, 64, 140, 256, 256, 140, 140] + - [44, 9265.0] + - - [512, 512, 64, 56, 512, 512, 56, 56] + - [17, 7861.0] + - - [512, 512, 49, 90, 512, 512, 90, 90] + - [6, 9543.0] + - - [512, 512, 49, 60, 512, 512, 60, 60] + - [50, 7911.0] + - - [256, 256, 49, 864, 256, 256, 864, 864] + - [51, 11276.0] + - - [256, 256, 64, 224, 256, 256, 224, 224] + - [5, 10710.0] + - - [256, 256, 64, 176, 256, 256, 176, 176] + - [0, 10299.0] + - - [256, 256, 64, 154, 256, 256, 154, 154] + - [28, 9578.0] + - - [512, 512, 49, 80, 512, 512, 80, 80] + - [16, 10484.0] + - - [256, 256, 49, 1200, 256, 256, 1200, 1200] + - [7, 11489.0] + - - [256, 256, 64, 704, 256, 256, 704, 704] + - [5, 11293.0] + - - [256, 256, 64, 768, 256, 256, 768, 768] + - [60, 2353.0] + - - [256, 256, 49, 1160, 256, 256, 1160, 1160] + - [60, 11417.0] + - - [256, 256, 49, 320, 256, 256, 320, 320] + - [1, 11005.0] + - - [512, 512, 49, 70, 512, 512, 70, 70] + - [6, 8306.0] + - - [256, 256, 49, 1240, 256, 256, 1240, 1240] + - [29, 11541.0] + - - [256, 256, 36, 384, 256, 256, 384, 384] + - [5, 9828.0] + - - [1024, 2048, 1, 888, 1024, 1024, 888, 888] + - [1, 10807.0] + - - [1024, 2048, 1, 713, 1024, 1024, 713, 713] + - [12, 10106.0] + - - [1024, 2048, 1, 660, 1024, 1024, 660, 660] + - [23, 10170.0] + - - [1024, 2048, 1, 726, 1024, 1024, 726, 726] + - [1, 10336.0] + - - [1024, 2048, 1, 672, 1024, 1024, 672, 672] + - [1, 10424.0] + - - [1024, 2048, 1, 850, 1024, 1024, 850, 850] + - [1, 10281.0] + - - [1024, 2048, 1, 805, 1024, 1024, 805, 805] + - [1, 10360.0] + - - [1024, 2048, 1, 864, 1024, 1024, 864, 864] + - [23, 10568.0] + - - [1024, 2048, 1, 768, 1024, 1024, 768, 768] + - [16, 9994.0] + - - [1024, 2048, 1, 950, 1024, 1024, 950, 950] + - [23, 10327.0] + - - [1024, 1024, 160, 96, 1024, 1024, 96, 96] + - [7, 6669.0] + - - [2880, 16384, 1, 1920, 2880, 2880, 1920, 1920] + - [36, 11623.0] + - - [1920, 16384, 1, 960, 1920, 1920, 960, 960] + - [54, 12358.0] + - - [3840, 16384, 1, 1920, 3840, 3840, 1920, 1920] + - [40, 11879.0] + - - [1920, 16384, 1, 3840, 1920, 1920, 3840, 3840] + - [5, 11971.0] + - - [25216, 16384, 1, 1920, 25216, 25216, 1920, 1920] + - [40, 12235.0] + - - [1024, 1024, 40, 96, 1024, 1024, 96, 96] + - [16, 10954.0] + - - [2880, 4096, 1, 1920, 2880, 2880, 1920, 1920] + - [23, 10775.0] + - - [1920, 4096, 1, 960, 1920, 1920, 960, 960] + - [23, 12280.0] + - - [3840, 4096, 1, 1920, 3840, 3840, 1920, 1920] + - [18, 12210.0] + - - [1920, 4096, 1, 3840, 1920, 1920, 3840, 3840] + - [60, 8933.0] + - - [25216, 4096, 1, 1920, 25216, 25216, 1920, 1920] + - [40, 11744.0] + - - [1024, 1024, 80, 96, 1024, 1024, 96, 96] + - [16, 10989.0] + - - [2880, 8192, 1, 1920, 2880, 2880, 1920, 1920] + - [16, 11723.0] + - - [1920, 8192, 1, 960, 1920, 1920, 960, 960] + - [49, 9808.0] + - - [3840, 8192, 1, 1920, 3840, 3840, 1920, 1920] + - [40, 11349.0] + - - [1920, 8192, 1, 3840, 1920, 1920, 3840, 3840] + - [40, 11969.0] + - - [25216, 8192, 1, 1920, 25216, 25216, 1920, 1920] + - [40, 12110.0] + - - [1024, 1024, 96, 96, 1024, 1024, 96, 96] + - [15, 8288.0] + - - [1728, 16384, 1, 2304, 1728, 1728, 2304, 2304] + - [40, 10591.0] + - - [2304, 16384, 1, 576, 2304, 2304, 576, 576] + - [25, 10803.0] + - - [2304, 16384, 1, 2304, 2304, 2304, 2304, 2304] + - [7, 11222.0] + - - [12672, 16384, 1, 2304, 12672, 12672, 2304, 2304] + - [60, 12029.0] + - - [1024, 1024, 24, 96, 1024, 1024, 96, 96] + - [16, 9417.0] + - - [1728, 4096, 1, 2304, 1728, 1728, 2304, 2304] + - [38, 10189.0] + - - [2304, 4096, 1, 576, 2304, 2304, 576, 576] + - [12, 12089.0] + - - [2304, 4096, 1, 2304, 2304, 2304, 2304, 2304] + - [40, 9841.0] + - - [12672, 4096, 1, 2304, 12672, 12672, 2304, 2304] + - [60, 11314.0] + - - [1024, 1024, 48, 96, 1024, 1024, 96, 96] + - [16, 10991.0] + - - [1728, 8192, 1, 2304, 1728, 1728, 2304, 2304] + - [58, 9461.0] + - - [2304, 8192, 1, 576, 2304, 2304, 576, 576] + - [34, 12076.0] + - - [2304, 8192, 1, 2304, 2304, 2304, 2304, 2304] + - [29, 11828.0] + - - [12672, 8192, 1, 2304, 12672, 12672, 2304, 2304] + - [60, 11838.0] + - - [1024, 1024, 16, 96, 1024, 1024, 96, 96] + - [11, 10556.0] + - - [1152, 4096, 1, 3072, 1152, 1152, 3072, 3072] + - [27, 9991.0] + - - [3072, 4096, 1, 384, 3072, 3072, 384, 384] + - [16, 11579.0] + - - [1536, 4096, 1, 3072, 1536, 1536, 3072, 3072] + - [62, 8920.0] + - - [3072, 4096, 1, 1536, 3072, 3072, 1536, 1536] + - [40, 9303.0] + - - [6400, 4096, 1, 3072, 6400, 6400, 3072, 3072] + - [38, 10196.0] + - - [1024, 1024, 32, 96, 1024, 1024, 96, 96] + - [16, 10911.0] + - - [1152, 8192, 1, 3072, 1152, 1152, 3072, 3072] + - [38, 8223.0] + - - [3072, 8192, 1, 384, 3072, 3072, 384, 384] + - [5, 11903.0] + - - [1536, 8192, 1, 3072, 1536, 1536, 3072, 3072] + - [38, 8898.0] + - - [3072, 8192, 1, 1536, 3072, 3072, 1536, 1536] + - [58, 9808.0] + - - [6400, 8192, 1, 3072, 6400, 6400, 3072, 3072] + - [16, 11260.0] + - - [2048, 4096, 1, 2048, 2048, 2048, 2048, 2048] + - [18, 10766.0] + - - [2048, 4096, 1, 4096, 2048, 2048, 4096, 4096] + - [58, 6550.0] + - - [29000, 199, 1, 2048, 29000, 29000, 2048, 2048] + - [18, 4026.0] + - - [29000, 221, 1, 2048, 29000, 29000, 2048, 2048] + - [43, 7011.0] + - - [29000, 224, 1, 2048, 29000, 29000, 2048, 2048] + - [60, 7768.0] + - - [29000, 229, 1, 2048, 29000, 29000, 2048, 2048] + - [43, 6895.0] + - - [29000, 234, 1, 2048, 29000, 29000, 2048, 2048] + - [63, 7315.0] + - - [29000, 242, 1, 2048, 29000, 29000, 2048, 2048] + - [43, 7355.0] + - - [29000, 246, 1, 2048, 29000, 29000, 2048, 2048] + - [18, 8926.0] + - - [29000, 247, 1, 2048, 29000, 29000, 2048, 2048] + - [40, 7197.0] + - - [29000, 256, 1, 2048, 29000, 29000, 2048, 2048] + - [21, 7151.0] + - - [29000, 262, 1, 2048, 29000, 29000, 2048, 2048] + - [29, 4902.0] + - - [29000, 264, 1, 2048, 29000, 29000, 2048, 2048] + - [61, 5750.0] + - - [29000, 265, 1, 2048, 29000, 29000, 2048, 2048] + - [59, 4358.0] + - - [29000, 274, 1, 2048, 29000, 29000, 2048, 2048] + - [39, 4343.0] + - - [29000, 277, 1, 2048, 29000, 29000, 2048, 2048] + - [28, 6057.0] + - - [4096, 256, 1, 12288, 4096, 4096, 12288, 12288] + - [63, 3963.0] + - - [2048, 256, 1, 13312, 2048, 2048, 13312, 13312] + - [67, 4118.0] + - - [4096, 256, 1, 15360, 4096, 4096, 15360, 15360] + - [17, 6271.0] + - - [2048, 512, 1, 16640, 2048, 2048, 16640, 16640] + - [64, 6470.0] + - - [4096, 256, 1, 14336, 4096, 4096, 14336, 14336] + - [13, 4465.0] + - - [1024, 1024, 1, 8192, 1024, 1024, 8192, 8192] + - [68, 9863.0] + - - [1024, 512, 1, 16384, 1024, 1024, 16384, 16384] + - [20, 7581.0] + - - [4096, 256, 1, 9216, 4096, 4096, 9216, 9216] + - [79, 3974.0] + - - [1024, 512, 1, 12288, 1024, 1024, 12288, 12288] + - [42, 7410.0] + - - [4096, 200, 1, 12288, 4096, 4096, 12288, 12288] + - [42, 5666.0] + - - [1024, 1024, 1, 13312, 1024, 1024, 13312, 13312] + - [76, 6277.0] + - - [2048, 256, 1, 16384, 2048, 2048, 16384, 16384] + - [77, 6475.0] + - - [2048, 512, 1, 16384, 2048, 2048, 16384, 16384] + - [85, 7119.0] + - - [1024, 1024, 1, 8320, 1024, 1024, 8320, 8320] + - [64, 10759.0] + - - [2048, 256, 1, 14336, 2048, 2048, 14336, 14336] + - [72, 6414.0] + - - [4096, 200, 1, 16640, 4096, 4096, 16640, 16640] + - [14, 5797.0] + - - [1024, 1024, 1, 16640, 1024, 1024, 16640, 16640] + - [48, 7395.0] + - - [1024, 1024, 1, 14336, 1024, 1024, 14336, 14336] + - [66, 6680.0] + - - [2048, 512, 1, 9216, 2048, 2048, 9216, 9216] + - [80, 8187.0] + - - [1024, 1024, 1, 15360, 1024, 1024, 15360, 15360] + - [27, 6865.0] + - - [2048, 512, 1, 8192, 2048, 2048, 8192, 8192] + - [20, 6108.0] + - - [2048, 512, 1, 13312, 2048, 2048, 13312, 13312] + - [71, 8496.0] + - - [1024, 1024, 1, 11264, 1024, 1024, 11264, 11264] + - [30, 6808.0] + - - [1024, 512, 1, 16640, 1024, 1024, 16640, 16640] + - [71, 6863.0] + - - [2048, 512, 1, 10240, 2048, 2048, 10240, 10240] + - [75, 7547.0] + - - [2048, 256, 1, 16640, 2048, 2048, 16640, 16640] + - [76, 8562.0] + - - [4096, 256, 1, 13312, 4096, 4096, 13312, 13312] + - [59, 7121.0] + - - [4096, 200, 1, 15360, 4096, 4096, 15360, 15360] + - [81, 5088.0] + - - [2048, 512, 1, 12288, 2048, 2048, 12288, 12288] + - [83, 6284.0] + - - [4096, 256, 1, 8192, 4096, 4096, 8192, 8192] + - [86, 5539.0] + - - [2048, 512, 1, 15360, 2048, 2048, 15360, 15360] + - [83, 5020.0] + - - [2048, 512, 1, 11264, 2048, 2048, 11264, 11264] + - [72, 8610.0] + - - [2048, 256, 1, 12288, 2048, 2048, 12288, 12288] + - [71, 8378.0] + - - [1024, 1024, 1, 12288, 1024, 1024, 12288, 12288] + - [72, 5843.0] + - - [4096, 256, 1, 16384, 4096, 4096, 16384, 16384] + - [73, 5684.0] + - - [2048, 256, 1, 15360, 2048, 2048, 15360, 15360] + - [87, 6407.0] + - - [2048, 512, 1, 8320, 2048, 2048, 8320, 8320] + - [78, 11130.0] + - - [1024, 1024, 1, 10240, 1024, 1024, 10240, 10240] + - [50, 8055.0] + - - [1024, 1024, 1, 9216, 1024, 1024, 9216, 9216] + - [85, 9267.0] + - - [4096, 200, 1, 16384, 4096, 4096, 16384, 16384] + - [52, 3959.0] + - - [2048, 512, 1, 14336, 2048, 2048, 14336, 14336] + - [72, 5896.0] + - - [1024, 512, 1, 13312, 1024, 1024, 13312, 13312] + - [50, 7702.0] + - - [4096, 256, 1, 8320, 4096, 4096, 8320, 8320] + - [83, 10842.0] + - - [4096, 200, 1, 13312, 4096, 4096, 13312, 13312] + - [59, 5885.0] + - - [1024, 512, 1, 14336, 1024, 1024, 14336, 14336] + - [35, 6629.0] + - - [4096, 256, 1, 11264, 4096, 4096, 11264, 11264] + - [84, 5747.0] + - - [4096, 256, 1, 10240, 4096, 4096, 10240, 10240] + - [82, 4222.0] + - - [4096, 200, 1, 14336, 4096, 4096, 14336, 14336] + - [38, 3515.0] + - - [4096, 256, 1, 16640, 4096, 4096, 16640, 16640] + - [39, 7871.0] + - - [1024, 512, 1, 15360, 1024, 1024, 15360, 15360] + - [70, 6071.0] + - - [1024, 1024, 1, 16384, 1024, 1024, 16384, 16384] + - [69, 7874.0] + - - [224, 192, 36, 10368, 224, 224, 10368, 10368] + - [68, 6055.0] + - - [320, 256, 9, 19584, 320, 320, 19584, 19584] + - [50, 7376.0] + - - [256, 256, 11, 13056, 256, 256, 13056, 13056] + - [80, 8945.0] + - - [320, 256, 9, 9792, 320, 320, 9792, 9792] + - [74, 9376.0] + - - [320, 256, 11, 13056, 320, 320, 13056, 13056] + - [71, 7487.0] + - - [256, 256, 9, 9792, 256, 256, 9792, 9792] + - [64, 10177.0] + - - [256, 224, 9, 19584, 256, 256, 19584, 19584] + - [65, 7703.0] + - - [256, 256, 9, 19584, 256, 256, 19584, 19584] + - [78, 8805.0] + - - [128, 128, 36, 12000, 128, 128, 12000, 12000] + - [38, 9415.0] + - - [128, 128, 49, 12800, 128, 128, 12800, 12800] + - [77, 6415.0] + - - [128, 128, 25, 25088, 128, 128, 25088, 25088] + - [77, 4777.0] + - - [128, 128, 49, 25600, 128, 128, 25600, 25600] + - [87, 4455.0] + - - [128, 128, 25, 50176, 128, 128, 50176, 50176] + - [87, 4520.0] + - - [128, 128, 36, 12544, 128, 128, 12544, 12544] + - [75, 7684.0] + - - [128, 128, 49, 9216, 128, 128, 9216, 9216] + - [87, 5611.0] + - - [1024, 1024, 1, 12544, 1024, 1024, 12544, 12544] + - [16, 9089.0] + - - [1024, 1000, 1, 12544, 1024, 1024, 12544, 12544] + - [18, 7785.0] + - - [1024, 512, 1, 1600, 1024, 1024, 1600, 1600] + - [93, 8105.0] + - - [2048, 512, 1, 100, 2048, 2048, 100, 100] + - [140, 5377.0] + - - [768, 640, 1, 768, 768, 768, 768, 768] + - [113, 8011.0] + - - [768, 1280, 1, 768, 768, 768, 768, 768] + - [136, 7799.0] + - - [1024, 512, 1, 1024, 1024, 1024, 1024, 1024] + - [148, 7241.0] + - - [1024, 512, 1, 3072, 1024, 1024, 3072, 3072] + - [115, 7785.0] + - - [30522, 120, 1, 1024, 30522, 30522, 1024, 1024] + - [115, 9332.0] + - - [30522, 80, 1, 1024, 30522, 30522, 1024, 1024] + - [115, 6417.0] + - - [64, 128, 512, 128, 64, 64, 128, 128] + - [110, 7239.0] + - - [64, 512, 64, 512, 64, 64, 512, 512] + - [114, 6809.0] + - - [64, 64, 768, 64, 64, 64, 64, 64] + - [177, 5265.0] + - - [64, 64, 96, 64, 64, 64, 64, 64] + - [92, 4072.0] + - - [1856, 448, 1, 3328, 1856, 1856, 3328, 3328] + - [132, 8582.0] + - - [128, 6784, 1, 3328, 128, 128, 3328, 3328] + - [115, 8963.0] + - - [2048, 400, 1, 512, 2048, 2048, 512, 512] + - [172, 6359.0] + - - [2368, 448, 1, 128, 2368, 2368, 128, 128] + - [140, 6516.0] + - - [256, 4288, 1, 3328, 256, 256, 3328, 3328] + - [99, 8585.0] + - - [704, 1856, 1, 3328, 704, 704, 3328, 3328] + - [113, 8271.0] + - - [448, 1024, 1, 1280, 448, 448, 1280, 1280] + - [148, 6533.0] + - - [256, 1408, 1, 3328, 256, 256, 3328, 3328] + - [146, 6688.0] + - - [704, 1856, 1, 1280, 704, 704, 1280, 1280] + - [115, 8204.0] + - - [128, 5056, 1, 128, 128, 128, 128, 128] + - [140, 5809.0] + - - [2368, 128, 1, 256, 2368, 2368, 256, 256] + - [173, 4936.0] + - - [64, 5056, 1, 256, 64, 64, 256, 256] + - [110, 4972.0] + - - [256, 2944, 1, 256, 256, 256, 256, 256] + - [148, 4555.0] + - - [256, 1856, 1, 1280, 256, 256, 1280, 1280] + - [146, 8157.0] + - - [128, 3584, 1, 1280, 128, 128, 1280, 1280] + - [146, 7991.0] + - - [4288, 256, 1, 256, 4288, 4288, 256, 256] + - [109, 7763.0] + - - [2944, 128, 1, 128, 2944, 2944, 128, 128] + - [139, 4158.0] + - - [5888, 64, 1, 3328, 5888, 5888, 3328, 3328] + - [151, 5194.0] + - - [2944, 256, 1, 3328, 2944, 2944, 3328, 3328] + - [113, 8119.0] + - - [704, 1024, 1, 128, 704, 704, 128, 128] + - [171, 5402.0] + - - [1408, 448, 1, 1280, 1408, 1408, 1280, 1280] + - [113, 8288.0] + - - [1408, 704, 1, 3328, 1408, 1408, 3328, 3328] + - [113, 8076.0] + - - [1408, 256, 1, 1280, 1408, 1408, 1280, 1280] + - [97, 5587.0] + - - [3072, 128, 1, 1024, 3072, 3072, 1024, 1024] + - [178, 6666.0] + - - [2944, 256, 1, 256, 2944, 2944, 256, 256] + - [178, 6713.0] + - - [704, 1408, 1, 3328, 704, 704, 3328, 3328] + - [113, 7874.0] + - - [2944, 256, 1, 128, 2944, 2944, 128, 128] + - [172, 5882.0] + - - [2368, 128, 1, 3328, 2368, 2368, 3328, 3328] + - [113, 7473.0] + - - [2944, 128, 1, 256, 2944, 2944, 256, 256] + - [145, 5538.0] + - - [448, 1408, 1, 256, 448, 448, 256, 256] + - [172, 5859.0] + - - [64, 5056, 1, 3328, 64, 64, 3328, 3328] + - [144, 7371.0] + - - [1024, 448, 1, 128, 1024, 1024, 128, 128] + - [169, 4559.0] + - - [256, 3584, 1, 3328, 256, 256, 3328, 3328] + - [115, 9498.0] + - - [256, 1408, 1, 256, 256, 256, 256, 256] + - [172, 4882.0] + - - [5056, 64, 1, 1280, 5056, 5056, 1280, 1280] + - [110, 7117.0] + - - [1024, 704, 1, 256, 1024, 1024, 256, 256] + - [146, 5889.0] + - - [128, 4288, 1, 128, 128, 128, 128, 128] + - [108, 3789.0] + - - [6784, 64, 1, 128, 6784, 6784, 128, 128] + - [172, 4662.0] + - - [3584, 256, 1, 128, 3584, 3584, 128, 128] + - [140, 6703.0] + - - [5888, 64, 1, 256, 5888, 5888, 256, 256] + - [113, 4993.0] + - - [1856, 256, 1, 1280, 1856, 1856, 1280, 1280] + - [113, 6679.0] + - - [64, 5888, 1, 3328, 64, 64, 3328, 3328] + - [112, 4387.0] + - - [704, 1024, 1, 1280, 704, 704, 1280, 1280] + - [178, 7245.0] + - - [448, 1856, 1, 128, 448, 448, 128, 128] + - [172, 6047.0] + - - [1024, 704, 1, 1280, 1024, 1024, 1280, 1280] + - [113, 7044.0] + - - [128, 5888, 1, 256, 128, 128, 256, 256] + - [180, 4460.0] + - - [704, 704, 1, 3328, 704, 704, 3328, 3328] + - [115, 7414.0] + - - [704, 1408, 1, 1280, 704, 704, 1280, 1280] + - [146, 7969.0] + - - [3584, 256, 1, 3328, 3584, 3584, 3328, 3328] + - [115, 9262.0] + - - [704, 1856, 1, 128, 704, 704, 128, 128] + - [144, 5973.0] + - - [2944, 448, 1, 128, 2944, 2944, 128, 128] + - [140, 7052.0] + - - [128, 2944, 1, 1280, 128, 128, 1280, 1280] + - [178, 6620.0] + - - [448, 2944, 1, 1280, 448, 448, 1280, 1280] + - [100, 8191.0] + - - [3584, 128, 1, 256, 3584, 3584, 256, 256] + - [175, 5576.0] + - - [448, 1408, 1, 3328, 448, 448, 3328, 3328] + - [110, 8303.0] + - - [256, 3584, 1, 256, 256, 256, 256, 256] + - [148, 7358.0] + - - [256, 2944, 1, 3328, 256, 256, 3328, 3328] + - [113, 8459.0] + - - [448, 2368, 1, 128, 448, 448, 128, 128] + - [139, 5620.0] + - - [1408, 704, 1, 256, 1408, 1408, 256, 256] + - [174, 6392.0] + - - [448, 2944, 1, 3328, 448, 448, 3328, 3328] + - [115, 8325.0] + - - [64, 5888, 1, 256, 64, 64, 256, 256] + - [177, 4213.0] + - - [6784, 128, 1, 3328, 6784, 6784, 3328, 3328] + - [100, 8845.0] + - - [704, 704, 1, 256, 704, 704, 256, 256] + - [162, 4682.0] + - - [128, 4288, 1, 3328, 128, 128, 3328, 3328] + - [143, 7811.0] + - - [448, 704, 1, 1280, 448, 448, 1280, 1280] + - [175, 6877.0] + - - [128, 5056, 1, 1280, 128, 128, 1280, 1280] + - [115, 8683.0] + - - [1024, 448, 1, 3328, 1024, 1024, 3328, 3328] + - [146, 8422.0] + - - [1856, 704, 1, 1280, 1856, 1856, 1280, 1280] + - [180, 8152.0] + - - [448, 1024, 1, 128, 448, 448, 128, 128] + - [127, 5097.0] + - - [448, 2368, 1, 3328, 448, 448, 3328, 3328] + - [132, 8335.0] + - - [5056, 64, 1, 128, 5056, 5056, 128, 128] + - [92, 4226.0] + - - [1024, 700, 1, 512, 1024, 1024, 512, 512] + - [146, 6191.0] + - - [704, 1024, 1, 256, 704, 704, 256, 256] + - [110, 5938.0] + - - [128, 6784, 1, 1280, 128, 128, 1280, 1280] + - [148, 8777.0] + - - [1856, 256, 1, 256, 1856, 1856, 256, 256] + - [173, 6231.0] + - - [256, 4288, 1, 1280, 256, 256, 1280, 1280] + - [178, 8799.0] + - - [256, 1856, 1, 128, 256, 256, 128, 128] + - [180, 3673.0] + - - [7680, 64, 1, 2560, 7680, 7680, 2560, 2560] + - [138, 4895.0] + - - [448, 1408, 1, 128, 448, 448, 128, 128] + - [158, 4899.0] + - - [6784, 128, 1, 256, 6784, 6784, 256, 256] + - [180, 7420.0] + - - [704, 448, 1, 256, 704, 704, 256, 256] + - [110, 4823.0] + - - [704, 1408, 1, 128, 704, 704, 128, 128] + - [172, 5726.0] + - - [4288, 128, 1, 1280, 4288, 4288, 1280, 1280] + - [174, 7913.0] + - - [128, 2944, 1, 128, 128, 128, 128, 128] + - [140, 3665.0] + - - [1024, 704, 1, 3328, 1024, 1024, 3328, 3328] + - [113, 7729.0] + - - [128, 4288, 1, 256, 128, 128, 256, 256] + - [108, 6168.0] + - - [704, 448, 1, 3328, 704, 704, 3328, 3328] + - [175, 7441.0] + - - [448, 2368, 1, 1280, 448, 448, 1280, 1280] + - [115, 8142.0] + - - [64, 6784, 1, 3328, 64, 64, 3328, 3328] + - [175, 7067.0] + - - [2944, 256, 1, 1280, 2944, 2944, 1280, 1280] + - [146, 7804.0] + - - [256, 2368, 1, 128, 256, 256, 128, 128] + - [93, 5187.0] + - - [1856, 704, 1, 256, 1856, 1856, 256, 256] + - [143, 6793.0] + - - [1408, 448, 1, 3328, 1408, 1408, 3328, 3328] + - [113, 8827.0] + - - [1856, 448, 1, 1280, 1856, 1856, 1280, 1280] + - [143, 8564.0] + - - [128, 5888, 1, 128, 128, 128, 128, 128] + - [172, 4572.0] + - - [704, 1856, 1, 256, 704, 704, 256, 256] + - [174, 7400.0] + - - [256, 2368, 1, 1280, 256, 256, 1280, 1280] + - [109, 8709.0] + - - [2944, 448, 1, 256, 2944, 2944, 256, 256] + - [113, 7503.0] + - - [1856, 448, 1, 128, 1856, 1856, 128, 128] + - [143, 5492.0] + - - [2368, 128, 1, 1280, 2368, 2368, 1280, 1280] + - [148, 7365.0] + - - [64, 6784, 1, 256, 64, 64, 256, 256] + - [175, 2579.0] + - - [64, 5056, 1, 1280, 64, 64, 1280, 1280] + - [110, 7087.0] + - - [2368, 256, 1, 1280, 2368, 2368, 1280, 1280] + - [174, 8544.0] + - - [2368, 448, 1, 1280, 2368, 2368, 1280, 1280] + - [113, 8481.0] + - - [128, 3584, 1, 256, 128, 128, 256, 256] + - [178, 4382.0] + - - [704, 448, 1, 1280, 704, 704, 1280, 1280] + - [175, 6915.0] + - - [128, 3584, 1, 3328, 128, 128, 3328, 3328] + - [113, 8420.0] + - - [4288, 256, 1, 1280, 4288, 4288, 1280, 1280] + - [109, 8123.0] + - - [4288, 128, 1, 3328, 4288, 4288, 3328, 3328] + - [143, 7760.0] + - - [7680, 128, 1, 2560, 7680, 7680, 2560, 2560] + - [180, 6210.0] + - - [1408, 256, 1, 128, 1408, 1408, 128, 128] + - [139, 3813.0] + - - [256, 1408, 1, 1280, 256, 256, 1280, 1280] + - [113, 6276.0] + - - [128, 2368, 1, 256, 128, 128, 256, 256] + - [113, 5488.0] + - - [6784, 64, 1, 3328, 6784, 6784, 3328, 3328] + - [113, 7608.0] + - - [128, 2944, 1, 3328, 128, 128, 3328, 3328] + - [163, 6705.0] + - - [2944, 448, 1, 3328, 2944, 2944, 3328, 3328] + - [178, 8265.0] + - - [256, 4288, 1, 256, 256, 256, 256, 256] + - [180, 6935.0] + - - [5888, 128, 1, 256, 5888, 5888, 256, 256] + - [180, 6536.0] + - - [5056, 64, 1, 256, 5056, 5056, 256, 256] + - [110, 4990.0] + - - [1024, 704, 1, 128, 1024, 1024, 128, 128] + - [108, 6980.0] + - - [128, 5056, 1, 3328, 128, 128, 3328, 3328] + - [180, 9021.0] + - - [4288, 128, 1, 256, 4288, 4288, 256, 256] + - [142, 4595.0] + - - [1408, 448, 1, 128, 1408, 1408, 128, 128] + - [140, 5902.0] + - - [704, 448, 1, 128, 704, 704, 128, 128] + - [141, 4205.0] + - - [3584, 256, 1, 256, 3584, 3584, 256, 256] + - [132, 8099.0] + - - [128, 2944, 1, 256, 128, 128, 256, 256] + - [146, 4762.0] + - - [128, 6784, 1, 128, 128, 128, 128, 128] + - [174, 6632.0] + - - [448, 1856, 1, 256, 448, 448, 256, 256] + - [175, 4779.0] + - - [3584, 128, 1, 3328, 3584, 3584, 3328, 3328] + - [146, 8478.0] + - - [5888, 128, 1, 3328, 5888, 5888, 3328, 3328] + - [151, 6569.0] + - - [1408, 704, 1, 1280, 1408, 1408, 1280, 1280] + - [146, 7967.0] + - - [6784, 64, 1, 256, 6784, 6784, 256, 256] + - [178, 5497.0] + - - [448, 2944, 1, 256, 448, 448, 256, 256] + - [148, 7221.0] + - - [448, 2368, 1, 256, 448, 448, 256, 256] + - [180, 7449.0] + - - [64, 6784, 1, 1280, 64, 64, 1280, 1280] + - [110, 7101.0] + - - [128, 2368, 1, 3328, 128, 128, 3328, 3328] + - [148, 7970.0] + - - [5056, 64, 1, 3328, 5056, 5056, 3328, 3328] + - [110, 7180.0] + - - [64, 5888, 1, 128, 64, 64, 128, 128] + - [179, 2722.0] + - - [5056, 128, 1, 3328, 5056, 5056, 3328, 3328] + - [115, 9214.0] + - - [448, 704, 1, 256, 448, 448, 256, 256] + - [175, 4603.0] + - - [2944, 128, 1, 3328, 2944, 2944, 3328, 3328] + - [113, 6954.0] + - - [704, 704, 1, 128, 704, 704, 128, 128] + - [156, 4571.0] + - - [2368, 128, 1, 128, 2368, 2368, 128, 128] + - [171, 3967.0] + - - [5056, 128, 1, 128, 5056, 5056, 128, 128] + - [108, 5643.0] + - - [448, 1024, 1, 3328, 448, 448, 3328, 3328] + - [110, 7868.0] + - - [2368, 256, 1, 256, 2368, 2368, 256, 256] + - [143, 6771.0] + - - [256, 2368, 1, 3328, 256, 256, 3328, 3328] + - [115, 9005.0] + - - [256, 3584, 1, 128, 256, 256, 128, 128] + - [140, 6267.0] + - - [4288, 256, 1, 128, 4288, 4288, 128, 128] + - [159, 6295.0] + - - [2368, 256, 1, 128, 2368, 2368, 128, 128] + - [169, 5344.0] + - - [256, 1856, 1, 256, 256, 256, 256, 256] + - [113, 6162.0] + - - [256, 2944, 1, 128, 256, 256, 128, 128] + - [140, 4771.0] + - - [1408, 256, 1, 3328, 1408, 1408, 3328, 3328] + - [146, 6658.0] + - - [2368, 448, 1, 256, 2368, 2368, 256, 256] + - [143, 7087.0] + - - [4288, 256, 1, 3328, 4288, 4288, 3328, 3328] + - [115, 8180.0] + - - [1856, 704, 1, 128, 1856, 1856, 128, 128] + - [174, 6462.0] + - - [4288, 128, 1, 128, 4288, 4288, 128, 128] + - [107, 5259.0] + - - [1408, 448, 1, 256, 1408, 1408, 256, 256] + - [146, 5615.0] + - - [6784, 64, 1, 1280, 6784, 6784, 1280, 1280] + - [146, 7458.0] + - - [3584, 128, 1, 128, 3584, 3584, 128, 128] + - [140, 4902.0] + - - [256, 2368, 1, 256, 256, 256, 256, 256] + - [146, 4844.0] + - - [2944, 448, 1, 1280, 2944, 2944, 1280, 1280] + - [174, 7940.0] + - - [448, 1408, 1, 1280, 448, 448, 1280, 1280] + - [110, 7407.0] + - - [448, 1856, 1, 1280, 448, 448, 1280, 1280] + - [148, 8298.0] + - - [1856, 256, 1, 128, 1856, 1856, 128, 128] + - [169, 5001.0] + - - [2560, 128, 1, 2560, 2560, 2560, 2560, 2560] + - [180, 7373.0] + - - [448, 1024, 1, 256, 448, 448, 256, 256] + - [110, 5837.0] + - - [1024, 448, 1, 1280, 1024, 1024, 1280, 1280] + - [113, 7955.0] + - - [128, 5056, 1, 256, 128, 128, 256, 256] + - [140, 6863.0] + - - [448, 2944, 1, 128, 448, 448, 128, 128] + - [179, 5128.0] + - - [128, 3584, 1, 128, 128, 128, 128, 128] + - [146, 4016.0] + - - [1408, 256, 1, 256, 1408, 1408, 256, 256] + - [113, 4708.0] + - - [128, 5888, 1, 3328, 128, 128, 3328, 3328] + - [163, 5879.0] + - - [2368, 448, 1, 3328, 2368, 2368, 3328, 3328] + - [113, 8549.0] + - - [128, 5888, 1, 1280, 128, 128, 1280, 1280] + - [178, 7365.0] + - - [64, 5056, 1, 128, 64, 64, 128, 128] + - [139, 4907.0] + - - [64, 6784, 1, 128, 64, 64, 128, 128] + - [169, 4185.0] + - - [448, 704, 1, 128, 448, 448, 128, 128] + - [156, 4086.0] + - - [1408, 704, 1, 128, 1408, 1408, 128, 128] + - [172, 6770.0] + - - [2368, 256, 1, 3328, 2368, 2368, 3328, 3328] + - [174, 8520.0] + - - [5888, 128, 1, 1280, 5888, 5888, 1280, 1280] + - [113, 8066.0] + - - [256, 3584, 1, 1280, 256, 256, 1280, 1280] + - [115, 9259.0] + - - [256, 1408, 1, 128, 256, 256, 128, 128] + - [124, 4377.0] + - - [256, 4288, 1, 128, 256, 256, 128, 128] + - [157, 4613.0] + - - [5888, 128, 1, 128, 5888, 5888, 128, 128] + - [140, 5444.0] + - - [1856, 256, 1, 3328, 1856, 1856, 3328, 3328] + - [113, 8606.0] + - - [64, 5888, 1, 1280, 64, 64, 1280, 1280] + - [175, 6267.0] + - - [704, 704, 1, 1280, 704, 704, 1280, 1280] + - [143, 7149.0] + - - [128, 2368, 1, 1280, 128, 128, 1280, 1280] + - [115, 7354.0] + - - [3584, 256, 1, 1280, 3584, 3584, 1280, 1280] + - [148, 9243.0] + - - [5888, 64, 1, 1280, 5888, 5888, 1280, 1280] + - [146, 6564.0] + - - [3584, 128, 1, 1280, 3584, 3584, 1280, 1280] + - [113, 6668.0] + - - [5056, 128, 1, 1280, 5056, 5056, 1280, 1280] + - [113, 8259.0] + - - [448, 1856, 1, 3328, 448, 448, 3328, 3328] + - [100, 8593.0] + - - [1024, 448, 1, 256, 1024, 1024, 256, 256] + - [178, 5854.0] + - - [2944, 128, 1, 1280, 2944, 2944, 1280, 1280] + - [113, 6566.0] + - - [128, 2368, 1, 128, 128, 128, 128, 128] + - [161, 4000.0] + - - [256, 2944, 1, 1280, 256, 256, 1280, 1280] + - [113, 8171.0] + - - [704, 1024, 1, 3328, 704, 704, 3328, 3328] + - [143, 7792.0] + - - [128, 6784, 1, 256, 128, 128, 256, 256] + - [180, 6964.0] + - - [256, 1856, 1, 3328, 256, 256, 3328, 3328] + - [113, 8635.0] + - - [6784, 128, 1, 128, 6784, 6784, 128, 128] + - [143, 6196.0] + - - [704, 1408, 1, 256, 704, 704, 256, 256] + - [174, 6814.0] + - - [4096, 128, 1, 4096, 4096, 4096, 4096, 4096] + - [180, 7361.0] + - - [5888, 64, 1, 128, 5888, 5888, 128, 128] + - [177, 3986.0] + - - [5056, 128, 1, 256, 5056, 5056, 256, 256] + - [174, 7147.0] + - - [6784, 128, 1, 1280, 6784, 6784, 1280, 1280] + - [109, 8824.0] + - - [1856, 448, 1, 256, 1856, 1856, 256, 256] + - [171, 5896.0] + - - [128, 4288, 1, 1280, 128, 128, 1280, 1280] + - [109, 7903.0] + - - [448, 704, 1, 3328, 448, 448, 3328, 3328] + - [110, 7586.0] + - - [1856, 704, 1, 3328, 1856, 1856, 3328, 3328] + - [113, 8238.0] + - - [1024, 1024, 1, 3328, 1024, 1024, 3328, 3328] + - [113, 8423.0] + - - [2048, 200, 1, 3200, 2048, 2048, 3200, 3200] + - [159, 6188.0] + - - [2048, 256, 1, 3328, 2048, 2048, 3328, 3328] + - [100, 7793.0] + - - [4096, 200, 1, 11264, 4096, 4096, 11264, 11264] + - [180, 3154.0] + - - [2048, 512, 1, 1024, 2048, 2048, 1024, 1024] + - [172, 7648.0] + - - [1024, 1024, 1, 64, 1024, 1024, 64, 64] + - [157, 4942.0] + - - [512, 1024, 1, 1536, 512, 512, 1536, 1536] + - [109, 7687.0] + - - [1024, 512, 1, 512, 1024, 1024, 512, 512] + - [174, 6674.0] + - - [2048, 512, 1, 640, 2048, 2048, 640, 640] + - [131, 8196.0] + - - [1024, 1024, 1, 512, 1024, 1024, 512, 512] + - [113, 8049.0] + - - [2048, 256, 1, 2048, 2048, 2048, 2048, 2048] + - [109, 7384.0] + - - [1024, 512, 1, 128, 1024, 1024, 128, 128] + - [143, 5053.0] + - - [2048, 512, 1, 256, 2048, 2048, 256, 256] + - [109, 7566.0] + - - [4096, 200, 1, 2560, 4096, 4096, 2560, 2560] + - [143, 6488.0] + - - [1024, 1024, 1, 1152, 1024, 1024, 1152, 1152] + - [178, 8278.0] + - - [2048, 200, 1, 32, 2048, 2048, 32, 32] + - [170, 1928.0] + - - [512, 1024, 1, 2816, 512, 512, 2816, 2816] + - [100, 7739.0] + - - [2048, 200, 1, 2080, 2048, 2048, 2080, 2080] + - [159, 6553.0] + - - [2048, 200, 1, 1024, 2048, 2048, 1024, 1024] + - [143, 5133.0] + - - [4096, 200, 1, 4096, 4096, 4096, 4096, 4096] + - [109, 6386.0] + - - [1024, 512, 1, 11264, 1024, 1024, 11264, 11264] + - [180, 7685.0] + - - [1024, 1024, 1, 1792, 1024, 1024, 1792, 1792] + - [165, 8218.0] + - - [4096, 200, 1, 768, 4096, 4096, 768, 768] + - [174, 6335.0] + - - [4096, 256, 1, 1024, 4096, 4096, 1024, 1024] + - [113, 8243.0] + - - [1024, 512, 1, 256, 1024, 1024, 256, 256] + - [174, 5965.0] + - - [1024, 512, 1, 1408, 1024, 1024, 1408, 1408] + - [109, 7885.0] + - - [1024, 512, 1, 5632, 1024, 1024, 5632, 5632] + - [180, 8038.0] + - - [4096, 200, 1, 256, 4096, 4096, 256, 256] + - [180, 4702.0] + - - [512, 1024, 1, 3072, 512, 512, 3072, 3072] + - [180, 7883.0] + - - [1024, 1024, 1, 4160, 1024, 1024, 4160, 4160] + - [95, 8741.0] + - - [2048, 256, 1, 384, 2048, 2048, 384, 384] + - [159, 6834.0] + - - [4096, 200, 1, 640, 4096, 4096, 640, 640] + - [109, 6471.0] + - - [1024, 1024, 1, 7168, 1024, 1024, 7168, 7168] + - [178, 8352.0] + - - [4096, 256, 1, 768, 4096, 4096, 768, 768] + - [109, 7470.0] + - - [2048, 256, 1, 6656, 2048, 2048, 6656, 6656] + - [115, 7795.0] + - - [2048, 200, 1, 3072, 2048, 2048, 3072, 3072] + - [109, 5802.0] + - - [1024, 512, 1, 2816, 1024, 1024, 2816, 2816] + - [109, 8019.0] + - - [4096, 256, 1, 7680, 4096, 4096, 7680, 7680] + - [148, 7061.0] + - - [4096, 200, 1, 1024, 4096, 4096, 1024, 1024] + - [113, 6559.0] + - - [2048, 200, 1, 1792, 2048, 2048, 1792, 1792] + - [174, 6113.0] + - - [1024, 1024, 1, 2816, 1024, 1024, 2816, 2816] + - [113, 8306.0] + - - [2048, 512, 1, 1536, 2048, 2048, 1536, 1536] + - [165, 8039.0] + - - [4096, 256, 1, 3072, 4096, 4096, 3072, 3072] + - [178, 8332.0] + - - [2048, 256, 1, 5632, 2048, 2048, 5632, 5632] + - [115, 7745.0] + - - [1024, 512, 1, 6656, 1024, 1024, 6656, 6656] + - [115, 7810.0] + - - [4096, 200, 1, 2080, 4096, 4096, 2080, 2080] + - [125, 7300.0] + - - [2048, 200, 1, 13312, 2048, 2048, 13312, 13312] + - [118, 4538.0] + - - [4096, 256, 1, 3584, 4096, 4096, 3584, 3584] + - [115, 7425.0] + - - [2048, 256, 1, 8192, 2048, 2048, 8192, 8192] + - [104, 6424.0] + - - [2048, 512, 1, 512, 2048, 2048, 512, 512] + - [172, 7401.0] + - - [2048, 512, 1, 1152, 2048, 2048, 1152, 1152] + - [113, 8347.0] + - - [2048, 200, 1, 9216, 2048, 2048, 9216, 9216] + - [120, 4129.0] + - - [2048, 200, 1, 2560, 2048, 2048, 2560, 2560] + - [109, 5781.0] + - - [2048, 256, 1, 4608, 2048, 2048, 4608, 4608] + - [109, 7803.0] + - - [2048, 256, 1, 3584, 2048, 2048, 3584, 3584] + - [115, 7533.0] + - - [1024, 512, 1, 640, 1024, 1024, 640, 640] + - [109, 7207.0] + - - [2048, 512, 1, 768, 2048, 2048, 768, 768] + - [100, 7860.0] + - - [2048, 200, 1, 1408, 2048, 2048, 1408, 1408] + - [174, 6087.0] + - - [4096, 200, 1, 2048, 4096, 4096, 2048, 2048] + - [143, 6366.0] + - - [1024, 1024, 1, 5632, 1024, 1024, 5632, 5632] + - [113, 8654.0] + - - [2048, 512, 1, 3584, 2048, 2048, 3584, 3584] + - [113, 8439.0] + - - [1024, 512, 1, 64, 1024, 1024, 64, 64] + - [174, 3517.0] + - - [4096, 200, 1, 7680, 4096, 4096, 7680, 7680] + - [115, 6341.0] + - - [1024, 1024, 1, 1280, 1024, 1024, 1280, 1280] + - [113, 7793.0] + - - [2048, 200, 1, 896, 2048, 2048, 896, 896] + - [143, 5881.0] + - - [2048, 256, 1, 32, 2048, 2048, 32, 32] + - [121, 2363.0] + - - [2048, 256, 1, 1280, 2048, 2048, 1280, 1280] + - [115, 7544.0] + - - [4096, 256, 1, 4096, 4096, 4096, 4096, 4096] + - [148, 7844.0] + - - [2048, 256, 1, 11264, 2048, 2048, 11264, 11264] + - [113, 6935.0] + - - [4096, 200, 1, 9216, 4096, 4096, 9216, 9216] + - [115, 6235.0] + - - [1024, 512, 1, 4096, 1024, 1024, 4096, 4096] + - [115, 7482.0] + - - [4096, 200, 1, 3840, 4096, 4096, 3840, 3840] + - [180, 6284.0] + - - [1024, 1024, 1, 1920, 1024, 1024, 1920, 1920] + - [143, 8201.0] + - - [2048, 200, 1, 7168, 2048, 2048, 7168, 7168] + - [115, 5458.0] + - - [4096, 256, 1, 1152, 4096, 4096, 1152, 1152] + - [143, 7984.0] + - - [2048, 256, 1, 1920, 2048, 2048, 1920, 1920] + - [95, 7776.0] + - - [2048, 512, 1, 4160, 2048, 2048, 4160, 4160] + - [95, 8759.0] + - - [2048, 512, 1, 5632, 2048, 2048, 5632, 5632] + - [113, 8659.0] + - - [4096, 256, 1, 7168, 4096, 4096, 7168, 7168] + - [115, 7696.0] + - - [4096, 200, 1, 128, 4096, 4096, 128, 128] + - [171, 3603.0] + - - [2048, 200, 1, 5120, 2048, 2048, 5120, 5120] + - [115, 6140.0] + - - [1024, 1024, 1, 6656, 1024, 1024, 6656, 6656] + - [113, 8643.0] + - - [512, 1024, 1, 3200, 512, 512, 3200, 3200] + - [128, 7998.0] + - - [2048, 256, 1, 1536, 2048, 2048, 1536, 1536] + - [109, 7674.0] + - - [4096, 256, 1, 256, 4096, 4096, 256, 256] + - [143, 7354.0] + - - [2048, 512, 1, 1408, 2048, 2048, 1408, 1408] + - [163, 8595.0] + - - [1024, 512, 1, 2080, 1024, 1024, 2080, 2080] + - [93, 8230.0] + - - [2048, 512, 1, 2304, 2048, 2048, 2304, 2304] + - [113, 8563.0] + - - [4096, 200, 1, 512, 4096, 4096, 512, 512] + - [140, 6419.0] + - - [2048, 200, 1, 1280, 2048, 2048, 1280, 1280] + - [109, 5532.0] + - - [1024, 1024, 1, 2304, 1024, 1024, 2304, 2304] + - [113, 8585.0] + - - [2048, 512, 1, 4608, 2048, 2048, 4608, 4608] + - [113, 8545.0] + - - [4096, 256, 1, 6144, 4096, 4096, 6144, 6144] + - [120, 7031.0] + - - [4096, 256, 1, 896, 4096, 4096, 896, 896] + - [159, 7914.0] + - - [2048, 256, 1, 640, 2048, 2048, 640, 640] + - [174, 7633.0] + - - [2048, 512, 1, 384, 2048, 2048, 384, 384] + - [174, 7415.0] + - - [2048, 200, 1, 16384, 2048, 2048, 16384, 16384] + - [120, 5569.0] + - - [4096, 200, 1, 10240, 4096, 4096, 10240, 10240] + - [115, 4806.0] + - - [1024, 512, 1, 9216, 1024, 1024, 9216, 9216] + - [148, 7855.0] + - - [4096, 200, 1, 1920, 4096, 4096, 1920, 1920] + - [143, 6515.0] + - - [2048, 512, 1, 7680, 2048, 2048, 7680, 7680] + - [115, 7383.0] + - - [1024, 512, 1, 3584, 1024, 1024, 3584, 3584] + - [109, 7646.0] + - - [1024, 1024, 1, 32, 1024, 1024, 32, 32] + - [183, 2289.0] + - - [2048, 512, 1, 1664, 2048, 2048, 1664, 1664] + - [131, 8377.0] + - - [2048, 200, 1, 2048, 2048, 2048, 2048, 2048] + - [143, 5724.0] + - - [1024, 1024, 1, 3584, 1024, 1024, 3584, 3584] + - [113, 8729.0] + - - [4096, 256, 1, 6656, 4096, 4096, 6656, 6656] + - [115, 7019.0] + - - [4096, 256, 1, 4160, 4096, 4096, 4160, 4160] + - [143, 8819.0] + - - [2048, 256, 1, 3072, 2048, 2048, 3072, 3072] + - [115, 7903.0] + - - [2048, 256, 1, 8320, 2048, 2048, 8320, 8320] + - [159, 8050.0] + - - [1024, 512, 1, 3200, 1024, 1024, 3200, 3200] + - [128, 7985.0] + - - [1024, 512, 1, 896, 1024, 1024, 896, 896] + - [165, 7587.0] + - - [2048, 512, 1, 1280, 2048, 2048, 1280, 1280] + - [146, 8448.0] + - - [4096, 200, 1, 64, 4096, 4096, 64, 64] + - [153, 3369.0] + - - [1024, 1024, 1, 5120, 1024, 1024, 5120, 5120] + - [178, 8268.0] + - - [2048, 512, 1, 6656, 2048, 2048, 6656, 6656] + - [113, 8769.0] + - - [1024, 1024, 1, 128, 1024, 1024, 128, 128] + - [178, 5716.0] + - - [512, 1024, 1, 1792, 512, 512, 1792, 1792] + - [109, 7832.0] + - - [4096, 256, 1, 2816, 4096, 4096, 2816, 2816] + - [146, 8309.0] + - - [1024, 1024, 1, 4096, 1024, 1024, 4096, 4096] + - [115, 8336.0] + - - [2048, 200, 1, 4160, 2048, 2048, 4160, 4160] + - [128, 6523.0] + - - [1024, 512, 1, 768, 1024, 1024, 768, 768] + - [180, 7219.0] + - - [4096, 200, 1, 8320, 4096, 4096, 8320, 8320] + - [178, 6897.0] + - - [2048, 512, 1, 896, 2048, 2048, 896, 896] + - [109, 8504.0] + - - [4096, 200, 1, 7168, 4096, 4096, 7168, 7168] + - [115, 6388.0] + - - [2048, 200, 1, 3840, 2048, 2048, 3840, 3840] + - [180, 5875.0] + - - [1024, 1024, 1, 768, 1024, 1024, 768, 768] + - [146, 8037.0] + - - [4096, 256, 1, 2304, 4096, 4096, 2304, 2304] + - [178, 8222.0] + - - [2048, 200, 1, 16640, 2048, 2048, 16640, 16640] + - [115, 6013.0] + - - [2048, 256, 1, 2816, 2048, 2048, 2816, 2816] + - [109, 7682.0] + - - [1024, 512, 1, 384, 1024, 1024, 384, 384] + - [172, 6825.0] + - - [2048, 200, 1, 7680, 2048, 2048, 7680, 7680] + - [187, 5562.0] + - - [1024, 512, 1, 4608, 1024, 1024, 4608, 4608] + - [148, 7644.0] + - - [4096, 200, 1, 32, 4096, 4096, 32, 32] + - [176, 2225.0] + - - [4096, 200, 1, 3328, 4096, 4096, 3328, 3328] + - [113, 6699.0] + - - [1024, 1024, 1, 1408, 1024, 1024, 1408, 1408] + - [131, 8529.0] + - - [2048, 200, 1, 15360, 2048, 2048, 15360, 15360] + - [120, 5565.0] + - - [512, 1024, 1, 2048, 512, 512, 2048, 2048] + - [180, 7017.0] + - - [4096, 256, 1, 5632, 4096, 4096, 5632, 5632] + - [166, 5605.0] + - - [2048, 256, 1, 1408, 2048, 2048, 1408, 1408] + - [180, 7585.0] + - - [2048, 256, 1, 6144, 2048, 2048, 6144, 6144] + - [115, 7747.0] + - - [4096, 256, 1, 3328, 4096, 4096, 3328, 3328] + - [178, 8302.0] + - - [2048, 512, 1, 6144, 2048, 2048, 6144, 6144] + - [178, 8548.0] + - - [2048, 512, 1, 3200, 2048, 2048, 3200, 3200] + - [131, 8476.0] + - - [2048, 200, 1, 4608, 2048, 2048, 4608, 4608] + - [109, 6208.0] + - - [1024, 1024, 1, 6144, 1024, 1024, 6144, 6144] + - [178, 8447.0] + - - [4096, 256, 1, 1664, 4096, 4096, 1664, 1664] + - [143, 8223.0] + - - [2048, 200, 1, 384, 2048, 2048, 384, 384] + - [169, 4640.0] + - - [4096, 256, 1, 1792, 4096, 4096, 1792, 1792] + - [146, 8554.0] + - - [2048, 512, 1, 2816, 2048, 2048, 2816, 2816] + - [113, 8283.0] + - - [4096, 256, 1, 384, 4096, 4096, 384, 384] + - [172, 7958.0] + - - [2048, 256, 1, 128, 2048, 2048, 128, 128] + - [177, 5023.0] + - - [1024, 1024, 1, 640, 1024, 1024, 640, 640] + - [180, 8022.0] + - - [4096, 200, 1, 5632, 4096, 4096, 5632, 5632] + - [104, 4522.0] + - - [2048, 200, 1, 1152, 2048, 2048, 1152, 1152] + - [159, 5477.0] + - - [4096, 256, 1, 512, 4096, 4096, 512, 512] + - [178, 7940.0] + - - [1024, 1024, 1, 384, 1024, 1024, 384, 384] + - [95, 8040.0] + - - [2048, 200, 1, 512, 2048, 2048, 512, 512] + - [174, 5293.0] + - - [2048, 256, 1, 9216, 2048, 2048, 9216, 9216] + - [102, 4993.0] + - - [2048, 256, 1, 1792, 2048, 2048, 1792, 1792] + - [174, 7239.0] + - - [4096, 200, 1, 1792, 4096, 4096, 1792, 1792] + - [178, 6692.0] + - - [2048, 200, 1, 1536, 2048, 2048, 1536, 1536] + - [109, 5684.0] + - - [1024, 1024, 1, 3072, 1024, 1024, 3072, 3072] + - [113, 8215.0] + - - [1024, 1024, 1, 2080, 1024, 1024, 2080, 2080] + - [108, 9696.0] + - - [2048, 200, 1, 2304, 2048, 2048, 2304, 2304] + - [109, 6186.0] + - - [2048, 256, 1, 7168, 2048, 2048, 7168, 7168] + - [187, 6574.0] + - - [2048, 512, 1, 1792, 2048, 2048, 1792, 1792] + - [146, 8054.0] + - - [1024, 1024, 1, 4608, 1024, 1024, 4608, 4608] + - [113, 8524.0] + - - [512, 1024, 1, 1280, 512, 512, 1280, 1280] + - [180, 7309.0] + - - [2048, 256, 1, 3200, 2048, 2048, 3200, 3200] + - [95, 8225.0] + - - [1024, 512, 1, 3328, 1024, 1024, 3328, 3328] + - [115, 7924.0] + - - [1024, 512, 1, 4160, 1024, 1024, 4160, 4160] + - [143, 8552.0] + - - [4096, 200, 1, 6656, 4096, 4096, 6656, 6656] + - [146, 6496.0] + - - [2048, 200, 1, 3328, 2048, 2048, 3328, 3328] + - [109, 6071.0] + - - [1024, 1024, 1, 256, 1024, 1024, 256, 256] + - [109, 7279.0] + - - [2048, 256, 1, 64, 2048, 2048, 64, 64] + - [125, 3624.0] + - - [2048, 256, 1, 2304, 2048, 2048, 2304, 2304] + - [100, 7765.0] + - - [4096, 200, 1, 8192, 4096, 4096, 8192, 8192] + - [180, 5305.0] + - - [1024, 512, 1, 7168, 1024, 1024, 7168, 7168] + - [180, 7815.0] + - - [1024, 512, 1, 1792, 1024, 1024, 1792, 1792] + - [180, 7419.0] + - - [4096, 200, 1, 2816, 4096, 4096, 2816, 2816] + - [113, 6677.0] + - - [1024, 1024, 1, 896, 1024, 1024, 896, 896] + - [115, 7962.0] + - - [4096, 256, 1, 5120, 4096, 4096, 5120, 5120] + - [115, 8384.0] + - - [4096, 256, 1, 2048, 4096, 4096, 2048, 2048] + - [146, 8149.0] + - - [2048, 256, 1, 5120, 2048, 2048, 5120, 5120] + - [180, 7672.0] + - - [2048, 256, 1, 7680, 2048, 2048, 7680, 7680] + - [104, 6701.0] + - - [2048, 200, 1, 3584, 2048, 2048, 3584, 3584] + - [109, 6058.0] + - - [1024, 512, 1, 1536, 1024, 1024, 1536, 1536] + - [180, 7523.0] + - - [2048, 200, 1, 64, 2048, 2048, 64, 64] + - [171, 2862.0] + - - [2048, 200, 1, 4096, 2048, 2048, 4096, 4096] + - [115, 6140.0] + - - [1024, 1024, 1, 1536, 1024, 1024, 1536, 1536] + - [109, 7901.0] + - - [4096, 256, 1, 32, 4096, 4096, 32, 32] + - [125, 3893.0] + - - [4096, 256, 1, 1280, 4096, 4096, 1280, 1280] + - [146, 8514.0] + - - [2048, 256, 1, 1024, 2048, 2048, 1024, 1024] + - [146, 7060.0] + - - [1024, 512, 1, 1152, 1024, 1024, 1152, 1152] + - [109, 7779.0] + - - [2048, 512, 1, 3328, 2048, 2048, 3328, 3328] + - [113, 8359.0] + - - [4096, 200, 1, 3584, 4096, 4096, 3584, 3584] + - [163, 6415.0] + - - [2048, 200, 1, 256, 2048, 2048, 256, 256] + - [178, 4496.0] + - - [4096, 256, 1, 1920, 4096, 4096, 1920, 1920] + - [146, 8635.0] + - - [2048, 256, 1, 1664, 2048, 2048, 1664, 1664] + - [174, 7878.0] + - - [4096, 200, 1, 5120, 4096, 4096, 5120, 5120] + - [148, 6588.0] + - - [1024, 512, 1, 8192, 1024, 1024, 8192, 8192] + - [148, 7069.0] + - - [4096, 200, 1, 896, 4096, 4096, 896, 896] + - [163, 6552.0] + - - [2048, 200, 1, 640, 2048, 2048, 640, 640] + - [174, 5711.0] + - - [4096, 200, 1, 1408, 4096, 4096, 1408, 1408] + - [143, 6740.0] + - - [2048, 200, 1, 5632, 2048, 2048, 5632, 5632] + - [109, 6255.0] + - - [1024, 512, 1, 2560, 1024, 1024, 2560, 2560] + - [180, 7774.0] + - - [4096, 200, 1, 1280, 4096, 4096, 1280, 1280] + - [143, 6194.0] + - - [1024, 1024, 1, 2560, 1024, 1024, 2560, 2560] + - [113, 8486.0] + - - [2048, 512, 1, 64, 2048, 2048, 64, 64] + - [140, 4821.0] + - - [2048, 200, 1, 8192, 2048, 2048, 8192, 8192] + - [120, 5238.0] + - - [2048, 512, 1, 3072, 2048, 2048, 3072, 3072] + - [146, 8096.0] + - - [4096, 256, 1, 640, 4096, 4096, 640, 640] + - [174, 8146.0] + - - [2048, 256, 1, 4096, 2048, 2048, 4096, 4096] + - [115, 7775.0] + - - [4096, 200, 1, 1664, 4096, 4096, 1664, 1664] + - [146, 6666.0] + - - [2048, 200, 1, 6656, 2048, 2048, 6656, 6656] + - [109, 6260.0] + - - [512, 1024, 1, 768, 512, 512, 768, 768] + - [148, 7232.0] + - - [2048, 200, 1, 8320, 2048, 2048, 8320, 8320] + - [100, 5678.0] + - - [4096, 256, 1, 3840, 4096, 4096, 3840, 3840] + - [180, 7698.0] + - - [1024, 1024, 1, 3200, 1024, 1024, 3200, 3200] + - [109, 8469.0] + - - [4096, 256, 1, 4608, 4096, 4096, 4608, 4608] + - [115, 6176.0] + - - [1024, 512, 1, 32, 1024, 1024, 32, 32] + - [185, 2512.0] + - - [1024, 512, 1, 3840, 1024, 1024, 3840, 3840] + - [115, 7541.0] + - - [2048, 512, 1, 1920, 2048, 2048, 1920, 1920] + - [99, 8621.0] + - - [4096, 200, 1, 6144, 4096, 4096, 6144, 6144] + - [113, 6080.0] + - - [2048, 200, 1, 2816, 2048, 2048, 2816, 2816] + - [109, 5957.0] + - - [1024, 1024, 1, 3840, 1024, 1024, 3840, 3840] + - [113, 8543.0] + - - [2048, 256, 1, 3840, 2048, 2048, 3840, 3840] + - [165, 7582.0] + - - [1024, 512, 1, 7680, 1024, 1024, 7680, 7680] + - [115, 7880.0] + - - [2048, 200, 1, 10240, 2048, 2048, 10240, 10240] + - [152, 4376.0] + - - [2048, 512, 1, 5120, 2048, 2048, 5120, 5120] + - [115, 8223.0] + - - [512, 1024, 1, 512, 512, 512, 512, 512] + - [180, 6618.0] + - - [2048, 512, 1, 32, 2048, 2048, 32, 32] + - [154, 3303.0] + - - [4096, 256, 1, 2560, 4096, 4096, 2560, 2560] + - [146, 8297.0] + - - [4096, 256, 1, 64, 4096, 4096, 64, 64] + - [174, 4863.0] + - - [2048, 200, 1, 768, 2048, 2048, 768, 768] + - [143, 5736.0] + - - [2048, 512, 1, 2560, 2048, 2048, 2560, 2560] + - [113, 8641.0] + - - [2048, 512, 1, 7168, 2048, 2048, 7168, 7168] + - [115, 7416.0] + - - [2048, 512, 1, 128, 2048, 2048, 128, 128] + - [95, 6416.0] + - - [4096, 200, 1, 2304, 4096, 4096, 2304, 2304] + - [146, 6727.0] + - - [2048, 512, 1, 4096, 2048, 2048, 4096, 4096] + - [115, 8102.0] + - - [2048, 256, 1, 2560, 2048, 2048, 2560, 2560] + - [180, 7490.0] + - - [2048, 256, 1, 4160, 2048, 2048, 4160, 4160] + - [95, 8639.0] + - - [1024, 512, 1, 1664, 1024, 1024, 1664, 1664] + - [109, 7976.0] + - - [2048, 512, 1, 2080, 2048, 2048, 2080, 2080] + - [125, 9637.0] + - - [2048, 512, 1, 3840, 2048, 2048, 3840, 3840] + - [113, 8390.0] + - - [4096, 200, 1, 3072, 4096, 4096, 3072, 3072] + - [178, 6507.0] + - - [1024, 1024, 1, 1664, 1024, 1024, 1664, 1664] + - [131, 8081.0] + - - [512, 1024, 1, 2304, 512, 512, 2304, 2304] + - [180, 7755.0] + - - [4096, 256, 1, 1408, 4096, 4096, 1408, 1408] + - [99, 8408.0] + - - [2048, 256, 1, 1152, 2048, 2048, 1152, 1152] + - [143, 7775.0] + - - [1024, 512, 1, 1280, 1024, 1024, 1280, 1280] + - [109, 7652.0] + - - [2048, 200, 1, 12288, 2048, 2048, 12288, 12288] + - [118, 4556.0] + - - [2048, 200, 1, 1664, 2048, 2048, 1664, 1664] + - [128, 5831.0] + - - [4096, 200, 1, 4608, 4096, 4096, 4608, 4608] + - [132, 6286.0] + - - [512, 1024, 1, 2560, 512, 512, 2560, 2560] + - [109, 7438.0] + - - [4096, 200, 1, 384, 4096, 4096, 384, 384] + - [172, 5488.0] + - - [2048, 200, 1, 128, 2048, 2048, 128, 128] + - [157, 3936.0] + - - [2048, 200, 1, 11264, 2048, 2048, 11264, 11264] + - [149, 4156.0] + - - [1024, 512, 1, 1920, 1024, 1024, 1920, 1920] + - [128, 7584.0] + - - [4096, 256, 1, 1536, 4096, 4096, 1536, 1536] + - [113, 8363.0] + - - [2048, 256, 1, 256, 2048, 2048, 256, 256] + - [143, 6095.0] + - - [2048, 256, 1, 10240, 2048, 2048, 10240, 10240] + - [186, 4864.0] + - - [1024, 512, 1, 5120, 1024, 1024, 5120, 5120] + - [148, 7609.0] + - - [1024, 512, 1, 8320, 1024, 1024, 8320, 8320] + - [159, 8323.0] + - - [1024, 512, 1, 10240, 1024, 1024, 10240, 10240] + - [148, 7901.0] + - - [1024, 1024, 1, 2048, 1024, 1024, 2048, 2048] + - [178, 7884.0] + - - [2048, 256, 1, 2080, 2048, 2048, 2080, 2080] + - [159, 8501.0] + - - [4096, 256, 1, 128, 4096, 4096, 128, 128] + - [180, 6331.0] + - - [2048, 256, 1, 896, 2048, 2048, 896, 896] + - [143, 7643.0] + - - [4096, 200, 1, 1152, 4096, 4096, 1152, 1152] + - [172, 6627.0] + - - [2048, 200, 1, 6144, 2048, 2048, 6144, 6144] + - [180, 6051.0] + - - [1024, 1024, 1, 7680, 1024, 1024, 7680, 7680] + - [115, 8314.0] + - - [2048, 200, 1, 1920, 2048, 2048, 1920, 1920] + - [174, 6039.0] + - - [4096, 256, 1, 2080, 4096, 4096, 2080, 2080] + - [93, 9466.0] + - - [2048, 200, 1, 14336, 2048, 2048, 14336, 14336] + - [120, 5381.0] + - - [1024, 512, 1, 6144, 1024, 1024, 6144, 6144] + - [115, 7714.0] + - - [1024, 512, 1, 2304, 1024, 1024, 2304, 2304] + - [165, 7736.0] + - - [4096, 200, 1, 4160, 4096, 4096, 4160, 4160] + - [143, 6872.0] + - - [4096, 200, 1, 1536, 4096, 4096, 1536, 1536] + - [174, 6321.0] + - - [2048, 320, 1, 64, 2048, 2048, 64, 64] + - [140, 4010.0] + - - [2048, 384, 1, 64, 2048, 2048, 64, 64] + - [140, 4766.0] + - - [1024, 384, 1, 289, 1024, 1024, 289, 289] + - [157, 5894.0] + - - [2048, 448, 1, 64, 2048, 2048, 64, 64] + - [93, 5300.0] + - - [102, 101, 624, 64, 102, 102, 64, 64] + - [156, 4482.0] + - - [101, 101, 624, 64, 101, 101, 64, 64] + - [92, 4500.0] + - - [85, 85, 752, 64, 85, 85, 64, 64] + - [122, 4038.0] + - - [112, 111, 576, 64, 112, 112, 64, 64] + - [161, 5355.0] + - - [65, 65, 992, 64, 65, 65, 64, 64] + - [155, 2652.0] + - - [77, 77, 816, 64, 77, 77, 64, 64] + - [122, 3555.0] + - - [111, 111, 576, 64, 111, 111, 64, 64] + - [97, 5034.0] + - - [84, 85, 752, 64, 84, 84, 64, 64] + - [155, 4083.0] + - - [84, 84, 752, 64, 84, 84, 64, 64] + - [155, 4048.0] + - - [71, 71, 896, 64, 71, 71, 64, 64] + - [122, 3113.0] + - - [122, 122, 528, 64, 122, 122, 64, 64] + - [162, 5365.0] + - - [78, 78, 816, 64, 78, 78, 64, 64] + - [122, 3641.0] + - - [112, 112, 576, 64, 112, 112, 64, 64] + - [161, 5727.0] + - - [77, 78, 816, 64, 77, 77, 64, 64] + - [90, 3519.0] + - - [111, 112, 576, 64, 111, 111, 64, 64] + - [161, 5144.0] + - - [92, 93, 688, 64, 92, 92, 64, 64] + - [90, 4418.0] + - - [102, 102, 624, 64, 102, 102, 64, 64] + - [92, 4587.0] + - - [99, 99, 624, 64, 99, 99, 64, 64] + - [161, 4382.0] + - - [100, 102, 624, 64, 100, 100, 64, 64] + - [124, 4525.0] + - - [123, 122, 528, 64, 123, 123, 64, 64] + - [130, 5311.0] + - - [99, 102, 624, 64, 99, 99, 64, 64] + - [171, 4439.0] + - - [93, 93, 688, 64, 93, 93, 64, 64] + - [90, 4384.0] + - - [123, 123, 528, 64, 123, 123, 64, 64] + - [162, 5397.0] + - - [100, 100, 624, 64, 100, 100, 64, 64] + - [92, 4434.0] + - - [101, 102, 624, 64, 101, 101, 64, 64] + - [124, 4552.0] + - - [102, 100, 624, 64, 102, 102, 64, 64] + - [161, 4509.0] + - - [92, 92, 688, 64, 92, 92, 64, 64] + - [122, 4393.0] + - - [3072, 128, 1, 4096, 3072, 3072, 4096, 4096] + - [175, 6638.0] + - - [1728, 320, 1, 64, 1728, 1728, 64, 64] + - [154, 4620.0] + - - [1440, 320, 1, 196, 1440, 1440, 196, 196] + - [157, 5652.0] + - - [2592, 384, 1, 289, 2592, 2592, 289, 289] + - [157, 7916.0] + - - [192, 80, 36, 10368, 192, 192, 10368, 10368] + - [148, 4377.0] + - - [1280, 384, 1, 64, 1280, 1280, 64, 64] + - [94, 4150.0] + - - [1280, 448, 1, 64, 1280, 1280, 64, 64] + - [157, 3989.0] + - - [3456, 256, 1, 169, 3456, 3456, 169, 169] + - [180, 7402.0] + - - [2304, 256, 1, 196, 2304, 2304, 196, 196] + - [90, 7420.0] + - - [224, 192, 36, 2592, 224, 224, 2592, 2592] + - [99, 8328.0] + - - [192, 128, 36, 1568, 192, 192, 1568, 1568] + - [95, 7308.0] + - - [1296, 288, 1, 196, 1296, 1296, 196, 196] + - [106, 3399.0] + - - [192, 64, 36, 6272, 192, 192, 6272, 6272] + - [97, 5626.0] + - - [1728, 224, 1, 1225, 1728, 1728, 1225, 1225] + - [93, 7275.0] + - - [1152, 384, 1, 64, 1152, 1152, 64, 64] + - [153, 3954.0] + - - [1792, 256, 1, 289, 1792, 1792, 289, 289] + - [157, 7507.0] + - - [1728, 384, 1, 169, 1728, 1728, 169, 169] + - [122, 6272.0] + - - [1568, 256, 1, 289, 1568, 1568, 289, 289] + - [93, 5949.0] + - - [1152, 448, 1, 64, 1152, 1152, 64, 64] + - [139, 4440.0] + - - [1536, 256, 1, 64, 1536, 1536, 64, 64] + - [137, 3226.0] + - - [1440, 320, 1, 49, 1440, 1440, 49, 49] + - [105, 2520.0] + - - [1344, 512, 1, 64, 1344, 1344, 64, 64] + - [171, 4386.0] + - - [1152, 256, 1, 196, 1152, 1152, 196, 196] + - [147, 5044.0] + - - [1728, 192, 1, 1225, 1728, 1728, 1225, 1225] + - [157, 6345.0] + - - [2048, 512, 1, 49, 2048, 2048, 49, 49] + - [155, 4008.0] + - - [512, 2048, 1, 49, 512, 512, 49, 49] + - [93, 4460.0] + - - [1728, 192, 1, 64, 1728, 1728, 64, 64] + - [89, 3208.0] + - - [1536, 384, 1, 64, 1536, 1536, 64, 64] + - [108, 3916.0] + - - [2048, 192, 1, 64, 2048, 2048, 64, 64] + - [171, 3235.0] + - - [128, 96, 36, 1568, 128, 128, 1568, 1568] + - [143, 6763.0] + - - [128, 128, 36, 3136, 128, 128, 3136, 3136] + - [115, 8875.0] + - - [1280, 320, 1, 64, 1280, 1280, 64, 64] + - [91, 3651.0] + - - [1792, 320, 1, 289, 1792, 1792, 289, 289] + - [172, 7082.0] + - - [2880, 320, 1, 64, 2880, 2880, 64, 64] + - [171, 4136.0] + - - [1728, 384, 1, 49, 1728, 1728, 49, 49] + - [171, 3423.0] + - - [512, 1024, 1, 196, 512, 512, 196, 196] + - [125, 5981.0] + - - [224, 192, 36, 5184, 224, 224, 5184, 5184] + - [99, 8416.0] + - - [192, 80, 36, 20736, 192, 192, 20736, 20736] + - [134, 3319.0] + - - [224, 192, 64, 4608, 224, 224, 4608, 4608] + - [148, 5076.0] + - - [224, 192, 64, 2304, 224, 224, 2304, 2304] + - [148, 6831.0] + - - [192, 80, 49, 14400, 192, 192, 14400, 14400] + - [183, 3186.0] + - - [224, 192, 49, 6272, 224, 224, 6272, 6272] + - [165, 5740.0] + - - [224, 192, 49, 3136, 224, 224, 3136, 3136] + - [163, 8384.0] + - - [192, 80, 36, 41472, 192, 192, 41472, 41472] + - [101, 3554.0] + - - [192, 80, 49, 28800, 192, 192, 28800, 28800] + - [151, 3423.0] + - - [192, 80, 64, 9216, 192, 192, 9216, 9216] + - [96, 2279.0] + - - [256, 224, 9, 9792, 256, 256, 9792, 9792] + - [143, 8185.0] + - - [256, 256, 9, 4896, 256, 256, 4896, 4896] + - [128, 9567.0] + - - [320, 256, 9, 4896, 320, 320, 4896, 4896] + - [128, 8230.0] + - - [224, 192, 9, 19584, 224, 224, 19584, 19584] + - [113, 6220.0] + - - [192, 192, 11, 3264, 192, 192, 3264, 3264] + - [90, 7082.0] + - - [192, 192, 11, 6528, 192, 192, 6528, 6528] + - [129, 6791.0] + - - [192, 192, 9, 4896, 192, 192, 4896, 4896] + - [93, 6774.0] + - - [224, 192, 11, 6528, 224, 224, 6528, 6528] + - [131, 6704.0] + - - [192, 192, 9, 19584, 192, 192, 19584, 19584] + - [163, 6072.0] + - - [256, 224, 11, 13056, 256, 256, 13056, 13056] + - [152, 5098.0] + - - [224, 192, 11, 13056, 224, 224, 13056, 13056] + - [113, 5762.0] + - - [256, 256, 11, 3264, 256, 256, 3264, 3264] + - [125, 7450.0] + - - [320, 256, 11, 6528, 320, 320, 6528, 6528] + - [99, 7484.0] + - - [192, 192, 9, 9792, 192, 192, 9792, 9792] + - [157, 6700.0] + - - [224, 224, 9, 9792, 224, 224, 9792, 9792] + - [128, 7242.0] + - - [224, 192, 11, 3264, 224, 224, 3264, 3264] + - [172, 7031.0] + - - [224, 224, 11, 6528, 224, 224, 6528, 6528] + - [131, 6266.0] + - - [224, 224, 9, 19584, 224, 224, 19584, 19584] + - [180, 6508.0] + - - [192, 192, 11, 13056, 192, 192, 13056, 13056] + - [116, 4774.0] + - - [224, 224, 9, 4896, 224, 224, 4896, 4896] + - [93, 7060.0] + - - [320, 256, 11, 3264, 320, 320, 3264, 3264] + - [122, 7567.0] + - - [256, 256, 11, 6528, 256, 256, 6528, 6528] + - [148, 7676.0] + - - [224, 192, 9, 4896, 224, 224, 4896, 4896] + - [93, 7894.0] + - - [224, 224, 11, 13056, 224, 224, 13056, 13056] + - [148, 5632.0] + - - [224, 224, 11, 3264, 224, 224, 3264, 3264] + - [172, 6769.0] + - - [256, 224, 11, 6528, 256, 256, 6528, 6528] + - [163, 6650.0] + - - [256, 224, 11, 3264, 256, 256, 3264, 3264] + - [157, 7721.0] + - - [224, 192, 9, 9792, 224, 224, 9792, 9792] + - [125, 7448.0] + - - [256, 224, 9, 4896, 256, 256, 4896, 4896] + - [95, 8391.0] + - - [64, 64, 496, 64, 64, 64, 64, 64] + - [164, 3769.0] + - - [135, 135, 32, 64, 135, 135, 64, 64] + - [172, 2610.0] + - - [64, 65, 496, 64, 64, 64, 64, 64] + - [171, 3546.0] + - - [65, 65, 472, 64, 65, 65, 64, 64] + - [155, 2551.0] + - - [65, 65, 496, 64, 65, 65, 64, 64] + - [90, 2556.0] + - - [70, 70, 216, 64, 70, 70, 64, 64] + - [88, 2677.0] + - - [70, 71, 216, 64, 70, 70, 64, 64] + - [153, 2433.0] + - - [71, 71, 216, 64, 71, 71, 64, 64] + - [153, 2638.0] + - - [71, 71, 448, 64, 71, 71, 64, 64] + - [155, 2572.0] + - - [77, 77, 248, 64, 77, 77, 64, 64] + - [154, 2687.0] + - - [77, 77, 408, 64, 77, 77, 64, 64] + - [155, 2768.0] + - - [77, 78, 248, 64, 77, 77, 64, 64] + - [122, 2822.0] + - - [77, 78, 408, 64, 77, 77, 64, 64] + - [168, 3056.0] + - - [78, 78, 248, 64, 78, 78, 64, 64] + - [88, 3040.0] + - - [78, 78, 408, 64, 78, 78, 64, 64] + - [155, 2859.0] + - - [80, 80, 152, 64, 80, 80, 64, 64] + - [155, 2856.0] + - - [80, 84, 152, 64, 80, 80, 64, 64] + - [155, 2942.0] + - - [84, 84, 152, 64, 84, 84, 64, 64] + - [122, 2862.0] + - - [85, 85, 376, 64, 85, 85, 64, 64] + - [155, 3493.0] + - - [93, 93, 344, 64, 93, 93, 64, 64] + - [160, 4046.0] + - - [102, 102, 312, 64, 102, 102, 64, 64] + - [92, 4330.0] + - - [112, 112, 288, 64, 112, 112, 64, 64] + - [97, 5193.0] + - - [122, 122, 264, 64, 122, 122, 64, 64] + - [130, 4935.0] + - - [123, 122, 264, 64, 123, 123, 64, 64] + - [98, 4979.0] + - - [123, 123, 264, 64, 123, 123, 64, 64] + - [98, 4956.0] + - - [511, 2048, 1, 2048, 511, 511, 2048, 2048] + - [146, 8414.0] + - - [1024, 512, 1, 1025, 1024, 1024, 1025, 1025] + - [174, 7543.0] + - - [512, 1023, 1, 1024, 512, 512, 1024, 1024] + - [180, 7205.0] + - - [1025, 1024, 1, 1024, 1025, 1025, 1024, 1024] + - [109, 7979.0] + - - [2048, 513, 1, 2048, 2048, 2048, 2048, 2048] + - [115, 7567.0] + - - [1024, 1024, 1, 1025, 1024, 1024, 1025, 1025] + - [93, 8409.0] + - - [960, 1024, 1, 1023, 960, 960, 1023, 1023] + - [140, 7833.0] + - - [1024, 1024, 1, 1024, 1024, 1024, 1024, 1024] + - [146, 8243.0] + - - [960, 1025, 1, 1024, 960, 960, 1024, 1024] + - [172, 7468.0] + - - [2049, 512, 1, 2048, 2049, 2049, 2048, 2048] + - [115, 7816.0] + - - [513, 1024, 1, 1024, 513, 513, 1024, 1024] + - [109, 7363.0] + - - [512, 2048, 1, 2048, 512, 512, 2048, 2048] + - [146, 8550.0] + - - [1024, 511, 1, 1024, 1024, 1024, 1024, 1024] + - [165, 6968.0] + - - [1024, 512, 1, 1023, 1024, 1024, 1023, 1023] + - [109, 8102.0] + - - [960, 1024, 1, 1025, 960, 960, 1025, 1025] + - [93, 8345.0] + - - [959, 1024, 1, 1024, 959, 959, 1024, 1024] + - [172, 7694.0] + - - [2048, 512, 1, 2049, 2048, 2048, 2049, 2049] + - [95, 8597.0] + - - [511, 1024, 1, 1024, 511, 511, 1024, 1024] + - [115, 7089.0] + - - [512, 2049, 1, 2048, 512, 512, 2048, 2048] + - [146, 8434.0] + - - [1024, 513, 1, 1024, 1024, 1024, 1024, 1024] + - [180, 6979.0] + - - [2048, 512, 1, 2047, 2048, 2048, 2047, 2047] + - [128, 8714.0] + - - [1025, 512, 1, 1024, 1025, 1025, 1024, 1024] + - [143, 6979.0] + - - [1024, 1024, 1, 1023, 1024, 1024, 1023, 1023] + - [108, 8745.0] + - - [513, 2048, 1, 2048, 513, 513, 2048, 2048] + - [148, 8105.0] + - - [1024, 1025, 1, 1024, 1024, 1024, 1024, 1024] + - [148, 7837.0] + - - [512, 2048, 1, 2049, 512, 512, 2049, 2049] + - [93, 8417.0] + - - [1024, 1023, 1, 1024, 1024, 1024, 1024, 1024] + - [109, 7449.0] + - - [960, 1023, 1, 1024, 960, 960, 1024, 1024] + - [146, 7703.0] + - - [2048, 511, 1, 2048, 2048, 2048, 2048, 2048] + - [113, 8014.0] + - - [1023, 512, 1, 1024, 1023, 1023, 1024, 1024] + - [180, 7044.0] + - - [2047, 512, 1, 2048, 2047, 2047, 2048, 2048] + - [113, 8416.0] + - - [512, 1024, 1, 1024, 512, 512, 1024, 1024] + - [148, 7199.0] + - - [512, 1024, 1, 1025, 512, 512, 1025, 1025] + - [93, 7476.0] + - - [512, 2047, 1, 2048, 512, 512, 2048, 2048] + - [146, 8419.0] + - - [512, 1025, 1, 1024, 512, 512, 1024, 1024] + - [180, 7078.0] + - - [512, 2048, 1, 2047, 512, 512, 2047, 2047] + - [128, 8758.0] + - - [960, 1024, 1, 1024, 960, 960, 1024, 1024] + - [146, 7614.0] + - - [961, 1024, 1, 1024, 961, 961, 1024, 1024] + - [178, 7667.0] + - - [512, 1024, 1, 1023, 512, 512, 1023, 1023] + - [95, 7878.0] + - - [1023, 1024, 1, 1024, 1023, 1023, 1024, 1024] + - [172, 7817.0] + - - [479, 1024, 1, 1024, 479, 479, 1024, 1024] + - [109, 6877.0] + - - [479, 2048, 1, 2048, 479, 479, 2048, 2048] + - [146, 7605.0] + - - [480, 1023, 1, 1024, 480, 480, 1024, 1024] + - [109, 6123.0] + - - [480, 1024, 1, 1023, 480, 480, 1023, 1023] + - [95, 7386.0] + - - [480, 1024, 1, 1025, 480, 480, 1025, 1025] + - [95, 7400.0] + - - [480, 1025, 1, 1024, 480, 480, 1024, 1024] + - [109, 7023.0] + - - [480, 2047, 1, 2048, 480, 480, 2048, 2048] + - [146, 8032.0] + - - [480, 2048, 1, 2047, 480, 480, 2047, 2047] + - [93, 8251.0] + - - [480, 2048, 1, 2049, 480, 480, 2049, 2049] + - [108, 8211.0] + - - [480, 2049, 1, 2048, 480, 480, 2048, 2048] + - [109, 7555.0] + - - [480, 3071, 1, 3072, 480, 480, 3072, 3072] + - [115, 9038.0] + - - [481, 1024, 1, 1024, 481, 481, 1024, 1024] + - [180, 6823.0] + - - [481, 2048, 1, 2048, 481, 481, 2048, 2048] + - [113, 7973.0] + - - [1023, 480, 1, 1024, 1023, 1023, 1024, 1024] + - [148, 6692.0] + - - [1024, 479, 1, 1024, 1024, 1024, 1024, 1024] + - [187, 5892.0] + - - [1024, 480, 1, 1023, 1024, 1024, 1023, 1023] + - [128, 7421.0] + - - [1024, 480, 1, 1025, 1024, 1024, 1025, 1025] + - [128, 7287.0] + - - [1024, 481, 1, 1024, 1024, 1024, 1024, 1024] + - [109, 6700.0] + - - [1025, 480, 1, 1024, 1025, 1025, 1024, 1024] + - [143, 7019.0] + - - [2047, 480, 1, 2048, 2047, 2047, 2048, 2048] + - [113, 7775.0] + - - [2048, 479, 1, 2048, 2048, 2048, 2048, 2048] + - [146, 7523.0] + - - [2048, 480, 1, 2047, 2048, 2048, 2047, 2047] + - [131, 8084.0] + - - [2048, 480, 1, 2049, 2048, 2048, 2049, 2049] + - [131, 8079.0] + - - [2048, 481, 1, 2048, 2048, 2048, 2048, 2048] + - [146, 7465.0] + - - [2049, 480, 1, 2048, 2049, 2049, 2048, 2048] + - [109, 7558.0] + - - [3071, 480, 1, 3072, 3071, 3071, 3072, 3072] + - [148, 9003.0] + - - [480, 1024, 1, 1024, 480, 480, 1024, 1024] + - [172, 6526.0] + - - [480, 2048, 1, 2048, 480, 480, 2048, 2048] + - [113, 7584.0] + - - [1024, 480, 1, 1024, 1024, 1024, 1024, 1024] + - [180, 6691.0] + - - [2048, 480, 1, 2048, 2048, 2048, 2048, 2048] + - [113, 7831.0] + - - [1024, 512, 1, 2048, 1024, 1024, 2048, 2048] + - [148, 7607.0] + - - [1024, 960, 1, 1024, 1024, 1024, 1024, 1024] + - [140, 8617.0] + - - [1024, 960, 1, 1600, 1024, 1024, 1600, 1600] + - [163, 9206.0] + - - [1024, 1024, 1, 960, 1024, 1024, 960, 960] + - [109, 8672.0] + - - [2048, 215, 1, 512, 2048, 2048, 512, 512] + - [143, 5645.0] + - - [2048, 215, 1, 768, 2048, 2048, 768, 768] + - [174, 6091.0] + - - [2048, 256, 1, 512, 2048, 2048, 512, 512] + - [109, 7128.0] + - - [2048, 256, 1, 768, 2048, 2048, 768, 768] + - [100, 6697.0] + - - [2048, 512, 1, 2048, 2048, 2048, 2048, 2048] + - [146, 8368.0] + - - [2048, 512, 1, 67, 2048, 2048, 67, 67] + - [159, 4019.0] + - - [2048, 512, 1, 74, 2048, 2048, 74, 74] + - [157, 5374.0] + - - [256, 1280, 1, 1024, 256, 256, 1024, 1024] + - [148, 7814.0] + - - [256, 1536, 1, 1024, 256, 256, 1024, 1024] + - [113, 6720.0] + - - [256, 2304, 1, 1024, 256, 256, 1024, 1024] + - [180, 8285.0] + - - [256, 2560, 1, 1024, 256, 256, 1024, 1024] + - [109, 9027.0] + - - [256, 2816, 1, 1024, 256, 256, 1024, 1024] + - [113, 7615.0] + - - [256, 3328, 1, 1024, 256, 256, 1024, 1024] + - [143, 8317.0] + - - [256, 3584, 1, 1024, 256, 256, 1024, 1024] + - [109, 8895.0] + - - [512, 1600, 1, 512, 512, 512, 512, 512] + - [146, 8244.0] + - - [767, 1280, 1, 768, 767, 767, 768, 768] + - [148, 9139.0] + - - [769, 1280, 1, 768, 769, 769, 768, 768] + - [165, 5858.0] + - - [768, 1279, 1, 768, 768, 768, 768, 768] + - [132, 8373.0] + - - [768, 1281, 1, 768, 768, 768, 768, 768] + - [146, 5940.0] + - - [768, 1280, 1, 767, 768, 768, 767, 767] + - [143, 9612.0] + - - [768, 1280, 1, 769, 768, 768, 769, 769] + - [157, 8519.0] + - - [256, 4096, 1, 512, 256, 256, 512, 512] + - [148, 6330.0] + - - [767, 768, 1, 768, 767, 767, 768, 768] + - [109, 7725.0] + - - [769, 768, 1, 768, 769, 769, 768, 768] + - [129, 5471.0] + - - [768, 767, 1, 768, 768, 768, 768, 768] + - [163, 6834.0] + - - [768, 769, 1, 768, 768, 768, 768, 768] + - [99, 6082.0] + - - [768, 768, 1, 767, 768, 768, 767, 767] + - [157, 7596.0] + - - [768, 768, 1, 769, 768, 768, 769, 769] + - [125, 8039.0] + - - [768, 768, 1, 768, 768, 768, 768, 768] + - [163, 7342.0] + - - [128, 128, 49, 1152, 128, 128, 1152, 1152] + - [109, 8066.0] + - - [128, 128, 49, 1216, 128, 128, 1216, 1216] + - [143, 8867.0] + - - [128, 128, 36, 1800, 128, 128, 1800, 1800] + - [95, 9101.0] + - - [128, 128, 36, 1900, 128, 128, 1900, 1900] + - [165, 8053.0] + - - [128, 128, 64, 5880, 128, 128, 5880, 5880] + - [163, 8768.0] + - - [128, 128, 49, 7680, 128, 128, 7680, 7680] + - [120, 5321.0] + - - [128, 128, 64, 882, 128, 128, 882, 882] + - [95, 8497.0] + - - [128, 128, 64, 931, 128, 128, 931, 931] + - [174, 8530.0] + - - [128, 64, 121, 1152, 128, 128, 1152, 1152] + - [151, 6034.0] + - - [128, 64, 81, 12000, 128, 128, 12000, 12000] + - [162, 4759.0] + - - [128, 64, 121, 1216, 128, 128, 1216, 1216] + - [178, 8155.0] + - - [128, 64, 81, 1800, 128, 128, 1800, 1800] + - [108, 7444.0] + - - [128, 64, 81, 1900, 128, 128, 1900, 1900] + - [178, 7379.0] + - - [128, 64, 49, 20280, 128, 128, 20280, 20280] + - [130, 5081.0] + - - [128, 64, 49, 3042, 128, 128, 3042, 3042] + - [113, 7411.0] + - - [128, 64, 49, 3211, 128, 128, 3211, 3211] + - [113, 7244.0] + - - [128, 64, 169, 5880, 128, 128, 5880, 5880] + - [125, 3834.0] + - - [128, 64, 121, 7680, 128, 128, 7680, 7680] + - [135, 3393.0] + - - [128, 64, 169, 882, 128, 128, 882, 882] + - [178, 6929.0] + - - [128, 64, 169, 931, 128, 128, 931, 931] + - [178, 6834.0] + - - [256, 128, 25, 1080, 256, 256, 1080, 1080] + - [93, 10052.0] + - - [256, 128, 25, 162, 256, 256, 162, 162] + - [132, 5745.0] + - - [256, 128, 25, 171, 256, 256, 171, 171] + - [157, 6504.0] + - - [1152, 256, 1, 1, 1152, 1152, 1, 1] + - [164, 113.0] + - - [1152, 256, 1, 1444, 1152, 1152, 1444, 1444] + - [174, 6745.0] + - - [1152, 256, 1, 25, 1152, 1152, 25, 25] + - [123, 1350.0] + - - [1152, 256, 1, 9, 1152, 1152, 9, 9] + - [167, 575.0] + - - [2304, 256, 1, 1444, 2304, 2304, 1444, 1444] + - [128, 9166.0] + - - [2304, 340, 1, 1, 2304, 2304, 1, 1] + - [126, 114.0] + - - [2304, 340, 1, 1444, 2304, 2304, 1444, 1444] + - [159, 8042.0] + - - [2304, 340, 1, 9, 2304, 2304, 9, 9] + - [164, 920.0] + - - [2304, 510, 1, 25, 2304, 2304, 25, 25] + - [153, 2761.0] + - - [30522, 77, 1, 1024, 30522, 30522, 1024, 1024] + - [115, 6114.0] + - - [1024, 780, 1, 1024, 1024, 1024, 1024, 1024] + - [143, 7353.0] + - - [1024, 800, 1, 1024, 1024, 1024, 1024, 1024] + - [109, 8235.0] + - - [1024, 820, 1, 1024, 1024, 1024, 1024, 1024] + - [180, 7961.0] + - - [1024, 385, 1, 1024, 1024, 1024, 1024, 1024] + - [146, 6449.0] + - - [1024, 462, 1, 1024, 1024, 1024, 1024, 1024] + - [109, 6525.0] + - - [64, 512, 256, 512, 64, 64, 512, 512] + - [147, 6786.0] + - - [64, 512, 128, 512, 64, 64, 512, 512] + - [179, 6622.0] + - - [64, 512, 40, 512, 64, 64, 512, 512] + - [114, 6784.0] + - - [96, 1024, 64, 1024, 96, 96, 1024, 1024] + - [180, 7481.0] + - - [96, 1024, 128, 1024, 96, 96, 1024, 1024] + - [177, 4943.0] + - - [64, 1024, 256, 1024, 64, 64, 1024, 1024] + - [133, 3820.0] + - - [64, 1024, 32, 1024, 64, 64, 1024, 1024] + - [112, 6412.0] + - - [64, 1024, 64, 1024, 64, 64, 1024, 1024] + - [177, 6409.0] + - - [64, 1024, 128, 1024, 64, 64, 1024, 1024] + - [150, 4015.0] + - - [64, 128, 1024, 128, 64, 64, 128, 128] + - [119, 5557.0] + - - [1024, 864, 1, 1024, 1024, 1024, 1024, 1024] + - [143, 7658.0] + - - [1024, 864, 1, 480, 1024, 1024, 480, 480] + - [125, 8494.0] + - - [128, 3456, 1, 256, 128, 128, 256, 256] + - [140, 6049.0] + - - [128, 4096, 1, 256, 128, 128, 256, 256] + - [140, 5770.0] + - - [128, 6912, 1, 256, 128, 128, 256, 256] + - [180, 3836.0] + - - [256, 3456, 1, 512, 256, 256, 512, 512] + - [178, 6588.0] + - - [512, 864, 1, 1024, 512, 512, 1024, 1024] + - [113, 6210.0] + - - [512, 864, 1, 13, 512, 512, 13, 13] + - [154, 962.0] + - - [64, 128, 1280, 128, 64, 64, 128, 128] + - [119, 5377.0] + - - [64, 128, 1312, 128, 64, 64, 128, 128] + - [119, 5382.0] + - - [64, 512, 192, 512, 64, 64, 512, 512] + - [147, 6818.0] + - - [1024, 512, 1, 196, 1024, 1024, 196, 196] + - [90, 6407.0] + - - [64, 128, 2048, 128, 64, 64, 128, 128] + - [150, 5435.0] + - - [64, 128, 1536, 128, 64, 64, 128, 128] + - [117, 5424.0] + - - [128, 128, 64, 6400, 128, 128, 6400, 6400] + - [187, 6594.0] + - - [64, 128, 192, 128, 64, 64, 128, 128] + - [114, 5492.0] + - - [64, 384, 144, 384, 64, 64, 384, 384] + - [147, 6686.0] + - - [64, 512, 48, 512, 64, 64, 512, 512] + - [179, 6665.0] + - - [64, 128, 256, 128, 64, 64, 128, 128] + - [179, 6745.0] + - - [64, 384, 192, 384, 64, 64, 384, 384] + - [147, 6879.0] + - - [128, 128, 49, 1120, 128, 128, 1120, 1120] + - [95, 8725.0] + - - [128, 128, 49, 1064, 128, 128, 1064, 1064] + - [140, 9583.0] + - - [128, 128, 49, 1040, 128, 128, 1040, 1040] + - [140, 8715.0] + - - [128, 128, 64, 600, 128, 128, 600, 600] + - [143, 8472.0] + - - [128, 128, 64, 616, 128, 128, 616, 616] + - [140, 8786.0] + - - [128, 128, 49, 950, 128, 128, 950, 950] + - [172, 8653.0] + - - [128, 128, 49, 972, 128, 128, 972, 972] + - [140, 9288.0] + - - [128, 128, 64, 560, 128, 128, 560, 560] + - [109, 8434.0] + - - [128, 128, 49, 1008, 128, 128, 1008, 1008] + - [172, 9390.0] + - - [128, 128, 64, 532, 128, 128, 532, 532] + - [172, 8103.0] + - - [128, 128, 49, 1080, 128, 128, 1080, 1080] + - [108, 9490.0] + - - [128, 128, 64, 588, 128, 128, 588, 588] + - [178, 8160.0] + - - [128, 128, 49, 1160, 128, 128, 1160, 1160] + - [174, 8822.0] + - - [128, 128, 49, 988, 128, 128, 988, 988] + - [108, 9369.0] + - - [128, 128, 49, 936, 128, 128, 936, 936] + - [143, 8566.0] + - - [512, 1024, 1, 3800, 512, 512, 3800, 3800] + - [128, 8667.0] + - - [512, 1024, 1, 3400, 512, 512, 3400, 3400] + - [159, 8701.0] + - - [512, 1024, 1, 3456, 512, 512, 3456, 3456] + - [159, 8212.0] + - - [2048, 512, 1, 950, 2048, 2048, 950, 950] + - [93, 9405.0] + - - [512, 1024, 1, 3552, 512, 512, 3552, 3552] + - [128, 8655.0] + - - [512, 1024, 1, 3220, 512, 512, 3220, 3220] + - [174, 8383.0] + - - [2048, 512, 1, 850, 2048, 2048, 850, 850] + - [93, 8635.0] + - - [512, 2048, 1, 864, 512, 512, 864, 864] + - [157, 8715.0] + - - [512, 2048, 1, 768, 512, 512, 768, 768] + - [113, 8261.0] + - - [2048, 512, 1, 805, 2048, 2048, 805, 805] + - [172, 9120.0] + - - [512, 1024, 1, 2852, 512, 512, 2852, 2852] + - [128, 8581.0] + - - [512, 2048, 1, 888, 512, 512, 888, 888] + - [93, 8407.0] + - - [2048, 512, 1, 864, 2048, 2048, 864, 864] + - [172, 9241.0] + - - [2048, 512, 1, 888, 2048, 2048, 888, 888] + - [95, 8705.0] + - - [2048, 256, 1, 950, 2048, 2048, 950, 950] + - [159, 7962.0] + - - [2048, 512, 1, 713, 2048, 2048, 713, 713] + - [140, 9001.0] + - - [512, 1024, 1, 2688, 512, 512, 2688, 2688] + - [95, 8154.0] + - - [512, 1024, 1, 2640, 512, 512, 2640, 2640] + - [128, 8557.0] + - - [512, 1024, 1, 2904, 512, 512, 2904, 2904] + - [174, 8500.0] + - - [1024, 512, 1, 950, 1024, 1024, 950, 950] + - [128, 7888.0] + - - [512, 2048, 1, 672, 512, 512, 672, 672] + - [93, 9443.0] + - - [512, 2048, 1, 660, 512, 512, 660, 660] + - [125, 9196.0] + - - [512, 2048, 1, 1008, 512, 512, 1008, 1008] + - [93, 9417.0] + - - [2048, 256, 1, 850, 2048, 2048, 850, 850] + - [174, 7689.0] + - - [2048, 512, 1, 726, 2048, 2048, 726, 726] + - [125, 8965.0] + - - [1024, 512, 1, 850, 1024, 1024, 850, 850] + - [128, 7871.0] + - - [2048, 512, 1, 660, 2048, 2048, 660, 660] + - [125, 8823.0] + - - [2048, 512, 1, 672, 2048, 2048, 672, 672] + - [140, 9151.0] + - - [512, 2048, 1, 840, 512, 512, 840, 840] + - [157, 9094.0] + - - [2048, 512, 1, 1008, 2048, 2048, 1008, 1008] + - [93, 9213.0] + - - [512, 2048, 1, 792, 512, 512, 792, 792] + - [125, 8777.0] + - - [1024, 512, 1, 805, 1024, 1024, 805, 805] + - [128, 7704.0] + - - [512, 2048, 1, 1050, 512, 512, 1050, 1050] + - [93, 9316.0] + - - [2048, 512, 1, 748, 2048, 2048, 748, 748] + - [125, 9065.0] + - - [2048, 256, 1, 864, 2048, 2048, 864, 864] + - [159, 7986.0] + - - [1024, 512, 1, 864, 1024, 1024, 864, 864] + - [109, 7922.0] + - - [2048, 512, 1, 875, 2048, 2048, 875, 875] + - [157, 9047.0] + - - [2048, 512, 1, 840, 2048, 2048, 840, 840] + - [172, 9210.0] + - - [2048, 512, 1, 792, 2048, 2048, 792, 792] + - [95, 8497.0] + - - [512, 2048, 1, 736, 512, 512, 736, 736] + - [125, 9166.0] + - - [2048, 256, 1, 888, 2048, 2048, 888, 888] + - [159, 8049.0] + - - [512, 2048, 1, 704, 512, 512, 704, 704] + - [93, 9267.0] + - - [512, 2048, 1, 588, 512, 512, 588, 588] + - [157, 8851.0] + - - [1024, 512, 1, 888, 1024, 1024, 888, 888] + - [128, 7972.0] + - - [512, 2048, 1, 816, 512, 512, 816, 816] + - [93, 9345.0] + - - [1024, 512, 1, 713, 1024, 1024, 713, 713] + - [174, 7341.0] + - - [2048, 512, 1, 736, 2048, 2048, 736, 736] + - [172, 9254.0] + - - [2048, 512, 1, 588, 2048, 2048, 588, 588] + - [172, 8915.0] + - - [2048, 512, 1, 704, 2048, 2048, 704, 704] + - [109, 8558.0] + - - [1024, 512, 1, 660, 1024, 1024, 660, 660] + - [159, 7493.0] + - - [2048, 256, 1, 660, 2048, 2048, 660, 660] + - [128, 7759.0] + - - [2048, 256, 1, 672, 2048, 2048, 672, 672] + - [128, 7815.0] + - - [1024, 512, 1, 672, 1024, 1024, 672, 672] + - [128, 7760.0] + - - [1024, 512, 1, 726, 1024, 1024, 726, 726] + - [174, 7379.0] + - - [512, 2048, 1, 630, 512, 512, 630, 630] + - [93, 9089.0] + - - [512, 2048, 1, 600, 512, 512, 600, 600] + - [93, 9084.0] + - - [2048, 256, 1, 805, 2048, 2048, 805, 805] + - [159, 8033.0] + - - [2048, 256, 1, 713, 2048, 2048, 713, 713] + - [159, 7692.0] + - - [2048, 256, 1, 726, 2048, 2048, 726, 726] + - [128, 7845.0] + - - [320, 1024, 1, 1024, 320, 320, 1024, 1024] + - [110, 7011.0] + - - [1024, 1000, 1, 1024, 1024, 1024, 1024, 1024] + - [113, 7857.0] + - - [320, 1000, 1, 1024, 320, 320, 1024, 1024] + - [144, 6509.0] + - - [128, 128, 49, 1280, 128, 128, 1280, 1280] + - [113, 8491.0] + - - [128, 128, 49, 1360, 128, 128, 1360, 1360] + - [157, 9484.0] + - - [128, 128, 49, 1200, 128, 128, 1200, 1200] + - [125, 9339.0] + - - [128, 128, 49, 1240, 128, 128, 1240, 1240] + - [174, 8782.0] + - - [2304, 256, 1, 704, 2304, 2304, 704, 704] + - [159, 8464.0] + - - [2304, 256, 1, 736, 2304, 2304, 736, 736] + - [128, 9059.0] + - - [2304, 256, 1, 792, 2304, 2304, 792, 792] + - [180, 7977.0] + - - [2304, 256, 1, 748, 2304, 2304, 748, 748] + - [128, 8517.0] + - - [2304, 256, 1, 726, 2304, 2304, 726, 726] + - [128, 8679.0] + - - [2304, 256, 1, 713, 2304, 2304, 713, 713] + - [128, 8593.0] + - - [2304, 256, 1, 768, 2304, 2304, 768, 768] + - [109, 8054.0] + - - [512, 2048, 1, 759, 512, 512, 759, 759] + - [125, 9152.0] + - - [512, 2048, 1, 925, 512, 512, 925, 925] + - [93, 9285.0] + - - [2304, 256, 1, 805, 2304, 2304, 805, 805] + - [109, 8735.0] + - - [512, 2048, 1, 900, 512, 512, 900, 900] + - [157, 9076.0] + - - [512, 2048, 1, 875, 512, 512, 875, 875] + - [93, 9277.0] + - - [512, 2048, 1, 748, 512, 512, 748, 748] + - [90, 8787.0] + - - [512, 2048, 1, 726, 512, 512, 726, 726] + - [93, 9241.0] + - - [512, 2048, 1, 713, 512, 512, 713, 713] + - [125, 9104.0] + - - [512, 2048, 1, 805, 512, 512, 805, 805] + - [93, 9440.0] + - - [512, 2048, 1, 850, 512, 512, 850, 850] + - [93, 9300.0] + - - [512, 2048, 1, 950, 512, 512, 950, 950] + - [93, 9272.0] + - - [96, 1024, 160, 1024, 96, 96, 1024, 1024] + - [148, 3951.0] + - - [96, 1024, 40, 1024, 96, 96, 1024, 1024] + - [180, 7539.0] + - - [96, 1024, 80, 1024, 96, 96, 1024, 1024] + - [146, 6109.0] + - - [96, 1024, 96, 1024, 96, 96, 1024, 1024] + - [113, 4574.0] + - - [96, 1024, 24, 1024, 96, 96, 1024, 1024] + - [115, 7172.0] + - - [96, 1024, 48, 1024, 96, 96, 1024, 1024] + - [180, 7444.0] + - - [96, 1024, 16, 1024, 96, 96, 1024, 1024] + - [187, 5849.0] + - - [96, 1024, 32, 1024, 96, 96, 1024, 1024] + - [148, 7451.0] + - - [64, 512, 320, 512, 64, 64, 512, 512] + - [150, 5424.0] + - - [64, 512, 80, 512, 64, 64, 512, 512] + - [147, 6344.0] + - - [29000, 109, 1, 2560, 29000, 29000, 2560, 2560] + - [148, 7307.0] + - - [29000, 121, 1, 2560, 29000, 29000, 2560, 2560] + - [148, 8042.0] + - - [29000, 65, 1, 2560, 29000, 29000, 2560, 2560] + - [180, 4865.0] + - - [29000, 66, 1, 2560, 29000, 29000, 2560, 2560] + - [148, 4757.0] + - - [29000, 67, 1, 2560, 29000, 29000, 2560, 2560] + - [148, 4784.0] + - - [29000, 69, 1, 2560, 29000, 29000, 2560, 2560] + - [115, 5429.0] + - - [29000, 70, 1, 2560, 29000, 29000, 2560, 2560] + - [180, 4998.0] + - - [29000, 71, 1, 2560, 29000, 29000, 2560, 2560] + - [115, 5304.0] + - - [29000, 73, 1, 2560, 29000, 29000, 2560, 2560] + - [180, 5197.0] + - - [29000, 74, 1, 2560, 29000, 29000, 2560, 2560] + - [180, 5250.0] + - - [29000, 75, 1, 2560, 29000, 29000, 2560, 2560] + - [148, 5181.0] + - - [29000, 77, 1, 2560, 29000, 29000, 2560, 2560] + - [180, 5210.0] + - - [29000, 78, 1, 2560, 29000, 29000, 2560, 2560] + - [115, 5490.0] + - - [29000, 80, 1, 2560, 29000, 29000, 2560, 2560] + - [148, 5668.0] + - - [29000, 81, 1, 2560, 29000, 29000, 2560, 2560] + - [180, 5657.0] + - - [29000, 82, 1, 2560, 29000, 29000, 2560, 2560] + - [148, 5819.0] + - - [29000, 83, 1, 2560, 29000, 29000, 2560, 2560] + - [148, 5944.0] + - - [29000, 84, 1, 2560, 29000, 29000, 2560, 2560] + - [180, 5736.0] + - - [29000, 88, 1, 2560, 29000, 29000, 2560, 2560] + - [148, 6080.0] + - - [29000, 89, 1, 2560, 29000, 29000, 2560, 2560] + - [180, 6065.0] + - - [29000, 90, 1, 2560, 29000, 29000, 2560, 2560] + - [180, 6216.0] + - - [29000, 92, 1, 2560, 29000, 29000, 2560, 2560] + - [148, 6260.0] + - - [29000, 95, 1, 2560, 29000, 29000, 2560, 2560] + - [115, 6651.0] + - - [29000, 98, 1, 2560, 29000, 29000, 2560, 2560] + - [120, 6495.0] + - - [64, 1024, 512, 1024, 64, 64, 1024, 1024] + - [114, 5299.0] + - - [1024, 200, 1, 13312, 1024, 1024, 13312, 13312] + - [193, 6300.0] + - - [1024, 256, 1, 15360, 1024, 1024, 15360, 15360] + - [206, 5804.0] + - - [1024, 256, 1, 16384, 1024, 1024, 16384, 16384] + - [206, 5593.0] + - - [1024, 200, 1, 16384, 1024, 1024, 16384, 16384] + - [199, 5576.0] + - - [1024, 256, 1, 12288, 1024, 1024, 12288, 12288] + - [193, 7881.0] + - - [1024, 200, 1, 12288, 1024, 1024, 12288, 12288] + - [195, 6293.0] + - - [1024, 200, 1, 15360, 1024, 1024, 15360, 15360] + - [195, 5194.0] + - - [1024, 256, 1, 9216, 1024, 1024, 9216, 9216] + - [193, 7779.0] + - - [1024, 200, 1, 14336, 1024, 1024, 14336, 14336] + - [198, 5220.0] + - - [1024, 256, 1, 16640, 1024, 1024, 16640, 16640] + - [187, 6267.0] + - - [1024, 200, 1, 8192, 1024, 1024, 8192, 8192] + - [193, 6003.0] + - - [1024, 200, 1, 10240, 1024, 1024, 10240, 10240] + - [193, 6162.0] + - - [1024, 200, 1, 9216, 1024, 1024, 9216, 9216] + - [193, 6130.0] + - - [1024, 256, 1, 11264, 1024, 1024, 11264, 11264] + - [193, 7972.0] + - - [1024, 200, 1, 8320, 1024, 1024, 8320, 8320] + - [209, 6043.0] + - - [1024, 256, 1, 8320, 1024, 1024, 8320, 8320] + - [190, 7783.0] + - - [1024, 200, 1, 16640, 1024, 1024, 16640, 16640] + - [191, 5584.0] + - - [1024, 256, 1, 14336, 1024, 1024, 14336, 14336] + - [215, 6068.0] + - - [1024, 256, 1, 13312, 1024, 1024, 13312, 13312] + - [193, 8032.0] + - - [1024, 200, 1, 11264, 1024, 1024, 11264, 11264] + - [193, 6281.0] + - - [1024, 256, 1, 8192, 1024, 1024, 8192, 8192] + - [195, 7444.0] + - - [1024, 256, 1, 10240, 1024, 1024, 10240, 10240] + - [193, 7838.0] + - - [96, 64, 64, 18432, 96, 96, 18432, 18432] + - [194, 2841.0] + - - [96, 64, 36, 10368, 96, 96, 10368, 10368] + - [178, 5127.0] + - - [96, 64, 36, 20736, 96, 96, 20736, 20736] + - [184, 4615.0] + - - [96, 96, 36, 10368, 96, 96, 10368, 10368] + - [203, 5125.0] + - - [96, 64, 49, 28800, 96, 96, 28800, 28800] + - [205, 3456.0] + - - [96, 64, 36, 41472, 96, 96, 41472, 41472] + - [103, 3212.0] + - - [64, 64, 11, 233600, 64, 64, 233600, 233600] + - [211, 3131.0] + - - [64, 64, 11, 116800, 64, 64, 116800, 116800] + - [189, 3248.0] + - - [64, 64, 9, 172864, 64, 64, 172864, 172864] + - [202, 3970.0] + - - [64, 64, 11, 58400, 64, 64, 58400, 58400] + - [202, 4757.0] + - - [192, 160, 9, 19584, 192, 192, 19584, 19584] + - [204, 4956.0] + - - [128, 128, 9, 9792, 128, 128, 9792, 9792] + - [188, 8168.0] + - - [192, 160, 11, 13056, 192, 192, 13056, 13056] + - [200, 5152.0] + - - [64, 64, 9, 86432, 64, 64, 86432, 86432] + - [202, 5244.0] + - - [128, 128, 9, 19584, 128, 128, 19584, 19584] + - [196, 7639.0] + - - [160, 160, 11, 13056, 160, 160, 13056, 13056] + - [112, 4426.0] + - - [160, 160, 9, 19584, 160, 160, 19584, 19584] + - [163, 4168.0] + - - [192, 128, 9, 19584, 192, 192, 19584, 19584] + - [215, 6751.0] + - - [192, 160, 9, 9792, 192, 192, 9792, 9792] + - [208, 6229.0] + - - [64, 64, 9, 345728, 64, 64, 345728, 345728] + - [197, 3436.0] + - - [128, 128, 11, 13056, 128, 128, 13056, 13056] + - [206, 6214.0] + - - [160, 160, 9, 9792, 160, 160, 9792, 9792] + - [208, 5141.0] + - - [192, 128, 11, 13056, 192, 192, 13056, 13056] + - [214, 5981.0] + - - [192, 128, 9, 9792, 192, 192, 9792, 9792] + - [208, 7247.0] + - - [128, 64, 25, 43320, 128, 128, 43320, 43320] + - [178, 4709.0] + - - [64, 64, 64, 20280, 64, 64, 20280, 20280] + - [96, 4435.0] + - - [64, 64, 49, 27000, 64, 64, 27000, 27000] + - [111, 3899.0] + - - [64, 64, 36, 43320, 64, 64, 43320, 43320] + - [181, 3879.0] + - - [64, 64, 36, 50176, 64, 64, 50176, 50176] + - [182, 2903.0] + - - [64, 64, 49, 36864, 64, 64, 36864, 36864] + - [212, 2981.0] + - - [64, 64, 64, 25600, 64, 64, 25600, 25600] + - [192, 2715.0] + - - [256, 256, 1, 60800, 256, 256, 60800, 60800] + - [216, 6369.0] + - - [256, 256, 1, 54400, 256, 256, 54400, 54400] + - [206, 6350.0] + - - [256, 256, 1, 51520, 256, 256, 51520, 51520] + - [215, 7383.0] + - - [256, 256, 1, 55296, 256, 256, 55296, 55296] + - [216, 6411.0] + - - [256, 256, 1, 56832, 256, 256, 56832, 56832] + - [216, 6419.0] + - - [256, 256, 1, 45632, 256, 256, 45632, 45632] + - [199, 7457.0] + - - [256, 256, 1, 49152, 256, 256, 49152, 49152] + - [206, 6354.0] + - - [256, 512, 1, 13600, 256, 256, 13600, 13600] + - [207, 7980.0] + - - [256, 256, 1, 43008, 256, 256, 43008, 43008] + - [213, 6798.0] + - - [256, 512, 1, 15200, 256, 256, 15200, 15200] + - [201, 8080.0] + - - [256, 512, 1, 12880, 256, 256, 12880, 12880] + - [188, 8054.0] + - - [256, 512, 1, 13824, 256, 256, 13824, 13824] + - [210, 7600.0] + - - [512, 256, 1, 13824, 512, 512, 13824, 13824] + - [215, 7723.0] + - - [256, 512, 1, 14208, 256, 256, 14208, 14208] + - [215, 7599.0] + - - [512, 256, 1, 14208, 512, 512, 14208, 14208] + - [199, 7736.0] + - - [512, 256, 1, 15200, 512, 512, 15200, 15200] + - [201, 8308.0] + - - [256, 512, 1, 12288, 256, 256, 12288, 12288] + - [191, 7410.0] + - - [512, 256, 1, 12288, 512, 512, 12288, 12288] + - [203, 7607.0] + - - [1024, 200, 1, 560, 1024, 1024, 560, 560] + - [262, 2998.0] + - - [768, 320, 1, 768, 768, 768, 768, 768] + - [263, 4170.0] + - - [1024, 120, 1, 1024, 1024, 1024, 1024, 1024] + - [263, 3115.0] + - - [1024, 128, 1, 128, 1024, 1024, 128, 128] + - [248, 2680.0] + - - [2368, 64, 1, 3328, 2368, 2368, 3328, 3328] + - [248, 3915.0] + - - [1408, 64, 1, 1280, 1408, 1408, 1280, 1280] + - [230, 2755.0] + - - [4096, 32, 1, 4096, 4096, 4096, 4096, 4096] + - [230, 3153.0] + - - [3072, 64, 1, 1024, 3072, 3072, 1024, 1024] + - [230, 3377.0] + - - [2944, 64, 1, 256, 2944, 2944, 256, 256] + - [262, 3041.0] + - - [6144, 32, 1, 2560, 6144, 6144, 2560, 2560] + - [230, 3467.0] + - - [1856, 64, 1, 1280, 1856, 1856, 1280, 1280] + - [262, 3333.0] + - - [704, 128, 1, 1280, 704, 704, 1280, 1280] + - [230, 2643.0] + - - [4288, 64, 1, 3328, 4288, 4288, 3328, 3328] + - [232, 3627.0] + - - [64, 3584, 1, 3328, 64, 64, 3328, 3328] + - [263, 3987.0] + - - [704, 256, 1, 128, 704, 704, 128, 128] + - [255, 2759.0] + - - [128, 1408, 1, 128, 128, 128, 128, 128] + - [220, 2773.0] + - - [448, 448, 1, 256, 448, 448, 256, 256] + - [232, 3268.0] + - - [7680, 32, 1, 2560, 7680, 7680, 2560, 2560] + - [230, 3539.0] + - - [128, 1024, 1, 3328, 128, 128, 3328, 3328] + - [248, 3404.0] + - - [64, 1856, 1, 1280, 64, 64, 1280, 1280] + - [230, 3414.0] + - - [256, 1024, 1, 256, 256, 256, 256, 256] + - [263, 3267.0] + - - [1024, 128, 1, 1280, 1024, 1024, 1280, 1280] + - [263, 3331.0] + - - [3072, 32, 1, 1024, 3072, 3072, 1024, 1024] + - [262, 2804.0] + - - [448, 256, 1, 3328, 448, 448, 3328, 3328] + - [262, 3388.0] + - - [128, 1024, 1, 128, 128, 128, 128, 128] + - [255, 2724.0] + - - [448, 448, 1, 3328, 448, 448, 3328, 3328] + - [248, 3524.0] + - - [128, 704, 1, 1280, 128, 128, 1280, 1280] + - [262, 2669.0] + - - [1856, 128, 1, 3328, 1856, 1856, 3328, 3328] + - [232, 4124.0] + - - [35, 8457, 1, 1760, 35, 35, 1760, 1760] + - [241, 2271.0] + - - [64, 2944, 1, 128, 64, 64, 128, 128] + - [262, 2913.0] + - - [8448, 32, 1, 2816, 8448, 8448, 2816, 2816] + - [230, 3361.0] + - - [1408, 128, 1, 1280, 1408, 1408, 1280, 1280] + - [255, 3315.0] + - - [128, 1856, 1, 1280, 128, 128, 1280, 1280] + - [232, 4085.0] + - - [2560, 64, 1, 2560, 2560, 2560, 2560, 2560] + - [263, 4193.0] + - - [256, 448, 1, 256, 256, 256, 256, 256] + - [262, 2978.0] + - - [128, 1856, 1, 128, 128, 128, 128, 128] + - [241, 3553.0] + - - [2560, 32, 1, 2560, 2560, 2560, 2560, 2560] + - [262, 3339.0] + - - [128, 1408, 1, 256, 128, 128, 256, 256] + - [262, 3004.0] + - - [35, 8457, 1, 2560, 35, 35, 2560, 2560] + - [248, 2238.0] + - - [4288, 64, 1, 128, 4288, 4288, 128, 128] + - [232, 3153.0] + - - [256, 448, 1, 3328, 256, 256, 3328, 3328] + - [230, 3335.0] + - - [64, 2368, 1, 1280, 64, 64, 1280, 1280] + - [232, 3814.0] + - - [2368, 64, 1, 256, 2368, 2368, 256, 256] + - [263, 3489.0] + - - [704, 128, 1, 3328, 704, 704, 3328, 3328] + - [230, 2667.0] + - - [4288, 64, 1, 1280, 4288, 4288, 1280, 1280] + - [248, 3599.0] + - - [1408, 128, 1, 128, 1408, 1408, 128, 128] + - [262, 2759.0] + - - [128, 1024, 1, 1280, 128, 128, 1280, 1280] + - [232, 3324.0] + - - [2944, 64, 1, 128, 2944, 2944, 128, 128] + - [253, 2811.0] + - - [1024, 128, 1, 3328, 1024, 1024, 3328, 3328] + - [248, 3416.0] + - - [704, 128, 1, 256, 704, 704, 256, 256] + - [262, 2298.0] + - - [448, 256, 1, 1280, 448, 448, 1280, 1280] + - [263, 2967.0] + - - [1856, 128, 1, 1280, 1856, 1856, 1280, 1280] + - [263, 4053.0] + - - [64, 3584, 1, 256, 64, 64, 256, 256] + - [232, 3620.0] + - - [3584, 64, 1, 128, 3584, 3584, 128, 128] + - [255, 3336.0] + - - [256, 1024, 1, 1280, 256, 256, 1280, 1280] + - [232, 3461.0] + - - [3584, 64, 1, 1280, 3584, 3584, 1280, 1280] + - [248, 3924.0] + - - [64, 4288, 1, 3328, 64, 64, 3328, 3328] + - [232, 3635.0] + - - [64, 1856, 1, 256, 64, 64, 256, 256] + - [262, 2842.0] + - - [35, 8457, 1, 2048, 35, 35, 2048, 2048] + - [263, 2262.0] + - - [256, 704, 1, 256, 256, 256, 256, 256] + - [247, 2898.0] + - - [2368, 64, 1, 128, 2368, 2368, 128, 128] + - [232, 2811.0] + - - [256, 1024, 1, 128, 256, 256, 128, 128] + - [255, 3160.0] + - - [704, 256, 1, 3328, 704, 704, 3328, 3328] + - [247, 3213.0] + - - [35, 8457, 1, 4096, 35, 35, 4096, 4096] + - [248, 2202.0] + - - [64, 2944, 1, 256, 64, 64, 256, 256] + - [262, 3018.0] + - - [448, 256, 1, 128, 448, 448, 128, 128] + - [220, 2575.0] + - - [64, 1408, 1, 1280, 64, 64, 1280, 1280] + - [262, 2619.0] + - - [1408, 128, 1, 256, 1408, 1408, 256, 256] + - [262, 2887.0] + - - [64, 2944, 1, 1280, 64, 64, 1280, 1280] + - [262, 3321.0] + - - [128, 704, 1, 128, 128, 128, 128, 128] + - [262, 1935.0] + - - [64, 1408, 1, 3328, 64, 64, 3328, 3328] + - [230, 2694.0] + - - [256, 448, 1, 1280, 256, 256, 1280, 1280] + - [262, 3317.0] + - - [704, 256, 1, 1280, 704, 704, 1280, 1280] + - [262, 3187.0] + - - [64, 2368, 1, 3328, 64, 64, 3328, 3328] + - [263, 3921.0] + - - [1856, 64, 1, 128, 1856, 1856, 128, 128] + - [230, 2325.0] + - - [4096, 64, 1, 4096, 4096, 4096, 4096, 4096] + - [263, 3459.0] + - - [1760, 128, 1, 1760, 1760, 1760, 1760, 1760] + - [255, 3933.0] + - - [704, 128, 1, 128, 704, 704, 128, 128] + - [262, 1814.0] + - - [256, 704, 1, 3328, 256, 256, 3328, 3328] + - [222, 3383.0] + - - [256, 448, 1, 128, 256, 256, 128, 128] + - [262, 2315.0] + - - [64, 3584, 1, 128, 64, 64, 128, 128] + - [232, 3375.0] + - - [64, 2944, 1, 3328, 64, 64, 3328, 3328] + - [247, 3390.0] + - - [1024, 128, 1, 256, 1024, 1024, 256, 256] + - [263, 3039.0] + - - [2944, 64, 1, 1280, 2944, 2944, 1280, 1280] + - [247, 3288.0] + - - [128, 1408, 1, 3328, 128, 128, 3328, 3328] + - [230, 3237.0] + - - [1408, 64, 1, 256, 1408, 1408, 256, 256] + - [262, 2340.0] + - - [64, 1856, 1, 128, 64, 64, 128, 128] + - [262, 2677.0] + - - [64, 2368, 1, 256, 64, 64, 256, 256] + - [232, 3508.0] + - - [1856, 128, 1, 128, 1856, 1856, 128, 128] + - [255, 3569.0] + - - [2368, 64, 1, 1280, 2368, 2368, 1280, 1280] + - [232, 3872.0] + - - [4288, 64, 1, 256, 4288, 4288, 256, 256] + - [263, 3424.0] + - - [64, 4288, 1, 1280, 64, 64, 1280, 1280] + - [248, 3596.0] + - - [1408, 64, 1, 3328, 1408, 1408, 3328, 3328] + - [262, 2679.0] + - - [64, 1408, 1, 128, 64, 64, 128, 128] + - [262, 2067.0] + - - [256, 704, 1, 128, 256, 256, 128, 128] + - [220, 2779.0] + - - [1408, 64, 1, 128, 1408, 1408, 128, 128] + - [262, 2038.0] + - - [448, 448, 1, 1280, 448, 448, 1280, 1280] + - [262, 3507.0] + - - [128, 1024, 1, 256, 128, 128, 256, 256] + - [232, 3062.0] + - - [3584, 64, 1, 3328, 3584, 3584, 3328, 3328] + - [232, 3946.0] + - - [256, 1024, 1, 3328, 256, 256, 3328, 3328] + - [232, 3634.0] + - - [1856, 64, 1, 3328, 1856, 1856, 3328, 3328] + - [262, 3460.0] + - - [448, 256, 1, 256, 448, 448, 256, 256] + - [262, 2948.0] + - - [4608, 32, 1, 1536, 4608, 4608, 1536, 1536] + - [230, 3230.0] + - - [128, 704, 1, 256, 128, 128, 256, 256] + - [262, 2368.0] + - - [64, 3584, 1, 1280, 64, 64, 1280, 1280] + - [232, 3941.0] + - - [3584, 64, 1, 256, 3584, 3584, 256, 256] + - [263, 3656.0] + - - [64, 1856, 1, 3328, 64, 64, 3328, 3328] + - [262, 3525.0] + - - [2048, 128, 1, 2048, 2048, 2048, 2048, 2048] + - [232, 3592.0] + - - [1408, 128, 1, 3328, 1408, 1408, 3328, 3328] + - [262, 3214.0] + - - [128, 704, 1, 3328, 128, 128, 3328, 3328] + - [230, 2717.0] + - - [128, 1856, 1, 256, 128, 128, 256, 256] + - [232, 3835.0] + - - [64, 4288, 1, 256, 64, 64, 256, 256] + - [248, 3417.0] + - - [1856, 64, 1, 256, 1856, 1856, 256, 256] + - [262, 3023.0] + - - [256, 704, 1, 1280, 256, 256, 1280, 1280] + - [263, 3399.0] + - - [64, 2368, 1, 128, 64, 64, 128, 128] + - [232, 3149.0] + - - [64, 4288, 1, 128, 64, 64, 128, 128] + - [232, 3235.0] + - - [1856, 128, 1, 256, 1856, 1856, 256, 256] + - [263, 3854.0] + - - [2048, 64, 1, 2048, 2048, 2048, 2048, 2048] + - [263, 3391.0] + - - [64, 1408, 1, 256, 64, 64, 256, 256] + - [262, 2344.0] + - - [2944, 64, 1, 3328, 2944, 2944, 3328, 3328] + - [247, 3342.0] + - - [128, 1408, 1, 1280, 128, 128, 1280, 1280] + - [247, 3188.0] + - - [128, 1856, 1, 3328, 128, 128, 3328, 3328] + - [232, 4136.0] + - - [1760, 64, 1, 1760, 1760, 1760, 1760, 1760] + - [240, 3298.0] + - - [448, 448, 1, 128, 448, 448, 128, 128] + - [240, 3073.0] + - - [704, 256, 1, 256, 704, 704, 256, 256] + - [232, 2954.0] + - - [256, 1024, 1, 196, 256, 256, 196, 196] + - [255, 3350.0] + - - [1024, 256, 1, 1536, 1024, 1024, 1536, 1536] + - [263, 3478.0] + - - [1024, 200, 1, 1408, 1024, 1024, 1408, 1408] + - [240, 3075.0] + - - [1024, 200, 1, 6144, 1024, 1024, 6144, 6144] + - [230, 3072.0] + - - [1024, 256, 1, 3328, 1024, 1024, 3328, 3328] + - [263, 3492.0] + - - [512, 256, 1, 3200, 512, 512, 3200, 3200] + - [241, 3463.0] + - - [1024, 200, 1, 4608, 1024, 1024, 4608, 4608] + - [247, 3084.0] + - - [512, 256, 1, 1792, 512, 512, 1792, 1792] + - [232, 3387.0] + - - [1024, 200, 1, 1792, 1024, 1024, 1792, 1792] + - [247, 3060.0] + - - [512, 200, 1, 2816, 512, 512, 2816, 2816] + - [230, 3009.0] + - - [512, 200, 1, 3072, 512, 512, 3072, 3072] + - [247, 3046.0] + - - [1024, 200, 1, 128, 1024, 1024, 128, 128] + - [240, 2653.0] + - - [1024, 200, 1, 5120, 1024, 1024, 5120, 5120] + - [247, 3088.0] + - - [1024, 256, 1, 256, 1024, 1024, 256, 256] + - [232, 3484.0] + - - [512, 256, 1, 2560, 512, 512, 2560, 2560] + - [248, 3423.0] + - - [1024, 256, 1, 4160, 1024, 1024, 4160, 4160] + - [255, 3660.0] + - - [1024, 200, 1, 512, 1024, 1024, 512, 512] + - [247, 2952.0] + - - [512, 512, 1, 1536, 512, 512, 1536, 1536] + - [248, 3487.0] + - - [1024, 256, 1, 896, 1024, 1024, 896, 896] + - [248, 3590.0] + - - [1024, 200, 1, 3200, 1024, 1024, 3200, 3200] + - [240, 3098.0] + - - [1024, 200, 1, 1536, 1024, 1024, 1536, 1536] + - [247, 3055.0] + - - [1024, 256, 1, 1024, 1024, 1024, 1024, 1024] + - [248, 3594.0] + - - [128, 1024, 1, 512, 128, 128, 512, 512] + - [232, 3267.0] + - - [1024, 256, 1, 5120, 1024, 1024, 5120, 5120] + - [248, 3713.0] + - - [1024, 200, 1, 2304, 1024, 1024, 2304, 2304] + - [247, 2981.0] + - - [1024, 256, 1, 1664, 1024, 1024, 1664, 1664] + - [255, 3636.0] + - - [512, 512, 1, 1024, 512, 512, 1024, 1024] + - [248, 3593.0] + - - [1024, 256, 1, 2080, 1024, 1024, 2080, 2080] + - [263, 3728.0] + - - [512, 200, 1, 768, 512, 512, 768, 768] + - [247, 2932.0] + - - [1024, 256, 1, 2816, 1024, 1024, 2816, 2816] + - [232, 3711.0] + - - [1024, 200, 1, 64, 1024, 1024, 64, 64] + - [262, 2409.0] + - - [512, 512, 1, 2304, 512, 512, 2304, 2304] + - [232, 3611.0] + - - [128, 1024, 1, 2048, 128, 128, 2048, 2048] + - [232, 3423.0] + - - [512, 200, 1, 2560, 512, 512, 2560, 2560] + - [262, 3015.0] + - - [512, 256, 1, 1024, 512, 512, 1024, 1024] + - [248, 3369.0] + - - [1024, 256, 1, 1920, 1024, 1024, 1920, 1920] + - [241, 3629.0] + - - [512, 200, 1, 2304, 512, 512, 2304, 2304] + - [247, 2998.0] + - - [1024, 256, 1, 384, 1024, 1024, 384, 384] + - [241, 3495.0] + - - [1024, 256, 1, 32, 1024, 1024, 32, 32] + - [262, 2255.0] + - - [1024, 200, 1, 2816, 1024, 1024, 2816, 2816] + - [230, 3045.0] + - - [1024, 200, 1, 3072, 1024, 1024, 3072, 3072] + - [247, 3087.0] + - - [512, 256, 1, 1536, 512, 512, 1536, 1536] + - [232, 3385.0] + - - [1024, 256, 1, 512, 1024, 1024, 512, 512] + - [232, 3608.0] + - - [256, 512, 1, 512, 256, 256, 512, 512] + - [232, 3258.0] + - - [1024, 200, 1, 3840, 1024, 1024, 3840, 3840] + - [230, 2991.0] + - - [256, 1024, 1, 512, 256, 256, 512, 512] + - [232, 3398.0] + - - [1024, 256, 1, 1152, 1024, 1024, 1152, 1152] + - [222, 3696.0] + - - [512, 512, 1, 2816, 512, 512, 2816, 2816] + - [248, 3494.0] + - - [512, 200, 1, 1280, 512, 512, 1280, 1280] + - [247, 2917.0] + - - [512, 200, 1, 3200, 512, 512, 3200, 3200] + - [240, 3072.0] + - - [1024, 256, 1, 2304, 1024, 1024, 2304, 2304] + - [263, 3498.0] + - - [1024, 256, 1, 6144, 1024, 1024, 6144, 6144] + - [232, 3608.0] + - - [1024, 200, 1, 2560, 1024, 1024, 2560, 2560] + - [247, 3074.0] + - - [1024, 256, 1, 5632, 1024, 1024, 5632, 5632] + - [232, 3615.0] + - - [512, 256, 1, 768, 512, 512, 768, 768] + - [263, 3290.0] + - - [1024, 256, 1, 3072, 1024, 1024, 3072, 3072] + - [263, 3624.0] + - - [256, 512, 1, 2048, 256, 256, 2048, 2048] + - [232, 3427.0] + - - [1024, 200, 1, 1152, 1024, 1024, 1152, 1152] + - [240, 3070.0] + - - [512, 512, 1, 3072, 512, 512, 3072, 3072] + - [263, 3617.0] + - - [1024, 200, 1, 1664, 1024, 1024, 1664, 1664] + - [253, 3082.0] + - - [1024, 200, 1, 32, 1024, 1024, 32, 32] + - [222, 1655.0] + - - [1024, 200, 1, 384, 1024, 1024, 384, 384] + - [253, 2970.0] + - - [512, 256, 1, 2304, 512, 512, 2304, 2304] + - [248, 3401.0] + - - [256, 512, 1, 1024, 256, 256, 1024, 1024] + - [232, 3366.0] + - - [1024, 200, 1, 3328, 1024, 1024, 3328, 3328] + - [262, 3044.0] + - - [1024, 200, 1, 2080, 1024, 1024, 2080, 2080] + - [253, 3091.0] + - - [512, 200, 1, 1792, 512, 512, 1792, 1792] + - [247, 2995.0] + - - [1024, 256, 1, 1792, 1024, 1024, 1792, 1792] + - [255, 3526.0] + - - [1024, 200, 1, 7168, 1024, 1024, 7168, 7168] + - [247, 3064.0] + - - [512, 256, 1, 3072, 512, 512, 3072, 3072] + - [263, 3420.0] + - - [1024, 200, 1, 2048, 1024, 1024, 2048, 2048] + - [247, 3067.0] + - - [512, 512, 1, 1280, 512, 512, 1280, 1280] + - [255, 3521.0] + - - [1024, 200, 1, 1280, 1024, 1024, 1280, 1280] + - [247, 3032.0] + - - [512, 200, 1, 512, 512, 512, 512, 512] + - [262, 2868.0] + - - [1024, 256, 1, 2560, 1024, 1024, 2560, 2560] + - [248, 3496.0] + - - [1024, 200, 1, 1024, 1024, 1024, 1024, 1024] + - [247, 3011.0] + - - [1024, 256, 1, 3200, 1024, 1024, 3200, 3200] + - [263, 3634.0] + - - [512, 512, 1, 2560, 512, 512, 2560, 2560] + - [232, 3464.0] + - - [1024, 256, 1, 640, 1024, 1024, 640, 640] + - [255, 3570.0] + - - [1024, 256, 1, 3584, 1024, 1024, 3584, 3584] + - [263, 3639.0] + - - [512, 512, 1, 3200, 512, 512, 3200, 3200] + - [222, 3714.0] + - - [1024, 256, 1, 7680, 1024, 1024, 7680, 7680] + - [241, 3631.0] + - - [512, 200, 1, 1536, 512, 512, 1536, 1536] + - [262, 2957.0] + - - [512, 256, 1, 2816, 512, 512, 2816, 2816] + - [255, 3395.0] + - - [1024, 200, 1, 768, 1024, 1024, 768, 768] + - [247, 3026.0] + - - [512, 200, 1, 2048, 512, 512, 2048, 2048] + - [262, 3011.0] + - - [1024, 256, 1, 128, 1024, 1024, 128, 128] + - [248, 3113.0] + - - [1024, 200, 1, 4096, 1024, 1024, 4096, 4096] + - [262, 3094.0] + - - [1024, 256, 1, 1280, 1024, 1024, 1280, 1280] + - [263, 3614.0] + - - [1024, 200, 1, 896, 1024, 1024, 896, 896] + - [247, 3049.0] + - - [1024, 256, 1, 4608, 1024, 1024, 4608, 4608] + - [241, 3509.0] + - - [128, 1024, 1, 1024, 128, 128, 1024, 1024] + - [248, 3349.0] + - - [1024, 256, 1, 2048, 1024, 1024, 2048, 2048] + - [263, 3497.0] + - - [512, 256, 1, 1280, 512, 512, 1280, 1280] + - [232, 3349.0] + - - [256, 1024, 1, 2048, 256, 256, 2048, 2048] + - [232, 3628.0] + - - [512, 512, 1, 2048, 512, 512, 2048, 2048] + - [263, 3596.0] + - - [512, 256, 1, 512, 512, 512, 512, 512] + - [263, 3245.0] + - - [1024, 200, 1, 7680, 1024, 1024, 7680, 7680] + - [262, 3025.0] + - - [1024, 200, 1, 6656, 1024, 1024, 6656, 6656] + - [247, 3087.0] + - - [512, 200, 1, 1024, 512, 512, 1024, 1024] + - [247, 2967.0] + - - [1024, 256, 1, 3840, 1024, 1024, 3840, 3840] + - [241, 3606.0] + - - [512, 512, 1, 768, 512, 512, 768, 768] + - [248, 3568.0] + - - [1024, 256, 1, 64, 1024, 1024, 64, 64] + - [222, 2913.0] + - - [1024, 200, 1, 1920, 1024, 1024, 1920, 1920] + - [240, 3088.0] + - - [1024, 256, 1, 7168, 1024, 1024, 7168, 7168] + - [248, 3630.0] + - - [512, 512, 1, 1792, 512, 512, 1792, 1792] + - [248, 3617.0] + - - [1024, 200, 1, 256, 1024, 1024, 256, 256] + - [247, 2871.0] + - - [256, 1024, 1, 1024, 256, 256, 1024, 1024] + - [263, 3673.0] + - - [1024, 200, 1, 640, 1024, 1024, 640, 640] + - [262, 3030.0] + - - [1024, 200, 1, 4160, 1024, 1024, 4160, 4160] + - [240, 3114.0] + - - [1024, 200, 1, 5632, 1024, 1024, 5632, 5632] + - [247, 3097.0] + - - [1024, 256, 1, 6656, 1024, 1024, 6656, 6656] + - [248, 3725.0] + - - [1024, 256, 1, 768, 1024, 1024, 768, 768] + - [263, 3572.0] + - - [512, 256, 1, 2048, 512, 512, 2048, 2048] + - [232, 3388.0] + - - [1024, 200, 1, 3584, 1024, 1024, 3584, 3584] + - [247, 3091.0] + - - [1024, 256, 1, 1408, 1024, 1024, 1408, 1408] + - [255, 3630.0] + - - [1024, 256, 1, 4096, 1024, 1024, 4096, 4096] + - [263, 3715.0] + - - [1024, 128, 1, 289, 1024, 1024, 289, 289] + - [255, 3065.0] + - - [768, 192, 1, 289, 768, 768, 289, 289] + - [241, 3426.0] + - - [32, 32, 1984, 64, 32, 32, 64, 64] + - [257, 2380.0] + - - [54, 54, 1184, 64, 54, 54, 64, 64] + - [222, 3041.0] + - - [35, 35, 1808, 64, 35, 35, 64, 64] + - [255, 1307.0] + - - [45, 45, 1424, 64, 45, 45, 64, 64] + - [232, 2134.0] + - - [49, 49, 1296, 64, 49, 49, 64, 64] + - [255, 2532.0] + - - [59, 59, 1088, 64, 59, 59, 64, 64] + - [222, 3618.0] + - - [41, 41, 1552, 64, 41, 41, 64, 64] + - [255, 1787.0] + - - [38, 38, 1680, 64, 38, 38, 64, 64] + - [222, 1542.0] + - - [2048, 128, 1, 4096, 2048, 2048, 4096, 4096] + - [248, 3712.0] + - - [1024, 128, 1, 1024, 1024, 1024, 1024, 1024] + - [232, 3285.0] + - - [1152, 128, 1, 784, 1152, 1152, 784, 784] + - [255, 3727.0] + - - [864, 96, 1, 1225, 864, 864, 1225, 1225] + - [245, 2957.0] + - - [896, 192, 1, 289, 896, 896, 289, 289] + - [241, 3062.0] + - - [768, 128, 1, 289, 768, 768, 289, 289] + - [220, 2569.0] + - - [1344, 192, 1, 289, 1344, 1344, 289, 289] + - [230, 3365.0] + - - [384, 192, 1, 1225, 384, 384, 1225, 1225] + - [232, 3051.0] + - - [832, 192, 1, 49, 832, 832, 49, 49] + - [255, 2302.0] + - - [1280, 192, 1, 64, 1280, 1280, 64, 64] + - [255, 3171.0] + - - [512, 256, 1, 196, 512, 512, 196, 196] + - [222, 2919.0] + - - [864, 96, 1, 289, 864, 864, 289, 289] + - [238, 2663.0] + - - [896, 128, 1, 289, 896, 896, 289, 289] + - [240, 3013.0] + - - [1200, 64, 1, 1225, 1200, 1200, 1225, 1225] + - [240, 3185.0] + - - [1024, 256, 1, 289, 1024, 1024, 289, 289] + - [255, 3453.0] + - - [1024, 256, 1, 196, 1024, 1024, 196, 196] + - [241, 3398.0] + - - [1120, 192, 1, 289, 1120, 1120, 289, 289] + - [255, 3539.0] + - - [800, 96, 1, 784, 800, 800, 784, 784] + - [220, 3032.0] + - - [864, 128, 1, 784, 864, 864, 784, 784] + - [253, 3162.0] + - - [1344, 224, 1, 289, 1344, 1344, 289, 289] + - [253, 3261.0] + - - [1152, 192, 1, 784, 1152, 1152, 784, 784] + - [222, 3794.0] + - - [800, 128, 1, 196, 800, 800, 196, 196] + - [240, 2509.0] + - - [864, 208, 1, 196, 864, 864, 196, 196] + - [240, 2873.0] + - - [720, 192, 1, 5041, 720, 720, 5041, 5041] + - [241, 3600.0] + - - [576, 192, 1, 3136, 576, 576, 3136, 3136] + - [253, 3304.0] + - - [832, 256, 1, 49, 832, 832, 49, 49] + - [255, 2509.0] + - - [1200, 128, 1, 49, 1200, 1200, 49, 49] + - [232, 2091.0] + - - [528, 256, 1, 196, 528, 528, 196, 196] + - [222, 3024.0] + - - [256, 512, 1, 784, 256, 256, 784, 784] + - [255, 3367.0] + - - [480, 192, 1, 196, 480, 480, 196, 196] + - [252, 2287.0] + - - [96, 64, 36, 2592, 96, 96, 2592, 2592] + - [252, 3075.0] + - - [96, 96, 36, 2592, 96, 96, 2592, 2592] + - [252, 3067.0] + - - [1024, 192, 1, 289, 1024, 1024, 289, 289] + - [220, 3225.0] + - - [528, 160, 1, 196, 528, 528, 196, 196] + - [252, 2075.0] + - - [512, 160, 1, 196, 512, 512, 196, 196] + - [240, 2533.0] + - - [768, 160, 1, 289, 768, 768, 289, 289] + - [240, 3194.0] + - - [64, 32, 36, 43808, 64, 64, 43808, 43808] + - [240, 2615.0] + - - [832, 160, 1, 49, 832, 832, 49, 49] + - [240, 1510.0] + - - [2048, 64, 1, 1001, 2048, 2048, 1001, 1001] + - [222, 3293.0] + - - [2048, 128, 1, 1001, 2048, 2048, 1001, 1001] + - [248, 3603.0] + - - [1536, 64, 1, 1001, 1536, 1536, 1001, 1001] + - [240, 2867.0] + - - [96, 96, 49, 3136, 96, 96, 3136, 3136] + - [252, 3341.0] + - - [64, 32, 49, 57600, 64, 64, 57600, 57600] + - [235, 2095.0] + - - [96, 64, 49, 6272, 96, 96, 6272, 6272] + - [263, 3018.0] + - - [64, 32, 49, 115200, 64, 64, 115200, 115200] + - [265, 2025.0] + - - [96, 96, 64, 2304, 96, 96, 2304, 2304] + - [262, 2851.0] + - - [96, 96, 49, 6272, 96, 96, 6272, 6272] + - [253, 2566.0] + - - [96, 64, 36, 5184, 96, 96, 5184, 5184] + - [218, 3068.0] + - - [64, 32, 64, 40000, 64, 64, 40000, 40000] + - [247, 2530.0] + - - [96, 64, 64, 4608, 96, 96, 4608, 4608] + - [248, 2742.0] + - - [96, 96, 36, 5184, 96, 96, 5184, 5184] + - [252, 3068.0] + - - [96, 64, 64, 2304, 96, 96, 2304, 2304] + - [248, 2993.0] + - - [96, 64, 49, 3136, 96, 96, 3136, 3136] + - [222, 3194.0] + - - [64, 32, 36, 87616, 64, 64, 87616, 87616] + - [228, 2571.0] + - - [64, 32, 64, 80000, 64, 64, 80000, 80000] + - [225, 2260.0] + - - [96, 96, 64, 4608, 96, 96, 4608, 4608] + - [263, 2437.0] + - - [64, 32, 36, 175232, 64, 64, 175232, 175232] + - [250, 2186.0] + - - [128, 128, 11, 3264, 128, 128, 3264, 3264] + - [220, 3255.0] + - - [192, 128, 11, 6528, 192, 192, 6528, 6528] + - [255, 3630.0] + - - [128, 128, 11, 6528, 128, 128, 6528, 6528] + - [220, 3245.0] + - - [160, 160, 9, 4896, 160, 160, 4896, 4896] + - [218, 3089.0] + - - [192, 160, 11, 6528, 192, 192, 6528, 6528] + - [241, 3580.0] + - - [192, 128, 9, 4896, 192, 192, 4896, 4896] + - [255, 3913.0] + - - [128, 128, 9, 4896, 128, 128, 4896, 4896] + - [263, 3877.0] + - - [192, 128, 11, 3264, 192, 192, 3264, 3264] + - [255, 3638.0] + - - [160, 160, 11, 3264, 160, 160, 3264, 3264] + - [252, 3238.0] + - - [192, 160, 9, 4896, 192, 192, 4896, 4896] + - [220, 3600.0] + - - [192, 160, 11, 3264, 192, 192, 3264, 3264] + - [255, 3613.0] + - - [160, 160, 11, 6528, 160, 160, 6528, 6528] + - [252, 3157.0] + - - [4096, 64, 1, 1024, 4096, 4096, 1024, 1024] + - [263, 3420.0] + - - [49, 49, 160, 64, 49, 49, 64, 64] + - [232, 2223.0] + - - [54, 54, 592, 64, 54, 54, 64, 64] + - [232, 2984.0] + - - [59, 59, 512, 64, 59, 59, 64, 64] + - [255, 3514.0] + - - [104, 104, 16, 64, 104, 104, 64, 64] + - [255, 1923.0] + - - [32, 32, 624, 64, 32, 32, 64, 64] + - [257, 2152.0] + - - [32, 32, 992, 64, 32, 32, 64, 64] + - [224, 2275.0] + - - [35, 35, 384, 64, 35, 35, 64, 64] + - [255, 1225.0] + - - [35, 35, 904, 64, 35, 35, 64, 64] + - [255, 1281.0] + - - [38, 38, 320, 64, 38, 38, 64, 64] + - [255, 1458.0] + - - [38, 38, 840, 64, 38, 38, 64, 64] + - [255, 1522.0] + - - [41, 41, 312, 64, 41, 41, 64, 64] + - [255, 1673.0] + - - [41, 41, 776, 64, 41, 41, 64, 64] + - [255, 1758.0] + - - [45, 45, 392, 64, 45, 45, 64, 64] + - [232, 2044.0] + - - [45, 45, 712, 64, 45, 45, 64, 64] + - [255, 2102.0] + - - [49, 49, 648, 64, 49, 49, 64, 64] + - [255, 2463.0] + - - [54, 54, 200, 64, 54, 54, 64, 64] + - [255, 2765.0] + - - [59, 59, 544, 64, 59, 59, 64, 64] + - [255, 3491.0] + - - [91, 91, 40, 64, 91, 91, 64, 64] + - [252, 2488.0] + - - [91, 93, 40, 64, 91, 91, 64, 64] + - [218, 2525.0] + - - [93, 93, 40, 64, 93, 93, 64, 64] + - [252, 2593.0] + - - [102, 102, 56, 64, 102, 102, 64, 64] + - [255, 2440.0] + - - [103, 103, 16, 64, 103, 103, 64, 64] + - [255, 1854.0] + - - [103, 104, 16, 64, 103, 103, 64, 64] + - [222, 1885.0] + - - [112, 112, 16, 64, 112, 112, 64, 64] + - [222, 2127.0] + - - [112, 123, 16, 64, 112, 112, 64, 64] + - [222, 2375.0] + - - [119, 119, 32, 64, 119, 119, 64, 64] + - [222, 2996.0] + - - [119, 135, 32, 64, 119, 119, 64, 64] + - [240, 2706.0] + - - [123, 123, 16, 64, 123, 123, 64, 64] + - [222, 2540.0] + - - [512, 512, 1, 512, 512, 512, 512, 512] + - [232, 3405.0] + - - [513, 512, 1, 512, 513, 513, 512, 512] + - [232, 3398.0] + - - [512, 512, 1, 513, 512, 512, 513, 513] + - [255, 3406.0] + - - [512, 512, 1, 511, 512, 512, 511, 511] + - [255, 3422.0] + - - [512, 513, 1, 512, 512, 512, 512, 512] + - [232, 3393.0] + - - [512, 511, 1, 512, 512, 512, 512, 512] + - [248, 3400.0] + - - [511, 512, 1, 512, 511, 511, 512, 512] + - [232, 3396.0] + - - [479, 512, 1, 512, 479, 479, 512, 512] + - [232, 3189.0] + - - [480, 511, 1, 512, 480, 480, 512, 512] + - [248, 3296.0] + - - [480, 512, 1, 511, 480, 480, 511, 511] + - [222, 3413.0] + - - [480, 512, 1, 513, 480, 480, 513, 513] + - [222, 3192.0] + - - [480, 513, 1, 512, 480, 480, 512, 512] + - [232, 3182.0] + - - [481, 512, 1, 512, 481, 481, 512, 512] + - [232, 3315.0] + - - [511, 480, 1, 512, 511, 511, 512, 512] + - [247, 3522.0] + - - [512, 479, 1, 512, 512, 512, 512, 512] + - [247, 3523.0] + - - [512, 480, 1, 511, 512, 512, 511, 511] + - [240, 3586.0] + - - [512, 480, 1, 513, 512, 512, 513, 513] + - [253, 3570.0] + - - [512, 481, 1, 512, 512, 512, 512, 512] + - [248, 3311.0] + - - [513, 480, 1, 512, 513, 513, 512, 512] + - [248, 3161.0] + - - [480, 512, 1, 512, 480, 480, 512, 512] + - [232, 3311.0] + - - [512, 480, 1, 512, 512, 512, 512, 512] + - [230, 3565.0] + - - [512, 512, 1, 64, 512, 512, 64, 64] + - [255, 2824.0] + - - [2048, 114, 1, 512, 2048, 2048, 512, 512] + - [248, 3136.0] + - - [2048, 114, 1, 768, 2048, 2048, 768, 768] + - [248, 3068.0] + - - [256, 684, 1, 1024, 256, 256, 1024, 1024] + - [248, 3299.0] + - - [33, 33, 1600, 32, 33, 33, 32, 32] + - [232, 1141.0] + - - [383, 384, 1, 384, 383, 383, 384, 384] + - [232, 3495.0] + - - [385, 384, 1, 384, 385, 385, 384, 384] + - [218, 2732.0] + - - [384, 383, 1, 384, 384, 384, 384, 384] + - [255, 3574.0] + - - [384, 385, 1, 384, 384, 384, 384, 384] + - [262, 3109.0] + - - [384, 384, 1, 383, 384, 384, 383, 383] + - [255, 3588.0] + - - [384, 384, 1, 385, 384, 384, 385, 385] + - [222, 3575.0] + - - [384, 384, 1, 384, 384, 384, 384, 384] + - [255, 3588.0] + - - [128, 64, 25, 6498, 128, 128, 6498, 6498] + - [230, 3508.0] + - - [128, 64, 25, 6859, 128, 128, 6859, 6859] + - [247, 3539.0] + - - [64, 64, 64, 3042, 64, 64, 3042, 3042] + - [248, 3581.0] + - - [64, 64, 64, 3211, 64, 64, 3211, 3211] + - [255, 3400.0] + - - [64, 64, 49, 4050, 64, 64, 4050, 4050] + - [240, 3434.0] + - - [64, 64, 49, 4275, 64, 64, 4275, 4275] + - [240, 3449.0] + - - [64, 64, 36, 6498, 64, 64, 6498, 6498] + - [232, 3745.0] + - - [64, 64, 36, 6859, 64, 64, 6859, 6859] + - [222, 3759.0] + - - [1152, 128, 1, 1444, 1152, 1152, 1444, 1444] + - [222, 3737.0] + - - [512, 256, 1, 361, 512, 512, 361, 361] + - [222, 3142.0] + - - [576, 128, 1, 1444, 576, 576, 1444, 1444] + - [253, 3161.0] + - - [1024, 308, 1, 1024, 1024, 1024, 1024, 1024] + - [232, 4080.0] + - - [1024, 160, 1, 1024, 1024, 1024, 1024, 1024] + - [262, 3580.0] + - - [1024, 180, 1, 1024, 1024, 1024, 1024, 1024] + - [247, 3228.0] + - - [32, 32, 4608, 64, 32, 32, 64, 64] + - [234, 2486.0] + - - [32, 35, 4608, 64, 32, 32, 64, 64] + - [252, 1974.0] + - - [34, 34, 4736, 64, 34, 34, 64, 64] + - [222, 1250.0] + - - [35, 35, 4608, 64, 35, 35, 64, 64] + - [263, 1322.0] + - - [128, 864, 1, 256, 128, 128, 256, 256] + - [262, 2681.0] + - - [256, 864, 1, 512, 256, 256, 512, 512] + - [232, 3752.0] + - - [512, 256, 1, 784, 512, 512, 784, 784] + - [255, 3369.0] + - - [1024, 96, 1, 1024, 1024, 1024, 1024, 1024] + - [245, 2834.0] + - - [1024, 256, 1, 3800, 1024, 1024, 3800, 3800] + - [241, 3639.0] + - - [1024, 256, 1, 3400, 1024, 1024, 3400, 3400] + - [241, 3510.0] + - - [256, 1024, 1, 3400, 256, 256, 3400, 3400] + - [232, 3491.0] + - - [1024, 256, 1, 3220, 1024, 1024, 3220, 3220] + - [255, 3651.0] + - - [256, 1024, 1, 3220, 256, 256, 3220, 3220] + - [222, 3504.0] + - - [1024, 256, 1, 3456, 1024, 1024, 3456, 3456] + - [255, 3639.0] + - - [256, 1024, 1, 3456, 256, 256, 3456, 3456] + - [263, 3624.0] + - - [256, 1024, 1, 3072, 256, 256, 3072, 3072] + - [232, 3495.0] + - - [1024, 256, 1, 3552, 1024, 1024, 3552, 3552] + - [222, 3645.0] + - - [256, 1024, 1, 3552, 256, 256, 3552, 3552] + - [263, 3643.0] + - - [256, 1024, 1, 2852, 256, 256, 2852, 2852] + - [241, 3622.0] + - - [1024, 256, 1, 2852, 1024, 1024, 2852, 2852] + - [241, 3518.0] + - - [256, 512, 1, 10752, 256, 256, 10752, 10752] + - [248, 3434.0] + - - [256, 1024, 1, 3800, 256, 256, 3800, 3800] + - [232, 3495.0] + - - [256, 512, 1, 10560, 256, 256, 10560, 10560] + - [255, 3489.0] + - - [256, 1024, 1, 2992, 256, 256, 2992, 2992] + - [232, 3638.0] + - - [256, 1024, 1, 2688, 256, 256, 2688, 2688] + - [232, 3632.0] + - - [1024, 256, 1, 2688, 1024, 1024, 2688, 2688] + - [255, 3515.0] + - - [256, 1024, 1, 2904, 256, 256, 2904, 2904] + - [255, 3638.0] + - - [1024, 256, 1, 2904, 1024, 1024, 2904, 2904] + - [263, 3656.0] + - - [256, 1024, 1, 2640, 256, 256, 2640, 2640] + - [263, 3633.0] + - - [1024, 256, 1, 2640, 1024, 1024, 2640, 2640] + - [255, 3653.0] + - - [1024, 256, 1, 4032, 1024, 1024, 4032, 4032] + - [248, 3663.0] + - - [1024, 256, 1, 2992, 1024, 1024, 2992, 2992] + - [263, 3645.0] + - - [256, 1024, 1, 3360, 256, 256, 3360, 3360] + - [232, 3514.0] + - - [1024, 256, 1, 3360, 1024, 1024, 3360, 3360] + - [255, 3520.0] + - - [1024, 256, 1, 3500, 1024, 1024, 3500, 3500] + - [232, 3598.0] + - - [256, 1024, 1, 3500, 256, 256, 3500, 3500] + - [255, 3624.0] + - - [1024, 256, 1, 3168, 1024, 1024, 3168, 3168] + - [255, 3519.0] + - - [256, 1024, 1, 3168, 256, 256, 3168, 3168] + - [222, 3515.0] + - - [256, 1024, 1, 3036, 256, 256, 3036, 3036] + - [241, 3494.0] + - - [1024, 256, 1, 4200, 1024, 1024, 4200, 4200] + - [255, 3513.0] + - - [1024, 256, 1, 3600, 1024, 1024, 3600, 3600] + - [248, 3655.0] + - - [256, 1024, 1, 3600, 256, 256, 3600, 3600] + - [222, 3627.0] + - - [256, 1024, 1, 2944, 256, 256, 2944, 2944] + - [255, 3639.0] + - - [1024, 256, 1, 2944, 1024, 1024, 2944, 2944] + - [241, 3500.0] + - - [1024, 256, 1, 3700, 1024, 1024, 3700, 3700] + - [263, 3510.0] + - - [256, 1024, 1, 2352, 256, 256, 2352, 2352] + - [222, 3511.0] + - - [1024, 256, 1, 2352, 1024, 1024, 2352, 2352] + - [241, 3650.0] + - - [256, 1024, 1, 3700, 256, 256, 3700, 3700] + - [255, 3622.0] + - - [256, 1024, 1, 2816, 256, 256, 2816, 2816] + - [232, 3493.0] + - - [256, 512, 1, 11408, 256, 256, 11408, 11408] + - [241, 3474.0] + - - [1024, 256, 1, 3036, 1024, 1024, 3036, 3036] + - [263, 3653.0] + - - [1024, 256, 1, 3264, 1024, 1024, 3264, 3264] + - [241, 3648.0] + - - [256, 1024, 1, 3264, 256, 256, 3264, 3264] + - [241, 3652.0] + - - [1024, 256, 1, 3864, 1024, 1024, 3864, 3864] + - [248, 3655.0] + - - [256, 1024, 1, 4032, 256, 256, 4032, 4032] + - [232, 3645.0] + - - [1024, 256, 1, 3128, 1024, 1024, 3128, 3128] + - [248, 3649.0] + - - [256, 1024, 1, 3128, 256, 256, 3128, 3128] + - [241, 3627.0] + - - [256, 1024, 1, 3200, 256, 256, 3200, 3200] + - [232, 3496.0] + - - [256, 512, 1, 11616, 256, 256, 11616, 11616] + - [222, 3484.0] + - - [1024, 256, 1, 4000, 1024, 1024, 4000, 4000] + - [232, 3644.0] + - - [256, 1024, 1, 2520, 256, 256, 2520, 2520] + - [232, 3644.0] + - - [1024, 256, 1, 2520, 1024, 1024, 2520, 2520] + - [255, 3510.0] + - - [256, 1024, 1, 2976, 256, 256, 2976, 2976] + - [232, 3671.0] + - - [256, 1024, 1, 2400, 256, 256, 2400, 2400] + - [222, 3637.0] + - - [1024, 256, 1, 2400, 1024, 1024, 2400, 2400] + - [255, 3644.0] + - - [1024, 256, 1, 3696, 1024, 1024, 3696, 3696] + - [222, 3620.0] + - - [1024, 256, 1, 3900, 1024, 1024, 3900, 3900] + - [255, 3648.0] + - - [1024, 256, 1, 3772, 1024, 1024, 3772, 3772] + - [222, 3615.0] + - - [256, 1024, 1, 3696, 256, 256, 3696, 3696] + - [232, 3515.0] + - - [256, 1024, 1, 2728, 256, 256, 2728, 2728] + - [222, 3515.0] + - - [1024, 256, 1, 2728, 1024, 1024, 2728, 2728] + - [255, 3523.0] + - - [1024, 256, 1, 2480, 1024, 1024, 2480, 2480] + - [241, 3525.0] + - - [256, 1024, 1, 2480, 256, 256, 2480, 2480] + - [232, 3641.0] + - - [1024, 256, 1, 2880, 1024, 1024, 2880, 2880] + - [263, 3657.0] + - - [512, 256, 1, 3220, 512, 512, 3220, 3220] + - [255, 3453.0] + - - [256, 1024, 1, 2880, 256, 256, 2880, 2880] + - [248, 3635.0] + - - [256, 1024, 1, 4200, 256, 256, 4200, 4200] + - [232, 3512.0] + - - [1024, 256, 1, 3648, 1024, 1024, 3648, 3648] + - [255, 3699.0] + - - [1024, 256, 1, 3312, 1024, 1024, 3312, 3312] + - [248, 3697.0] + - - [256, 1024, 1, 3648, 256, 256, 3648, 3648] + - [263, 3643.0] + - - [1024, 256, 1, 3300, 1024, 1024, 3300, 3300] + - [248, 3636.0] + - - [1024, 256, 1, 3528, 1024, 1024, 3528, 3528] + - [248, 3517.0] + - - [256, 1024, 1, 2604, 256, 256, 2604, 2604] + - [222, 3504.0] + - - [1024, 256, 1, 2604, 1024, 1024, 2604, 2604] + - [241, 3509.0] + - - [512, 256, 1, 11408, 512, 512, 11408, 11408] + - [248, 3456.0] + - - [256, 1024, 1, 3312, 256, 256, 3312, 3312] + - [263, 3641.0] + - - [256, 1024, 1, 3300, 256, 256, 3300, 3300] + - [222, 3598.0] + - - [256, 1024, 1, 3528, 256, 256, 3528, 3528] + - [222, 3507.0] + - - [1024, 256, 1, 2976, 1024, 1024, 2976, 2976] + - [222, 3630.0] + - - [1024, 256, 1, 2760, 1024, 1024, 2760, 2760] + - [241, 3632.0] + - - [512, 256, 1, 3800, 512, 512, 3800, 3800] + - [255, 3478.0] + - - [256, 1024, 1, 2760, 256, 256, 2760, 2760] + - [232, 3502.0] + - - [1024, 256, 1, 2160, 1024, 1024, 2160, 2160] + - [241, 3519.0] + - - [256, 1024, 1, 2160, 256, 256, 2160, 2160] + - [232, 3634.0] + - - [512, 256, 1, 11616, 512, 512, 11616, 11616] + - [248, 3483.0] + - - [512, 256, 1, 2852, 512, 512, 2852, 2852] + - [255, 3419.0] + - - [256, 1024, 1, 3864, 256, 256, 3864, 3864] + - [222, 3489.0] + - - [512, 256, 1, 2640, 512, 512, 2640, 2640] + - [255, 3474.0] + - - [256, 1024, 1, 4000, 256, 256, 4000, 4000] + - [222, 3529.0] + - - [512, 256, 1, 2904, 512, 512, 2904, 2904] + - [255, 3455.0] + - - [256, 1024, 1, 3900, 256, 256, 3900, 3900] + - [255, 3503.0] + - - [512, 256, 1, 2688, 512, 512, 2688, 2688] + - [255, 3451.0] + - - [256, 1024, 1, 3772, 256, 256, 3772, 3772] + - [241, 3497.0] + - - [512, 256, 1, 3400, 512, 512, 3400, 3400] + - [222, 3407.0] + - - [512, 256, 1, 3456, 512, 512, 3456, 3456] + - [255, 3465.0] + - - [512, 256, 1, 3552, 512, 512, 3552, 3552] + - [255, 3497.0] + - - [29000, 35, 1, 2560, 29000, 29000, 2560, 2560] + - [248, 2385.0] + - - [29000, 36, 1, 2560, 29000, 29000, 2560, 2560] + - [248, 2455.0] + - - [29000, 39, 1, 2560, 29000, 29000, 2560, 2560] + - [263, 2659.0] + - - [29000, 40, 1, 2560, 29000, 29000, 2560, 2560] + - [232, 2726.0] + - - [29000, 42, 1, 2560, 29000, 29000, 2560, 2560] + - [232, 2861.0] + - - [29000, 43, 1, 2560, 29000, 29000, 2560, 2560] + - [263, 2927.0] + - - [29000, 44, 1, 2560, 29000, 29000, 2560, 2560] + - [248, 2996.0] + - - [29000, 46, 1, 2560, 29000, 29000, 2560, 2560] + - [248, 3135.0] + - - [29000, 48, 1, 2560, 29000, 29000, 2560, 2560] + - [263, 3267.0] + - - [29000, 49, 1, 2560, 29000, 29000, 2560, 2560] + - [248, 3337.0] + - - [29000, 50, 1, 2560, 29000, 29000, 2560, 2560] + - [232, 3402.0] + - - [29000, 51, 1, 2560, 29000, 29000, 2560, 2560] + - [248, 3472.0] + - - [29000, 53, 1, 2560, 29000, 29000, 2560, 2560] + - [248, 3607.0] + - - [29000, 54, 1, 2560, 29000, 29000, 2560, 2560] + - [232, 3673.0] + - - [29000, 55, 1, 2560, 29000, 29000, 2560, 2560] + - [263, 3742.0] + - - [29000, 56, 1, 2560, 29000, 29000, 2560, 2560] + - [232, 3806.0] + - - [29000, 57, 1, 2560, 29000, 29000, 2560, 2560] + - [232, 3878.0] + - - [29000, 58, 1, 2560, 29000, 29000, 2560, 2560] + - [232, 3941.0] + - - [29000, 59, 1, 2560, 29000, 29000, 2560, 2560] + - [263, 4011.0] + - - [29000, 61, 1, 2560, 29000, 29000, 2560, 2560] + - [263, 4147.0] + - - [29000, 63, 1, 2560, 29000, 29000, 2560, 2560] + - [263, 4275.0] + - - [288, 64, 1, 21609, 288, 288, 21609, 21609] + - [276, 2966.0] + - - [32, 32, 36, 43808, 32, 32, 43808, 43808] + - [270, 2466.0] + - - [32, 32, 64, 40000, 32, 32, 40000, 40000] + - [283, 2218.0] + - - [32, 32, 49, 115200, 32, 32, 115200, 115200] + - [278, 2027.0] + - - [32, 32, 36, 175232, 32, 32, 175232, 175232] + - [283, 1994.0] + - - [32, 32, 49, 57600, 32, 32, 57600, 57600] + - [274, 2083.0] + - - [32, 32, 36, 87616, 32, 32, 87616, 87616] + - [280, 2306.0] + - - [32, 32, 64, 80000, 32, 32, 80000, 80000] + - [274, 2031.0] + - - [256, 128, 1, 13600, 256, 256, 13600, 13600] + - [281, 3274.0] + - - [256, 128, 1, 12880, 256, 256, 12880, 12880] + - [290, 3239.0] + - - [128, 512, 1, 15200, 128, 128, 15200, 15200] + - [267, 3645.0] + - - [512, 128, 1, 15200, 512, 512, 15200, 15200] + - [277, 3629.0] + - - [128, 512, 1, 11408, 128, 128, 11408, 11408] + - [267, 3570.0] + - - [256, 128, 1, 13824, 256, 256, 13824, 13824] + - [285, 3245.0] + - - [128, 512, 1, 11616, 128, 128, 11616, 11616] + - [290, 3450.0] + - - [256, 128, 1, 14208, 256, 256, 14208, 14208] + - [277, 3250.0] + - - [128, 512, 1, 14208, 128, 128, 14208, 14208] + - [272, 3410.0] + - - [256, 128, 1, 15200, 256, 256, 15200, 15200] + - [281, 3287.0] + - - [512, 128, 1, 11408, 512, 512, 11408, 11408] + - [277, 3513.0] + - - [512, 128, 1, 16800, 512, 512, 16800, 16800] + - [281, 3651.0] + - - [128, 512, 1, 11264, 128, 128, 11264, 11264] + - [281, 3577.0] + - - [512, 128, 1, 11616, 512, 512, 11616, 11616] + - [285, 3621.0] + - - [512, 128, 1, 16128, 512, 512, 16128, 16128] + - [285, 3602.0] + - - [512, 128, 1, 11968, 512, 512, 11968, 11968] + - [285, 3596.0] + - - [128, 512, 1, 11968, 128, 128, 11968, 11968] + - [281, 3438.0] + - - [512, 128, 1, 12288, 512, 512, 12288, 12288] + - [272, 3337.0] + - - [128, 512, 1, 12288, 128, 128, 12288, 12288] + - [277, 3440.0] + - - [128, 512, 1, 12672, 128, 128, 12672, 12672] + - [281, 3616.0] + - - [512, 128, 1, 11776, 512, 512, 11776, 11776] + - [290, 3363.0] + - - [512, 128, 1, 12144, 512, 512, 12144, 12144] + - [285, 3395.0] + - - [512, 128, 1, 11264, 512, 512, 11264, 11264] + - [267, 3343.0] + - - [128, 512, 1, 12144, 128, 128, 12144, 12144] + - [272, 3599.0] + - - [512, 128, 1, 12672, 512, 512, 12672, 12672] + - [285, 3394.0] + - - [128, 512, 1, 12512, 128, 128, 12512, 12512] + - [277, 3449.0] + - - [128, 512, 1, 11776, 128, 128, 11776, 11776] + - [290, 3599.0] + - - [256, 128, 1, 12288, 256, 256, 12288, 12288] + - [290, 3182.0] + - - [40, 40, 1, 1909283, 40, 40, 1909283, 1909283] + - [269, 491.0] + - - [40, 40, 1, 3818566, 40, 40, 3818566, 3818566] + - [287, 492.0] + - - [30522, 20, 1, 1024, 30522, 30522, 1024, 1024] + - [303, 1464.0] + - - [1760, 32, 1, 1760, 1760, 1760, 1760, 1760] + - [297, 2206.0] + - - [3584, 4, 1, 1280, 3584, 3584, 1280, 1280] + - [293, 622.0] + - - [2944, 4, 1, 256, 2944, 2944, 256, 256] + - [301, 447.0] + - - [5056, 4, 1, 3328, 5056, 5056, 3328, 3328] + - [299, 719.0] + - - [1760, 16, 1, 1760, 1760, 1760, 1760, 1760] + - [239, 1593.0] + - - [2368, 4, 1, 1280, 2368, 2368, 1280, 1280] + - [293, 541.0] + - - [6784, 4, 1, 1280, 6784, 6784, 1280, 1280] + - [294, 633.0] + - - [1856, 4, 1, 1280, 1856, 1856, 1280, 1280] + - [298, 473.0] + - - [2944, 4, 1, 128, 2944, 2944, 128, 128] + - [301, 407.0] + - - [3584, 4, 1, 128, 3584, 3584, 128, 128] + - [293, 473.0] + - - [8448, 16, 1, 2816, 8448, 8448, 2816, 2816] + - [303, 2145.0] + - - [2368, 4, 1, 256, 2368, 2368, 256, 256] + - [293, 417.0] + - - [5888, 4, 1, 128, 5888, 5888, 128, 128] + - [298, 571.0] + - - [4288, 4, 1, 256, 4288, 4288, 256, 256] + - [293, 526.0] + - - [3584, 4, 1, 3328, 3584, 3584, 3328, 3328] + - [298, 679.0] + - - [2048, 16, 1, 2048, 2048, 2048, 2048, 2048] + - [261, 1665.0] + - - [1408, 4, 1, 256, 1408, 1408, 256, 256] + - [293, 294.0] + - - [4288, 4, 1, 3328, 4288, 4288, 3328, 3328] + - [299, 629.0] + - - [2368, 4, 1, 3328, 2368, 2368, 3328, 3328] + - [298, 600.0] + - - [5056, 4, 1, 1280, 5056, 5056, 1280, 1280] + - [302, 670.0] + - - [3072, 16, 1, 1024, 3072, 3072, 1024, 1024] + - [229, 1826.0] + - - [1408, 4, 1, 3328, 1408, 1408, 3328, 3328] + - [298, 402.0] + - - [6144, 16, 1, 2560, 6144, 6144, 2560, 2560] + - [295, 2292.0] + - - [4096, 16, 1, 4096, 4096, 4096, 4096, 4096] + - [303, 1893.0] + - - [1856, 4, 1, 256, 1856, 1856, 256, 256] + - [301, 373.0] + - - [6784, 4, 1, 128, 6784, 6784, 128, 128] + - [293, 573.0] + - - [4288, 4, 1, 128, 4288, 4288, 128, 128] + - [293, 511.0] + - - [5888, 4, 1, 3328, 5888, 5888, 3328, 3328] + - [293, 603.0] + - - [5056, 4, 1, 128, 5056, 5056, 128, 128] + - [301, 544.0] + - - [5888, 4, 1, 1280, 5888, 5888, 1280, 1280] + - [301, 633.0] + - - [2944, 4, 1, 3328, 2944, 2944, 3328, 3328] + - [298, 597.0] + - - [2368, 4, 1, 128, 2368, 2368, 128, 128] + - [293, 357.0] + - - [1856, 4, 1, 128, 1856, 1856, 128, 128] + - [293, 299.0] + - - [2560, 16, 1, 2560, 2560, 2560, 2560, 2560] + - [295, 2095.0] + - - [7680, 16, 1, 2560, 7680, 7680, 2560, 2560] + - [295, 2307.0] + - - [1408, 4, 1, 1280, 1408, 1408, 1280, 1280] + - [298, 369.0] + - - [6784, 4, 1, 256, 6784, 6784, 256, 256] + - [294, 548.0] + - - [1856, 4, 1, 3328, 1856, 1856, 3328, 3328] + - [293, 513.0] + - - [3584, 4, 1, 256, 3584, 3584, 256, 256] + - [301, 501.0] + - - [6784, 4, 1, 3328, 6784, 6784, 3328, 3328] + - [301, 599.0] + - - [2048, 32, 1, 2048, 2048, 2048, 2048, 2048] + - [303, 1939.0] + - - [1408, 4, 1, 128, 1408, 1408, 128, 128] + - [293, 217.0] + - - [5056, 4, 1, 256, 5056, 5056, 256, 256] + - [301, 544.0] + - - [4288, 4, 1, 1280, 4288, 4288, 1280, 1280] + - [301, 622.0] + - - [4608, 16, 1, 1536, 4608, 4608, 1536, 1536] + - [303, 2118.0] + - - [2944, 4, 1, 1280, 2944, 2944, 1280, 1280] + - [301, 571.0] + - - [5888, 4, 1, 256, 5888, 5888, 256, 256] + - [301, 524.0] + - - [2048, 32, 1, 1001, 2048, 2048, 1001, 1001] + - [300, 2095.0] + - - [1536, 32, 1, 1001, 1536, 1536, 1001, 1001] + - [297, 1925.0] + - - [1600, 1, 1, 1024, 1600, 1600, 1024, 1024] + - [293, 103.0] + - - [32768, 1, 1, 256, 32768, 32768, 256, 256] + - [296, 166.0] + - - [2048, 2, 1, 2048, 2048, 2048, 2048, 2048] + - [293, 265.0] + - - [2560, 4, 1, 2560, 2560, 2560, 2560, 2560] + - [298, 631.0] + - - [3456, 1, 1, 256, 3456, 3456, 256, 256] + - [301, 124.0] + - - [4096, 1, 1, 256, 4096, 4096, 256, 256] + - [301, 127.0] + - - [6912, 1, 1, 256, 6912, 6912, 256, 256] + - [294, 137.0] + - - [2048, 8, 1, 2048, 2048, 2048, 2048, 2048] + - [298, 1016.0] + - - [2560, 2, 1, 2560, 2560, 2560, 2560, 2560] + - [293, 318.0] + - - [29000, 27, 1, 2560, 29000, 29000, 2560, 2560] + - [246, 1722.0] + - - [4, 1856, 1, 3328, 4, 4, 3328, 3328] + - [244, 473.0] + - - [4, 1408, 1, 128, 4, 4, 128, 128] + - [304, 218.0] + - - [4, 2368, 1, 1280, 4, 4, 1280, 1280] + - [305, 540.0] + - - [4, 3584, 1, 128, 4, 4, 128, 128] + - [305, 437.0] + - - [4, 5888, 1, 3328, 4, 4, 3328, 3328] + - [305, 548.0] + - - [4, 1408, 1, 3328, 4, 4, 3328, 3328] + - [227, 378.0] + - - [4, 6784, 1, 3328, 4, 4, 3328, 3328] + - [305, 545.0] + - - [4, 4288, 1, 128, 4, 4, 128, 128] + - [304, 425.0] + - - [4, 6784, 1, 1280, 4, 4, 1280, 1280] + - [305, 568.0] + - - [4, 2944, 1, 3328, 4, 4, 3328, 3328] + - [305, 525.0] + - - [4, 5056, 1, 256, 4, 4, 256, 256] + - [305, 524.0] + - - [4, 5056, 1, 1280, 4, 4, 1280, 1280] + - [305, 621.0] + - - [4, 2368, 1, 3328, 4, 4, 3328, 3328] + - [304, 533.0] + - - [4, 1856, 1, 256, 4, 4, 256, 256] + - [227, 338.0] + - - [4, 2368, 1, 256, 4, 4, 256, 256] + - [304, 392.0] + - - [4, 2944, 1, 256, 4, 4, 256, 256] + - [305, 413.0] + - - [4, 4288, 1, 1280, 4, 4, 1280, 1280] + - [304, 581.0] + - - [4, 6784, 1, 128, 4, 4, 128, 128] + - [305, 500.0] + - - [4, 3584, 1, 1280, 4, 4, 1280, 1280] + - [305, 586.0] + - - [4, 5888, 1, 256, 4, 4, 256, 256] + - [307, 505.0] + - - [4, 6784, 1, 256, 4, 4, 256, 256] + - [307, 495.0] + - - [4, 1408, 1, 1280, 4, 4, 1280, 1280] + - [227, 361.0] + - - [4, 3584, 1, 256, 4, 4, 256, 256] + - [304, 469.0] + - - [4, 2944, 1, 1280, 4, 4, 1280, 1280] + - [304, 504.0] + - - [4, 1408, 1, 256, 4, 4, 256, 256] + - [227, 268.0] + - - [4, 4288, 1, 3328, 4, 4, 3328, 3328] + - [307, 585.0] + - - [4, 5888, 1, 1280, 4, 4, 1280, 1280] + - [305, 610.0] + - - [4, 1856, 1, 1280, 4, 4, 1280, 1280] + - [244, 457.0] + - - [4, 1856, 1, 128, 4, 4, 128, 128] + - [307, 276.0] + - - [4, 2944, 1, 128, 4, 4, 128, 128] + - [304, 386.0] + - - [4, 5056, 1, 3328, 4, 4, 3328, 3328] + - [307, 647.0] + - - [4, 5056, 1, 128, 4, 4, 128, 128] + - [305, 471.0] + - - [4, 4288, 1, 256, 4, 4, 256, 256] + - [307, 501.0] + - - [4, 3584, 1, 3328, 4, 4, 3328, 3328] + - [307, 605.0] + - - [4, 5888, 1, 128, 4, 4, 128, 128] + - [307, 501.0] + - - [4, 2368, 1, 128, 4, 4, 128, 128] + - [304, 355.0] + - - [32, 1600, 1, 512, 32, 32, 512, 512] + - [306, 1962.0] + - - [2, 2048, 1, 1024, 2, 2, 1024, 1024] + - [307, 238.0] + - - [1, 4096, 1, 256, 1, 1, 256, 256] + - [307, 125.0] + - - [1, 6912, 1, 256, 1, 1, 256, 256] + - [307, 128.0] + - - [2, 2048, 1, 768, 2, 2, 768, 768] + - [305, 226.0] + - - [2, 4608, 1, 768, 2, 2, 768, 768] + - [307, 291.0] + - - [2, 4608, 1, 1024, 2, 2, 1024, 1024] + - [305, 292.0] + - - [1024, 16, 1, 500000, 1024, 1024, 500000, 500000] + - [289, 1628.0] + - - [1024, 8, 1, 500000, 1024, 1024, 500000, 500000] + - [289, 839.0] + - - [512, 16, 1, 500000, 512, 512, 500000, 500000] + - [282, 1410.0] + - - [512, 8, 1, 500000, 512, 512, 500000, 500000] + - [273, 707.0] + - - [64, 80, 1, 5329, 64, 64, 5329, 5329] + - [268, 1168.0] + - - [576, 96, 1, 5329, 576, 576, 5329, 5329] + - [284, 3114.0] + - - [288, 32, 1, 21609, 288, 288, 21609, 21609] + - [288, 1824.0] + - - [576, 96, 1, 5041, 576, 576, 5041, 5041] + - [276, 3066.0] + - - [27, 32, 1, 22201, 27, 27, 22201, 22201] + - [292, 327.0] + - - [160, 64, 1, 5329, 160, 160, 5329, 5329] + - [275, 1799.0] + - - [448, 64, 1, 5329, 448, 448, 5329, 5329] + - [284, 3097.0] + - - [147, 64, 1, 12544, 147, 147, 12544, 12544] + - [271, 1712.0] + - - [147, 64, 1, 22500, 147, 147, 22500, 22500] + - [275, 1908.0] + - - [576, 64, 1, 5625, 576, 576, 5625, 5625] + - [276, 3015.0] + - - [256, 128, 1, 10752, 256, 256, 10752, 10752] + - [284, 2746.0] + - - [256, 128, 1, 10560, 256, 256, 10560, 10560] + - [266, 2970.0] + - - [256, 128, 1, 11408, 256, 256, 11408, 11408] + - [276, 3072.0] + - - [256, 12, 1, 11408, 256, 256, 11408, 11408] + - [268, 843.0] + - - [256, 128, 1, 11616, 256, 256, 11616, 11616] + - [276, 3038.0] + - - [256, 12, 1, 11616, 256, 256, 11616, 11616] + - [286, 833.0] + - - [256, 12, 1, 12288, 256, 256, 12288, 12288] + - [291, 843.0] + - - [11, 11, 1, 1909283, 11, 11, 1909283, 1909283] + - [279, 51.0] + - - [11, 11, 1, 3818566, 11, 11, 3818566, 3818566] + - [269, 51.0] + - - [768, 32, 1, 768, 768, 768, 768, 768] + - [227, 1441.0] + - - [768, 64, 1, 768, 768, 768, 768, 768] + - [218, 2027.0] + - - [1024, 80, 1, 1024, 1024, 1024, 1024, 1024] + - [260, 2386.0] + - - [1024, 20, 1, 1024, 1024, 1024, 1024, 1024] + - [227, 1118.0] + - - [768, 16, 1, 768, 768, 768, 768, 768] + - [233, 887.0] + - - [1024, 4, 1, 1024, 1024, 1024, 1024, 1024] + - [227, 292.0] + - - [1024, 6, 1, 1024, 1024, 1024, 1024, 1024] + - [244, 436.0] + - - [4, 704, 1, 1280, 4, 4, 1280, 1280] + - [233, 226.0] + - - [128, 64, 1, 256, 128, 128, 256, 256] + - [246, 530.0] + - - [128, 448, 1, 1280, 128, 128, 1280, 1280] + - [260, 2631.0] + - - [64, 4, 1, 256, 64, 64, 256, 256] + - [217, 15.0] + - - [64, 704, 1, 128, 64, 64, 128, 128] + - [217, 1428.0] + - - [448, 64, 1, 1280, 448, 448, 1280, 1280] + - [259, 1830.0] + - - [128, 4, 1, 1280, 128, 128, 1280, 1280] + - [223, 46.0] + - - [64, 1024, 1, 1280, 64, 64, 1280, 1280] + - [232, 2813.0] + - - [64, 704, 1, 1280, 64, 64, 1280, 1280] + - [260, 2099.0] + - - [1024, 64, 1, 128, 1024, 1024, 128, 128] + - [262, 1824.0] + - - [1024, 64, 1, 1280, 1024, 1024, 1280, 1280] + - [263, 2776.0] + - - [4, 704, 1, 256, 4, 4, 256, 256] + - [259, 160.0] + - - [704, 4, 1, 1280, 704, 704, 1280, 1280] + - [233, 224.0] + - - [448, 128, 1, 128, 448, 448, 128, 128] + - [260, 1638.0] + - - [256, 256, 1, 3328, 256, 256, 3328, 3328] + - [241, 2924.0] + - - [4, 64, 1, 1280, 4, 4, 1280, 1280] + - [217, 22.0] + - - [64, 64, 1, 3328, 64, 64, 3328, 3328] + - [236, 404.0] + - - [128, 256, 1, 3328, 128, 128, 3328, 3328] + - [259, 1874.0] + - - [64, 448, 1, 1280, 64, 64, 1280, 1280] + - [259, 1900.0] + - - [448, 4, 1, 256, 448, 448, 256, 256] + - [244, 106.0] + - - [128, 4, 1, 128, 128, 128, 128, 128] + - [237, 22.0] + - - [256, 4, 1, 128, 256, 256, 128, 128] + - [217, 43.0] + - - [704, 64, 1, 3328, 704, 704, 3328, 3328] + - [218, 2132.0] + - - [256, 64, 1, 1280, 256, 256, 1280, 1280] + - [244, 1306.0] + - - [704, 64, 1, 128, 704, 704, 128, 128] + - [244, 1435.0] + - - [1024, 4, 1, 256, 1024, 1024, 256, 256] + - [259, 224.0] + - - [256, 256, 1, 128, 256, 256, 128, 128] + - [262, 1864.0] + - - [64, 256, 1, 128, 64, 64, 128, 128] + - [227, 733.0] + - - [704, 64, 1, 1280, 704, 704, 1280, 1280] + - [218, 1997.0] + - - [128, 448, 1, 256, 128, 128, 256, 256] + - [232, 2033.0] + - - [128, 256, 1, 1280, 128, 128, 1280, 1280] + - [262, 1805.0] + - - [448, 64, 1, 3328, 448, 448, 3328, 3328] + - [251, 1906.0] + - - [256, 128, 1, 128, 256, 256, 128, 128] + - [259, 1212.0] + - - [64, 128, 1, 3328, 64, 64, 3328, 3328] + - [251, 750.0] + - - [128, 128, 1, 3328, 128, 128, 3328, 3328] + - [217, 1394.0] + - - [256, 128, 1, 256, 256, 256, 256, 256] + - [244, 1456.0] + - - [64, 448, 1, 3328, 64, 64, 3328, 3328] + - [244, 1984.0] + - - [1024, 4, 1, 3328, 1024, 1024, 3328, 3328] + - [256, 309.0] + - - [4, 4, 1, 256, 4, 4, 256, 256] + - [217, 1.0] + - - [256, 64, 1, 256, 256, 256, 256, 256] + - [227, 962.0] + - - [256, 128, 1, 1280, 256, 256, 1280, 1280] + - [259, 1799.0] + - - [128, 64, 1, 1280, 128, 128, 1280, 1280] + - [227, 713.0] + - - [4, 448, 1, 3328, 4, 4, 3328, 3328] + - [251, 165.0] + - - [64, 1024, 1, 256, 64, 64, 256, 256] + - [232, 2292.0] + - - [256, 4, 1, 1280, 256, 256, 1280, 1280] + - [233, 93.0] + - - [64, 704, 1, 256, 64, 64, 256, 256] + - [244, 1677.0] + - - [4, 704, 1, 128, 4, 4, 128, 128] + - [227, 122.0] + - - [448, 128, 1, 256, 448, 448, 256, 256] + - [245, 2050.0] + - - [448, 64, 1, 128, 448, 448, 128, 128] + - [259, 1147.0] + - - [4, 1024, 1, 1280, 4, 4, 1280, 1280] + - [259, 301.0] + - - [4, 448, 1, 1280, 4, 4, 1280, 1280] + - [227, 154.0] + - - [448, 4, 1, 1280, 448, 448, 1280, 1280] + - [227, 152.0] + - - [256, 256, 1, 256, 256, 256, 256, 256] + - [232, 2298.0] + - - [256, 64, 1, 128, 256, 256, 128, 128] + - [219, 694.0] + - - [4, 1024, 1, 3328, 4, 4, 3328, 3328] + - [244, 321.0] + - - [64, 128, 1, 128, 64, 64, 128, 128] + - [246, 394.0] + - - [704, 4, 1, 128, 704, 704, 128, 128] + - [227, 119.0] + - - [256, 4, 1, 256, 256, 256, 256, 256] + - [227, 60.0] + - - [256, 4, 1, 3328, 256, 256, 3328, 3328] + - [249, 102.0] + - - [4, 256, 1, 256, 4, 4, 256, 256] + - [227, 60.0] + - - [4, 4, 1, 128, 4, 4, 128, 128] + - [217, 1.0] + - - [4, 128, 1, 256, 4, 4, 256, 256] + - [227, 30.0] + - - [64, 64, 1, 1280, 64, 64, 1280, 1280] + - [249, 364.0] + - - [448, 128, 1, 3328, 448, 448, 3328, 3328] + - [245, 2623.0] + - - [64, 448, 1, 256, 64, 64, 256, 256] + - [227, 1417.0] + - - [4, 448, 1, 128, 4, 4, 128, 128] + - [219, 75.0] + - - [64, 256, 1, 1280, 64, 64, 1280, 1280] + - [259, 1339.0] + - - [64, 128, 1, 1280, 64, 64, 1280, 1280] + - [227, 710.0] + - - [64, 4, 1, 128, 64, 64, 128, 128] + - [217, 11.0] + - - [64, 64, 1, 256, 64, 64, 256, 256] + - [227, 266.0] + - - [4, 704, 1, 3328, 4, 4, 3328, 3328] + - [256, 234.0] + - - [4, 4, 1, 1280, 4, 4, 1280, 1280] + - [217, 1.0] + - - [128, 128, 1, 128, 128, 128, 128, 128] + - [229, 723.0] + - - [1024, 4, 1, 128, 1024, 1024, 128, 128] + - [261, 170.0] + - - [4, 64, 1, 128, 4, 4, 128, 128] + - [217, 11.0] + - - [64, 1024, 1, 128, 64, 64, 128, 128] + - [262, 1889.0] + - - [128, 128, 1, 1280, 128, 128, 1280, 1280] + - [244, 1307.0] + - - [128, 256, 1, 256, 128, 128, 256, 256] + - [259, 1477.0] + - - [64, 128, 1, 256, 64, 64, 256, 256] + - [227, 530.0] + - - [1024, 4, 1, 1280, 1024, 1024, 1280, 1280] + - [256, 291.0] + - - [704, 64, 1, 256, 704, 704, 256, 256] + - [259, 1667.0] + - - [128, 64, 1, 3328, 128, 128, 3328, 3328] + - [227, 754.0] + - - [448, 64, 1, 256, 448, 448, 256, 256] + - [259, 1445.0] + - - [4, 256, 1, 128, 4, 4, 128, 128] + - [219, 44.0] + - - [1024, 64, 1, 256, 1024, 1024, 256, 256] + - [230, 2237.0] + - - [4, 4, 1, 3328, 4, 4, 3328, 3328] + - [223, 2.0] + - - [704, 4, 1, 256, 704, 704, 256, 256] + - [233, 159.0] + - - [128, 4, 1, 3328, 128, 128, 3328, 3328] + - [223, 51.0] + - - [64, 1024, 1, 3328, 64, 64, 3328, 3328] + - [232, 2928.0] + - - [448, 4, 1, 3328, 448, 448, 3328, 3328] + - [227, 162.0] + - - [4, 128, 1, 3328, 4, 4, 3328, 3328] + - [243, 50.0] + - - [704, 4, 1, 3328, 704, 704, 3328, 3328] + - [242, 240.0] + - - [448, 128, 1, 1280, 448, 448, 1280, 1280] + - [245, 2561.0] + - - [1024, 64, 1, 3328, 1024, 1024, 3328, 3328] + - [248, 2908.0] + - - [4, 1024, 1, 128, 4, 4, 128, 128] + - [227, 175.0] + - - [64, 256, 1, 3328, 64, 64, 3328, 3328] + - [251, 1384.0] + - - [128, 256, 1, 128, 128, 128, 128, 128] + - [259, 1212.0] + - - [128, 4, 1, 256, 128, 128, 256, 256] + - [227, 30.0] + - - [256, 256, 1, 1280, 256, 256, 1280, 1280] + - [232, 2815.0] + - - [256, 128, 1, 3328, 256, 256, 3328, 3328] + - [259, 1888.0] + - - [448, 4, 1, 128, 448, 448, 128, 128] + - [244, 75.0] + - - [4, 256, 1, 3328, 4, 4, 3328, 3328] + - [226, 100.0] + - - [4, 128, 1, 128, 4, 4, 128, 128] + - [219, 22.0] + - - [4, 256, 1, 1280, 4, 4, 1280, 1280] + - [236, 89.0] + - - [64, 4, 1, 3328, 64, 64, 3328, 3328] + - [223, 25.0] + - - [4, 64, 1, 3328, 4, 4, 3328, 3328] + - [226, 25.0] + - - [4, 1024, 1, 256, 4, 4, 256, 256] + - [227, 228.0] + - - [64, 256, 1, 256, 64, 64, 256, 256] + - [244, 940.0] + - - [4, 64, 1, 256, 4, 4, 256, 256] + - [227, 15.0] + - - [128, 448, 1, 128, 128, 128, 128, 128] + - [262, 1646.0] + - - [64, 448, 1, 128, 64, 64, 128, 128] + - [227, 1154.0] + - - [64, 704, 1, 3328, 64, 64, 3328, 3328] + - [260, 2195.0] + - - [128, 448, 1, 3328, 128, 128, 3328, 3328] + - [260, 2736.0] + - - [4, 448, 1, 256, 4, 4, 256, 256] + - [244, 102.0] + - - [4, 128, 1, 1280, 4, 4, 1280, 1280] + - [236, 45.0] + - - [128, 64, 1, 128, 128, 128, 128, 128] + - [246, 372.0] + - - [64, 64, 1, 128, 64, 64, 128, 128] + - [246, 193.0] + - - [64, 4, 1, 1280, 64, 64, 1280, 1280] + - [223, 23.0] + - - [256, 64, 1, 3328, 256, 256, 3328, 3328] + - [219, 1350.0] + - - [128, 128, 1, 256, 128, 128, 256, 256] + - [244, 962.0] + - - [256, 64, 1, 3136, 256, 256, 3136, 3136] + - [217, 1450.0] + - - [64, 200, 1, 1024, 64, 64, 1024, 1024] + - [259, 1040.0] + - - [32, 512, 1, 1024, 32, 32, 1024, 1024] + - [259, 1273.0] + - - [1, 512, 1, 1024, 1, 1, 1024, 1024] + - [227, 43.0] + - - [128, 512, 1, 2048, 128, 128, 2048, 2048] + - [232, 2795.0] + - - [64, 256, 1, 1024, 64, 64, 1024, 1024] + - [259, 1297.0] + - - [1, 200, 1, 1024, 1, 1, 1024, 1024] + - [217, 17.0] + - - [128, 512, 1, 1024, 128, 128, 1024, 1024] + - [232, 2789.0] + - - [32, 256, 1, 2048, 32, 32, 2048, 2048] + - [227, 733.0] + - - [32, 256, 1, 512, 32, 32, 512, 512] + - [227, 630.0] + - - [256, 200, 1, 1024, 256, 256, 1024, 1024] + - [245, 2272.0] + - - [1, 256, 1, 2048, 1, 1, 2048, 2048] + - [226, 24.0] + - - [32, 200, 1, 2048, 32, 32, 2048, 2048] + - [227, 575.0] + - - [128, 200, 1, 1024, 128, 128, 1024, 1024] + - [259, 1651.0] + - - [128, 256, 1, 2048, 128, 128, 2048, 2048] + - [262, 1835.0] + - - [64, 1024, 1, 1024, 64, 64, 1024, 1024] + - [248, 2787.0] + - - [1, 512, 1, 2048, 1, 1, 2048, 2048] + - [227, 46.0] + - - [128, 256, 1, 512, 128, 128, 512, 512] + - [244, 1648.0] + - - [128, 200, 1, 2048, 128, 128, 2048, 2048] + - [217, 1743.0] + - - [64, 200, 1, 512, 64, 64, 512, 512] + - [259, 918.0] + - - [1, 256, 1, 1024, 1, 1, 1024, 1024] + - [217, 21.0] + - - [1, 1024, 1, 1024, 1, 1, 1024, 1024] + - [227, 75.0] + - - [256, 256, 1, 2048, 256, 256, 2048, 2048] + - [232, 2844.0] + - - [128, 256, 1, 1024, 128, 128, 1024, 1024] + - [259, 1777.0] + - - [1, 256, 1, 4096, 1, 1, 4096, 4096] + - [226, 25.0] + - - [32, 512, 1, 512, 32, 32, 512, 512] + - [259, 1146.0] + - - [64, 200, 1, 2048, 64, 64, 2048, 2048] + - [259, 1119.0] + - - [1, 200, 1, 2048, 1, 1, 2048, 2048] + - [226, 19.0] + - - [1, 512, 1, 4096, 1, 1, 4096, 4096] + - [227, 47.0] + - - [256, 256, 1, 1024, 256, 256, 1024, 1024] + - [232, 2789.0] + - - [64, 256, 1, 2048, 64, 64, 2048, 2048] + - [259, 1400.0] + - - [1, 200, 1, 4096, 1, 1, 4096, 4096] + - [226, 20.0] + - - [32, 256, 1, 1024, 32, 32, 1024, 1024] + - [227, 693.0] + - - [32, 200, 1, 1024, 32, 32, 1024, 1024] + - [227, 543.0] + - - [32, 512, 1, 2048, 32, 32, 2048, 2048] + - [259, 1322.0] + - - [128, 200, 1, 512, 128, 128, 512, 512] + - [217, 1496.0] + - - [64, 1024, 1, 2048, 64, 64, 2048, 2048] + - [247, 2861.0] + - - [1, 1024, 1, 2048, 1, 1, 2048, 2048] + - [227, 79.0] + - - [32, 1024, 1, 512, 32, 32, 512, 512] + - [259, 1678.0] + - - [64, 1024, 1, 512, 64, 64, 512, 512] + - [232, 2626.0] + - - [1, 1024, 1, 4096, 1, 1, 4096, 4096] + - [227, 82.0] + - - [64, 256, 1, 512, 64, 64, 512, 512] + - [244, 1155.0] + - - [256, 200, 1, 512, 256, 256, 512, 512] + - [245, 2117.0] + - - [32, 1024, 1, 1024, 32, 32, 1024, 1024] + - [259, 1785.0] + - - [32, 200, 1, 512, 32, 32, 512, 512] + - [227, 486.0] + - - [256, 256, 1, 512, 256, 256, 512, 512] + - [232, 2585.0] + - - [128, 512, 1, 512, 128, 128, 512, 512] + - [232, 2581.0] + - - [256, 200, 1, 2048, 256, 256, 2048, 2048] + - [245, 2353.0] + - - [64, 512, 1, 2048, 64, 64, 2048, 2048] + - [244, 1833.0] + - - [32, 1024, 1, 2048, 32, 32, 2048, 2048] + - [259, 1865.0] + - - [256, 64, 1, 1225, 256, 256, 1225, 1225] + - [256, 1190.0] + - - [384, 64, 1, 1225, 384, 384, 1225, 1225] + - [244, 1590.0] + - - [288, 64, 1, 1225, 288, 288, 1225, 1225] + - [256, 1301.0] + - - [384, 96, 1, 1225, 384, 384, 1225, 1225] + - [217, 1883.0] + - - [11, 11, 5456, 64, 11, 11, 64, 64] + - [244, 1019.0] + - - [14, 14, 4368, 64, 14, 14, 64, 64] + - [244, 1599.0] + - - [23, 23, 2720, 64, 23, 23, 64, 64] + - [252, 1770.0] + - - [13, 13, 4672, 64, 13, 13, 64, 64] + - [217, 1404.0] + - - [29, 29, 2176, 64, 29, 29, 64, 64] + - [260, 1974.0] + - - [12, 12, 5040, 64, 12, 12, 64, 64] + - [227, 1200.0] + - - [27, 27, 2336, 64, 27, 27, 64, 64] + - [245, 1924.0] + - - [10, 10, 5952, 64, 10, 10, 64, 64] + - [227, 843.0] + - - [7, 7, 8192, 64, 7, 7, 64, 64] + - [217, 424.0] + - - [16, 16, 3840, 64, 16, 16, 64, 64] + - [259, 2015.0] + - - [17, 17, 3632, 64, 17, 17, 64, 64] + - [218, 1008.0] + - - [9, 9, 6544, 64, 9, 9, 64, 64] + - [251, 688.0] + - - [8, 8, 7280, 64, 8, 8, 64, 64] + - [259, 550.0] + - - [21, 21, 2976, 64, 21, 21, 64, 64] + - [245, 1499.0] + - - [19, 19, 3264, 64, 19, 19, 64, 64] + - [245, 1221.0] + - - [25, 25, 2512, 64, 25, 25, 64, 64] + - [260, 1998.0] + - - [18, 18, 3440, 64, 18, 18, 64, 64] + - [218, 1096.0] + - - [15, 15, 4096, 64, 15, 15, 64, 64] + - [259, 1822.0] + - - [2, 16, 1, 768, 2, 2, 768, 768] + - [217, 3.0] + - - [2, 8, 1, 768, 2, 2, 768, 768] + - [217, 1.0] + - - [2, 64, 1, 768, 2, 2, 768, 768] + - [217, 10.0] + - - [256, 128, 1, 784, 256, 256, 784, 784] + - [217, 1809.0] + - - [192, 48, 1, 1225, 192, 192, 1225, 1225] + - [233, 760.0] + - - [64, 256, 1, 3136, 64, 64, 3136, 3136] + - [251, 1451.0] + - - [512, 144, 1, 196, 512, 512, 196, 196] + - [253, 2223.0] + - - [400, 32, 1, 784, 400, 400, 784, 784] + - [223, 1000.0] + - - [832, 48, 1, 49, 832, 832, 49, 49] + - [217, 874.0] + - - [192, 32, 1, 784, 192, 192, 784, 784] + - [233, 491.0] + - - [288, 48, 1, 1225, 288, 288, 1225, 1225] + - [223, 1120.0] + - - [512, 112, 1, 196, 512, 512, 196, 196] + - [262, 1825.0] + - - [528, 32, 1, 196, 528, 528, 196, 196] + - [223, 796.0] + - - [576, 64, 1, 3136, 576, 576, 3136, 3136] + - [240, 2185.0] + - - [480, 64, 1, 196, 480, 480, 196, 196] + - [227, 1368.0] + - - [192, 64, 1, 784, 192, 192, 784, 784] + - [233, 956.0] + - - [192, 32, 1, 1225, 192, 192, 1225, 1225] + - [233, 511.0] + - - [400, 48, 1, 196, 400, 400, 196, 196] + - [259, 909.0] + - - [480, 16, 1, 196, 480, 480, 196, 196] + - [233, 394.0] + - - [512, 64, 1, 196, 512, 512, 196, 196] + - [227, 1361.0] + - - [800, 64, 1, 196, 800, 800, 196, 196] + - [251, 1742.0] + - - [512, 128, 1, 784, 512, 512, 784, 784] + - [232, 2792.0] + - - [256, 64, 1, 784, 256, 256, 784, 784] + - [217, 1157.0] + - - [256, 48, 1, 1225, 256, 256, 1225, 1225] + - [233, 997.0] + - - [192, 16, 1, 784, 192, 192, 784, 784] + - [249, 252.0] + - - [576, 96, 1, 1225, 576, 576, 1225, 1225] + - [218, 2397.0] + - - [512, 128, 1, 196, 512, 512, 196, 196] + - [220, 2072.0] + - - [192, 96, 1, 784, 192, 192, 784, 784] + - [259, 1323.0] + - - [192, 64, 1, 1225, 192, 192, 1225, 1225] + - [223, 992.0] + - - [512, 32, 1, 196, 512, 512, 196, 196] + - [223, 750.0] + - - [528, 128, 1, 196, 528, 528, 196, 196] + - [240, 2083.0] + - - [128, 512, 1, 784, 128, 128, 784, 784] + - [232, 2798.0] + - - [64, 64, 1, 3136, 64, 64, 3136, 3136] + - [226, 399.0] + - - [256, 32, 1, 784, 256, 256, 784, 784] + - [233, 650.0] + - - [480, 96, 1, 196, 480, 480, 196, 196] + - [217, 1568.0] + - - [1024, 32, 1, 1001, 1024, 1024, 1001, 1001] + - [251, 1707.0] + - - [18, 18, 648, 64, 18, 18, 64, 64] + - [218, 944.0] + - - [7, 7, 736, 64, 7, 7, 64, 64] + - [217, 338.0] + - - [8, 8, 264, 64, 8, 8, 64, 64] + - [217, 330.0] + - - [9, 9, 416, 64, 9, 9, 64, 64] + - [217, 488.0] + - - [10, 10, 448, 64, 10, 10, 64, 64] + - [217, 597.0] + - - [11, 11, 568, 64, 11, 11, 64, 64] + - [259, 758.0] + - - [12, 12, 480, 64, 12, 12, 64, 64] + - [259, 878.0] + - - [12, 12, 2520, 64, 12, 12, 64, 64] + - [227, 1164.0] + - - [13, 13, 576, 64, 13, 13, 64, 64] + - [259, 1045.0] + - - [13, 13, 2336, 64, 13, 13, 64, 64] + - [259, 1341.0] + - - [14, 14, 704, 64, 14, 14, 64, 64] + - [251, 1144.0] + - - [14, 14, 2184, 64, 14, 14, 64, 64] + - [259, 1532.0] + - - [15, 15, 688, 64, 15, 15, 64, 64] + - [223, 1280.0] + - - [15, 15, 2048, 64, 15, 15, 64, 64] + - [217, 1707.0] + - - [16, 16, 712, 64, 16, 16, 64, 64] + - [249, 1344.0] + - - [16, 16, 1920, 64, 16, 16, 64, 64] + - [259, 1837.0] + - - [17, 17, 688, 64, 17, 17, 64, 64] + - [228, 851.0] + - - [17, 17, 1816, 64, 17, 17, 64, 64] + - [218, 974.0] + - - [18, 18, 1720, 64, 18, 18, 64, 64] + - [252, 1078.0] + - - [19, 19, 680, 64, 19, 19, 64, 64] + - [252, 1053.0] + - - [19, 19, 1632, 64, 19, 19, 64, 64] + - [245, 1193.0] + - - [21, 21, 1472, 64, 21, 21, 64, 64] + - [238, 1407.0] + - - [21, 21, 1488, 64, 21, 21, 64, 64] + - [252, 1459.0] + - - [23, 23, 64, 64, 23, 23, 64, 64] + - [231, 653.0] + - - [23, 23, 1360, 64, 23, 23, 64, 64] + - [245, 1713.0] + - - [25, 25, 176, 64, 25, 25, 64, 64] + - [252, 1289.0] + - - [25, 25, 1256, 64, 25, 25, 64, 64] + - [245, 1844.0] + - - [26, 26, 56, 64, 26, 26, 64, 64] + - [221, 824.0] + - - [26, 27, 56, 64, 26, 26, 64, 64] + - [217, 833.0] + - - [27, 27, 56, 64, 27, 27, 64, 64] + - [254, 871.0] + - - [27, 27, 1168, 64, 27, 27, 64, 64] + - [228, 1868.0] + - - [29, 29, 136, 64, 29, 29, 64, 64] + - [252, 1506.0] + - - [29, 29, 1088, 64, 29, 29, 64, 64] + - [228, 1938.0] + - - [256, 1, 1, 4, 256, 256, 4, 4] + - [217, 1.0] + - - [2, 1, 1, 1024, 2, 2, 1024, 1024] + - [223, 0.16] + - - [1024, 1, 1, 1024, 1024, 1024, 1024, 1024] + - [227, 73.0] + - - [2, 6, 1, 1024, 2, 2, 1024, 1024] + - [217, 1.0] + - - [2, 8, 1, 1024, 2, 2, 1024, 1024] + - [217, 1.0] + - - [14, 14, 1, 64, 14, 14, 64, 64] + - [217, 5.0] + - - [15, 14, 1, 64, 15, 15, 64, 64] + - [217, 6.0] + - - [15, 15, 1, 64, 15, 15, 64, 64] + - [217, 6.0] + - - [17, 15, 1, 64, 17, 17, 64, 64] + - [261, 8.0] + - - [17, 17, 1, 64, 17, 17, 64, 64] + - [217, 8.0] + - - [30, 30, 1, 64, 30, 30, 64, 64] + - [217, 25.0] + - - [30, 31, 1, 64, 30, 30, 64, 64] + - [237, 26.0] + - - [31, 31, 1, 64, 31, 31, 64, 64] + - [219, 27.0] + - - [1024, 32, 1, 1024, 1024, 1024, 1024, 1024] + - [244, 1746.0] + - - [2, 32, 1, 1024, 2, 2, 1024, 1024] + - [217, 5.0] + - - [2, 4, 1, 1024, 2, 2, 1024, 1024] + - [217, 1.0] + - - [64, 512, 1, 512, 64, 64, 512, 512] + - [259, 1671.0] + - - [64, 960, 1, 1024, 64, 64, 1024, 1024] + - [262, 2608.0] + - - [200, 1, 1, 1024, 200, 200, 1024, 1024] + - [249, 18.0] + - - [512, 1, 1, 2048, 512, 512, 2048, 2048] + - [227, 45.0] + - - [64, 512, 1, 1024, 64, 64, 1024, 1024] + - [230, 1791.0] + - - [3, 3, 512, 64, 3, 3, 64, 64] + - [217, 60.0] + - - [5, 5, 512, 64, 5, 5, 64, 64] + - [217, 166.0] + - - [9, 9, 512, 64, 9, 9, 64, 64] + - [217, 518.0] + - - [128, 256, 1, 1444, 128, 128, 1444, 1444] + - [251, 1850.0] + - - [256, 128, 1, 25, 256, 256, 25, 25] + - [221, 445.0] + - - [256, 128, 1, 9, 256, 256, 9, 9] + - [239, 187.0] + - - [256, 256, 1, 1444, 256, 256, 1444, 1444] + - [240, 2830.0] + - - [512, 128, 1, 100, 512, 512, 100, 100] + - [262, 1583.0] + - - [64, 128, 1, 1444, 64, 64, 1444, 1444] + - [233, 685.0] + - - [1024, 77, 1, 1024, 1024, 1024, 1024, 1024] + - [245, 2320.0] + - - [2, 10, 1, 1024, 2, 2, 1024, 1024] + - [217, 2.0] + - - [1024, 10, 1, 1024, 1024, 1024, 1024, 1024] + - [227, 716.0] + - - [2, 39, 1, 1024, 2, 2, 1024, 1024] + - [223, 7.0] + - - [1024, 39, 1, 1024, 1024, 1024, 1024, 1024] + - [232, 1648.0] + - - [2, 40, 1, 1024, 2, 2, 1024, 1024] + - [217, 7.0] + - - [1024, 40, 1, 1024, 1024, 1024, 1024, 1024] + - [232, 1682.0] + - - [2, 41, 1, 1024, 2, 2, 1024, 1024] + - [217, 7.0] + - - [1024, 41, 1, 1024, 1024, 1024, 1024, 1024] + - [232, 1724.0] + - - [2, 5, 1, 1024, 2, 2, 1024, 1024] + - [217, 1.0] + - - [1024, 5, 1, 1024, 1024, 1024, 1024, 1024] + - [259, 358.0] + - - [1024, 8, 1, 1024, 1024, 1024, 1024, 1024] + - [244, 573.0] + - - [2, 9, 1, 1024, 2, 2, 1024, 1024] + - [223, 2.0] + - - [1024, 9, 1, 1024, 1024, 1024, 1024, 1024] + - [264, 643.0] + - - [4, 4, 32768, 64, 4, 4, 64, 64] + - [244, 141.0] + - - [4, 4, 38400, 64, 4, 4, 64, 64] + - [217, 140.0] + - - [14, 14, 10880, 64, 14, 14, 64, 64] + - [217, 1607.0] + - - [15, 14, 10880, 64, 15, 15, 64, 64] + - [217, 1647.0] + - - [15, 15, 7680, 64, 15, 15, 64, 64] + - [259, 1880.0] + - - [15, 15, 10880, 64, 15, 15, 64, 64] + - [217, 1729.0] + - - [17, 15, 7680, 64, 17, 17, 64, 64] + - [251, 1099.0] + - - [17, 17, 6144, 64, 17, 17, 64, 64] + - [260, 1004.0] + - - [17, 17, 7680, 64, 17, 17, 64, 64] + - [245, 1026.0] + - - [21, 17, 6144, 64, 21, 21, 64, 64] + - [260, 1259.0] + - - [21, 21, 6144, 64, 21, 21, 64, 64] + - [252, 1561.0] + - - [24, 24, 4736, 64, 24, 24, 64, 64] + - [228, 1949.0] + - - [30, 30, 2048, 64, 30, 30, 64, 64] + - [234, 2116.0] + - - [30, 31, 2048, 64, 30, 30, 64, 64] + - [257, 2185.0] + - - [31, 31, 2048, 64, 31, 31, 64, 64] + - [234, 2248.0] + - - [34, 24, 4736, 64, 34, 34, 64, 64] + - [262, 1602.0] + - - [128, 128, 1, 64, 128, 128, 64, 64] + - [231, 419.0] + - - [2, 1024, 1, 1024, 2, 2, 1024, 1024] + - [259, 149.0] + - - [5, 5, 1, 64, 5, 5, 64, 64] + - [217, 1.0] + - - [33, 33, 1, 32, 33, 33, 32, 32] + - [219, 19.0] + - - [5, 5, 960, 64, 5, 5, 64, 64] + - [217, 192.0] + - - [27, 27, 32768, 128, 27, 27, 128, 128] + - [234, 1826.0] + - - [960, 1, 1, 2048, 960, 960, 2048, 2048] + - [227, 70.0] + - - [2, 2, 1, 2048, 2, 2, 2048, 2048] + - [223, 0.38] + - - [1024, 16, 1, 1024, 1024, 1024, 1024, 1024] + - [227, 1140.0] + - - [2, 16, 1, 1024, 2, 2, 1024, 1024] + - [217, 3.0] + - - [2, 4, 1, 2560, 2, 2, 2560, 2560] + - [217, 1.0] + - - [1024, 64, 1, 1024, 1024, 1024, 1024, 1024] + - [232, 2708.0] + - - [2, 64, 1, 1024, 2, 2, 1024, 1024] + - [217, 10.0] + - - [864, 1, 1, 256, 864, 864, 256, 256] + - [227, 49.0] + - - [2, 80, 1, 1024, 2, 2, 1024, 1024] + - [219, 13.0] + - - [1024, 82, 1, 1024, 1024, 1024, 1024, 1024] + - [260, 2450.0] + - - [2, 82, 1, 1024, 2, 2, 1024, 1024] + - [217, 14.0] + - - [1024, 12, 1, 1024, 1024, 1024, 1024, 1024] + - [233, 861.0] + - - [2, 12, 1, 1024, 2, 2, 1024, 1024] + - [217, 2.0] + - - [24, 24, 6816, 64, 24, 24, 64, 64] + - [260, 1834.0] + - - [26, 26, 6272, 64, 26, 26, 64, 64] + - [228, 1944.0] + - - [256, 128, 1, 3136, 256, 256, 3136, 3136] + - [230, 1914.0] + - - [2, 128, 1, 1024, 2, 2, 1024, 1024] + - [217, 21.0] + - - [2, 96, 1, 1024, 2, 2, 1024, 1024] + - [217, 16.0] + - - [768, 12, 1, 768, 768, 768, 768, 768] + - [233, 661.0] + - - [768, 4, 1, 768, 768, 768, 768, 768] + - [233, 229.0] + - - [256, 80, 1, 784, 256, 256, 784, 784] + - [237, 1384.0] + - - [256, 12, 1, 3800, 256, 256, 3800, 3800] + - [226, 295.0] + - - [256, 3, 1, 3800, 256, 256, 3800, 3800] + - [243, 76.0] + - - [256, 12, 1, 950, 256, 256, 950, 950] + - [233, 263.0] + - - [256, 3, 1, 950, 256, 256, 950, 950] + - [223, 66.0] + - - [256, 12, 1, 3220, 256, 256, 3220, 3220] + - [258, 300.0] + - - [256, 3, 1, 3220, 256, 256, 3220, 3220] + - [226, 75.0] + - - [256, 12, 1, 3072, 256, 256, 3072, 3072] + - [249, 299.0] + - - [256, 3, 1, 3072, 256, 256, 3072, 3072] + - [233, 76.0] + - - [256, 12, 1, 850, 256, 256, 850, 850] + - [233, 258.0] + - - [256, 3, 1, 850, 256, 256, 850, 850] + - [223, 65.0] + - - [256, 12, 1, 2852, 256, 256, 2852, 2852] + - [226, 296.0] + - - [256, 3, 1, 2852, 256, 256, 2852, 2852] + - [233, 74.0] + - - [256, 12, 1, 805, 256, 256, 805, 805] + - [233, 254.0] + - - [256, 3, 1, 805, 256, 256, 805, 805] + - [223, 64.0] + - - [256, 3, 1, 864, 256, 256, 864, 864] + - [223, 65.0] + - - [256, 3, 1, 768, 256, 256, 768, 768] + - [233, 63.0] + - - [256, 12, 1, 864, 256, 256, 864, 864] + - [233, 259.0] + - - [256, 12, 1, 768, 256, 256, 768, 768] + - [233, 253.0] + - - [256, 12, 1, 2904, 256, 256, 2904, 2904] + - [258, 297.0] + - - [256, 3, 1, 2904, 256, 256, 2904, 2904] + - [223, 74.0] + - - [256, 3, 1, 713, 256, 256, 713, 713] + - [223, 62.0] + - - [256, 12, 1, 888, 256, 256, 888, 888] + - [233, 256.0] + - - [256, 3, 1, 888, 256, 256, 888, 888] + - [223, 65.0] + - - [256, 12, 1, 713, 256, 256, 713, 713] + - [233, 248.0] + - - [256, 3, 1, 660, 256, 256, 660, 660] + - [223, 61.0] + - - [256, 3, 1, 672, 256, 256, 672, 672] + - [256, 62.0] + - - [256, 12, 1, 660, 256, 256, 660, 660] + - [233, 245.0] + - - [256, 3, 1, 726, 256, 256, 726, 726] + - [223, 62.0] + - - [256, 12, 1, 672, 256, 256, 672, 672] + - [233, 242.0] + - - [256, 3, 1, 247, 256, 256, 247, 247] + - [223, 43.0] + - - [256, 12, 1, 726, 256, 256, 726, 726] + - [233, 247.0] + - - [256, 3, 1, 216, 256, 256, 216, 216] + - [223, 40.0] + - - [256, 3, 1, 3400, 256, 256, 3400, 3400] + - [223, 75.0] + - - [256, 3, 1, 221, 256, 256, 221, 221] + - [223, 41.0] + - - [256, 12, 1, 3552, 256, 256, 3552, 3552] + - [226, 302.0] + - - [256, 3, 1, 3456, 256, 256, 3456, 3456] + - [223, 76.0] + - - [256, 3, 1, 204, 256, 256, 204, 204] + - [223, 38.0] + - - [256, 12, 1, 3400, 256, 256, 3400, 3400] + - [226, 300.0] + - - [256, 12, 1, 3456, 256, 256, 3456, 3456] + - [258, 302.0] + - - [256, 12, 1, 221, 256, 256, 221, 221] + - [233, 163.0] + - - [256, 3, 1, 3552, 256, 256, 3552, 3552] + - [223, 76.0] + - - [256, 3, 1, 228, 256, 256, 228, 228] + - [223, 42.0] + - - [256, 3, 1, 234, 256, 256, 234, 234] + - [223, 42.0] + - - [256, 12, 1, 234, 256, 256, 234, 234] + - [233, 166.0] + - - [81, 1024, 1, 1024, 81, 81, 1024, 1024] + - [245, 2410.0] + - - [81, 1000, 1, 1024, 81, 81, 1024, 1024] + - [245, 2343.0] + - - [256, 12, 1, 228, 256, 256, 228, 228] + - [233, 166.0] + - - [256, 3, 1, 252, 256, 256, 252, 252] + - [223, 44.0] + - - [256, 12, 1, 252, 256, 256, 252, 252] + - [233, 174.0] + - - [256, 12, 1, 247, 256, 256, 247, 247] + - [233, 174.0] + - - [1024, 6, 1, 2, 1024, 1024, 2, 2] + - [219, 9.0] + - - [2, 8, 1, 2048, 2, 2, 2048, 2048] + - [217, 1.0] + - - [2, 20, 1, 1024, 2, 2, 1024, 1024] + - [217, 3.0] + - - [2, 2, 1, 2560, 2, 2, 2560, 2560] + - [249, 0.4] +- null +- null +- DeviceEfficiency +...